Although the run failed, I'll put the code here first.
import requests
import re
# import io
# import sys
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

exist_url = []      # page titles that have already been crawled
g_writecount = 0    # running count of links written to the output file

def scrappy(title, depth=1):
    global g_writecount
    url = 'https://en.wikipedia.org/wiki/' + title
    try:
        # identify as a regular browser
        hd = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}
        r = requests.get(url, headers=hd, verify=False)
        r.encoding = r.apparent_encoding
        html = r.text
    except Exception as e:
        print('failed ', url)
        print(e)
        return None
    exist_url.append(title)
    link_list = re.findall('<a target="_blank" href="/wiki/([^:#=<>]*?)".*?</a>', html)
    print(link_list)
    # drop links that were already crawled, and duplicates
    unique_list = list(set(link_list) - set(exist_url))
    # append every new link to a txt file
    for each in unique_list:
        g_writecount += 1
        output = 'No.' + str(g_writecount) + '\t Depth:' + str(depth) + '\t' + title + '->' + each + '\n'
        print(output)
        with open('link_10_30.txt', 'a+') as f:
            f.write(output)
        # only follow links two levels deep
        if depth < 2:
            scrappy(each, depth + 1)

scrappy('Wikipedia')

Run result:
failed https://en.wikipedia.org/wiki/Wikipedia HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /wiki/Wikipedia (Caused by SSLError(SSLError("bad handshake: SysCallError(10054, 'WSAECONNRESET')")))
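The WSAECONNRESET in this SSLError means the connection was reset during the TLS handshake, before any HTML was fetched, so it is usually a network-level problem (a firewall or proxy between the machine and en.wikipedia.org) rather than a bug in the crawler itself. Below is a minimal sketch, assuming only requests and urllib3, of two common workarounds: automatic retries via an HTTPAdapter, and optionally routing through a local proxy. The 127.0.0.1:1080 address is a placeholder assumption, not something from the original run.

import requests
import urllib3
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# verify=False makes urllib3 warn on every request; silence it once here
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

session = requests.Session()
# retry up to 3 times with exponential backoff on connection errors
session.mount('https://', HTTPAdapter(max_retries=Retry(total=3, backoff_factor=1)))

# optional: route traffic through a local proxy if direct access is blocked
# (placeholder address, adjust to your own setup)
# session.proxies = {'https': 'http://127.0.0.1:1080'}

r = session.get('https://en.wikipedia.org/wiki/Wikipedia', verify=False, timeout=10)
print(r.status_code)

If the retries still end in the same reset, the block is happening outside the program, and only the proxy route (or another network) will get the crawler through.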