Python爬虫入门,几个常用方法

mac2024-11-11  12

"""Crawler basics demo: common urllib.request methods.

Covers: downloading a resource to disk (urlretrieve), clearing the
download cache (urlcleanup), inspecting a response (info / getcode /
geturl), timeout handling, and a simulated GET request that scrapes
Baidu search-result titles with a regex.
"""
import re
import urllib.request

# --- Download a remote resource straight to a local file (urlretrieve) ---
urllib.request.urlretrieve("https://www.baidu.com",
                           "C:/Users/10167/Desktop/address.html")

# Clear the cache that urlretrieve leaves behind.
urllib.request.urlcleanup()

# --- Basic info about a fetched page ---
data = urllib.request.urlopen(
    "https://blog.csdn.net/qq_40666620/article/details/102834104")
print(data.info())     # response header / summary info
print(data.getcode())  # HTTP status code -- handy for spotting dead links
print(data.geturl())   # the URL actually fetched (after any redirects)

# --- Timeout handling: retry the same URL with a very short timeout ---
for _ in range(100):
    try:
        urllib.request.urlopen(
            "https://blog.csdn.net/qq_40666620/article/details/102834104",
            timeout=0.1).read()
        print("success")
    except Exception as error:  # NOTE(review): could narrow to urllib.error.URLError
        print(error)

# --- Simulated HTTP GET: scrape Baidu search-result titles ---
keyword = "python"
keyword = urllib.request.quote(keyword)  # percent-encode for use in a URL
url = "http://www.baidu.com/s?wd=" + keyword
# Compile once outside the loop instead of re-compiling per page;
# raw string keeps the regex free of escape surprises.
target = re.compile(r'title":"(.*?)"')
for pn in range(10):
    # Baidu currently shows ~9 entries per page, so the pn offset
    # advances in steps of 9 rather than being a literal page number.
    page = urllib.request.urlopen(
        url + "&pn=" + str(9 * pn)).read().decode("utf-8")
    for title in target.findall(page):
        print(title)
最新回复(0)