import time
import urllib.parse
import urllib.request
# Send a request to the given url and return the server's response body
def loadPage(url, filename):
    print('Downloading ' + filename)
    # A browser User-Agent keeps Tieba from rejecting the request as a bot
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    req = urllib.request.Request(url, headers=headers)
    return urllib.request.urlopen(req).read()
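# Illustrative use (hypothetical url and filename), returning the raw page as bytes:
#   html = loadPage('http://tieba.baidu.com/f?kw=python&pn=0', 'page_1.html')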
# Write the html content to a local file
def writePage(html, filename):
    print('Saving ' + filename)
    # Open in binary mode since loadPage returns bytes, not str
    with open(filename, 'wb') as f:
        f.write(html)
    print('-------------------------------')
# http://tieba.baidu.com/f?kw=python&fr=ala0&tpl=5        page 1
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0        same as the page-1 url above, following the pattern
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50       page 2
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100      page 3
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=150      page 4
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=200      page 5
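# Worked example of the pattern above: each page holds 50 posts, so page n
# starts at pn = (n - 1) * 50: page 1 -> pn=0, page 2 -> pn=50, ..., page 5 -> pn=200.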
# Build the full url for each page and download it
def tiebaSpider(url, beginPage, endPage):
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50
        filename = 'd:/yemian/page_' + str(page) + '.html'
        fullurl = url + '&pn=' + str(pn)
        html = loadPage(fullurl, filename)
        writePage(html, filename)
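# For reference, the query-string encoding used below (illustrative values;
# urlencode percent-escapes non-ASCII tieba names):
#   urllib.parse.urlencode({'kw': 'python'})  ->  'kw=python'
#   urllib.parse.urlencode({'kw': '小说'})     ->  'kw=%E5%B0%8F%E8%AF%B4'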
if __name__ == '__main__':
    kw = input('Enter the name of the tieba to crawl: ')
    beginPage = int(input('Enter the start page: '))
    endPage = int(input('Enter the end page: '))
    url = 'http://tieba.baidu.com/f?'
    key = urllib.parse.urlencode({'kw': kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)
    print('Thanks for using')
    time.sleep(10)