自己爬取过的爬虫案例大全(持续更新中...)


文章目录

案例1:爬取百度产品列表
案例2:爬取新浪新闻指定搜索内容
案例3:爬取百度贴吧前十页(get请求)
案例4:爬取百度翻译接口
案例5:爬取有道翻译接口
案例6:登录人人网(cookie)
案例7:登录人人网(session)
案例8:爬取猫眼电影(正则表达式)
案例9:爬取股吧(正则表达式)
案例10:爬取某药品网站(正则表达式)
案例11:使用xpath爬取扇贝英语单词(xpath)
案例12:爬取网易云音乐的所有歌手名字(xpath)
案例13:爬取酷狗音乐的歌手和歌单(xpath)
案例14:爬取扇贝读书图书信息(selenium+Phantomjs)
案例15:爬取腾讯招聘的招聘信息(selenium+Phantomjs)
案例16:爬取腾讯招聘(ajax版+多线程版)
案例17:爬取英雄联盟所有英雄名字和技能(selenium+phantomjs+ajax接口)
案例18:爬取豆瓣电影(requests+多线程)
案例19:爬取瓜子二手车的所有车(requests)
案例20:爬取链家网北京每个区域的所有房子(selenium+Phantomjs+多线程)
案例21:爬取笔趣阁的所有小说(requests)
案例22:爬取菜鸟教程的python100例
案例23:爬取新浪微博头条前20页(ajax+mysql)
案例24:爬取搜狗指定图片(requests+多线程)
案例25:爬取链家网北京所有房子(requests+多线程)

案例1:爬取百度产品列表

```python
# ----------1.导包
import requests

# ----------2.确定url
base_url = 'https://www.baidu.com/more/'

# ----------3.发送请求,获取响应
response = requests.get(base_url)

# ----------4.查看页面内容,可能出现乱码
# print(response.text)
# print(response.encoding)

# ----------5.解决乱码
# 方法一:转换成utf-8格式
# response.encoding = 'utf-8'
# print(response.text)

# 方法二:解码为utf-8后写入文件
with open('index.html', 'w', encoding='utf-8') as fp:
    fp.write(response.content.decode('utf-8'))

print(response.status_code)
print(response.headers)
print(type(response.text))
print(type(response.content))
```

案例2:爬取新浪新闻指定搜索内容

```python
import requests

# ----------爬取带参数的get请求:爬取新浪新闻的指定搜索内容
# 1.寻找基础url
base_url = 'https://search.sina.com.cn/?'

# 2.设置headers字典和params字典,再发请求
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
}
key = '孙悟空'  # 搜索内容
params = {
    'q': key,
    'c': 'news',
    'from': 'channel',
    'ie': 'utf-8',
}
response = requests.get(base_url, headers=headers, params=params)

with open('sina_news.html', 'w', encoding='gbk') as fp:
    fp.write(response.content.decode('gbk'))
```

分页类型

第一步:找出分页参数的规律
第二步:构造headers和params字典
第三步:用for循环逐页请求
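下面的案例3就是按这个套路写的。先给一个通用的最小示意(示例中的域名与参数名均为假设,需按目标站点的实际规律替换):

```python
import requests

base_url = 'https://example.com/search?'          # 假设的基础url
headers = {'User-Agent': 'Mozilla/5.0'}

for i in range(10):                               # 第三步:用for循环逐页请求
    # 第一、二步:按找到的规律构造params字典,这里假设参数名为kw和pn,步长为50
    params = {'kw': '关键词', 'pn': i * 50}
    response = requests.get(base_url, headers=headers, params=params)
    print(response.status_code, response.url)
```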

案例3:爬取百度贴吧前十页(get请求)

```python
# ----------爬取百度贴吧中某个贴吧的前十页
import requests, os

base_url = 'https://tieba.baidu.com/f?'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
}
dirname = './tieba/woman/'
if not os.path.exists(dirname):
    os.makedirs(dirname)

for i in range(0, 10):
    params = {
        'ie': 'utf-8',
        'kw': '美女',
        'pn': str(i * 50)
    }
    response = requests.get(base_url, headers=headers, params=params)
    with open(dirname + '美女第%s页.html' % (i + 1), 'w', encoding='utf-8') as file:
        file.write(response.content.decode('utf-8'))
```

案例4:爬取百度翻译接口

```python
import requests

base_url = 'https://fanyi.baidu.com/sug'
kw = input('请输入要翻译的英文单词:')
data = {
    'kw': kw
}
headers = {
    # 百度翻译的sug接口没有反爬措施,这些请求头其实可以不写
    'content-length': str(len(data)),
    'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'referer': 'https://fanyi.baidu.com/',
    'x-requested-with': 'XMLHttpRequest'
}
response = requests.post(base_url, headers=headers, data=data)
# print(response.json())
# 结果:{'errno': 0, 'data': [{'k': 'python', 'v': 'n. 蟒; 蚺蛇;'}, {'k': 'pythons', 'v': 'n. 蟒; 蚺蛇; python的复数;'}]}

# ----------把结果拼成一行一行
result = ''
for i in response.json()['data']:
    result += i['v'] + '\n'
print(kw + '的翻译结果为:')
print(result)
```


案例5:爬取有道翻译接口

```python
import requests

base_url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
data = {
    'i': 'spider',
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '15722497498890',
    'sign': 'a5bfb7f00ee1906773bda3074ff32fec',
    'ts': '1572249749889',
    'bv': '1b6a302b48b06158238e3c036feb6ba1',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_REALTlME',
}
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Content-Length': '239',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': '_ntes_nnid=106c3a7170510674c7f7d772e62a558b,1565682306312; OUTFOX_SEARCH_USER_ID_NCOO=1135450303.6725993; OUTFOX_SEARCH_USER_ID="-1737701989@10.168.11.11"; P_INFO=str_wjp@126.com|1570794528|0|other|00&99|not_found&1570667109&mail_client#bej&null#10#0#0|152885&0||str_wjp@126.com; _ga=GA1.2.1944828316.1572140505; JSESSIONID=aaa-Ya9um-M_N80M5xr4w; ___rl__test__cookies=1572249749875',
    'Host': 'fanyi.youdao.com',
    'Origin': 'http://fanyi.youdao.com',
    'Referer': 'http://fanyi.youdao.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
response = requests.post(base_url, headers=headers, data=data)
print(response.text)
```

运行后会发现,这段代码只能查询"spider"这一个词,无法修改关键词,因此需要想办法让它支持查询任意词汇。遇到这种情况,通常是表单里的某些参数在随请求变化,我们可以在有道上多翻译几个词,抓包对比各次请求的参数有哪些不同。

对比后发现是 salt、sign、ts 这三个参数不一样,因此需要破解它们的生成方式。这类参数常见的位置有:

js代码中
前端页面(可能是隐藏的hidden标签)
ajax处

经过查找,这几个参数是在某段js代码中生成的。

所以下一步,我们需要把这几个参数用python求出来
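这三个参数的生成逻辑翻译成python大致如下(最小示意,盐值字符串来自当时抓到的js,可能随有道的版本更新而失效):

```python
import time, random, hashlib

def make_youdao_params(word):
    """按js里的逻辑生成 ts、salt、sign 三个参数(示意)"""
    ts = str(int(time.time() * 1000))                     # ts = "" + (new Date).getTime()
    salt = ts + str(random.randint(0, 10))                # salt = ts + parseInt(10 * Math.random(), 10)
    raw = 'fanyideskweb' + word + salt + 'n%A-rKaT5fb[Gy?;N5@Tj'
    sign = hashlib.md5(raw.encode('utf-8')).hexdigest()   # sign = md5("fanyideskweb" + e + i + 盐值)
    return ts, salt, sign

print(make_youdao_params('world'))
```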

完整代码:

```python
import requests, time, random, hashlib

base_url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
value = 'world'  # 搜索单词

# ----------js代码中
# ts = "" + (new Date).getTime()
# salt = r + parseInt(10 * Math.random(), 10)
# sign = n.md5("fanyideskweb" + e + i + "n%A-rKaT5fb[Gy?;N5@Tj")
# ----------转化为python代码
def get_md5(value):
    md5 = hashlib.md5()
    md5.update(bytes(value, encoding='utf-8'))
    return md5.hexdigest()

ts = str(int(time.time() * 1000))
salt = ts + str(random.randint(0, 10))
sign = get_md5('fanyideskweb' + value + salt + 'n%A-rKaT5fb[Gy?;N5@Tj')

data = {
    'i': value,
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': salt,   # 注意要把上面算出来的三个参数填回表单,否则发出去的还是写死的旧值
    'sign': sign,
    'ts': ts,
    'bv': '1b6a302b48b06158238e3c036feb6ba1',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_REALTlME',
}
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    # Content-Length不要写死,让requests自动计算
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': '_ntes_nnid=106c3a7170510674c7f7d772e62a558b,1565682306312; OUTFOX_SEARCH_USER_ID_NCOO=1135450303.6725993; OUTFOX_SEARCH_USER_ID="-1737701989@10.168.11.11"; P_INFO=str_wjp@126.com|1570794528|0|other|00&99|not_found&1570667109&mail_client#bej&null#10#0#0|152885&0||str_wjp@126.com; _ga=GA1.2.1944828316.1572140505; JSESSIONID=aaa-Ya9um-M_N80M5xr4w; ___rl__test__cookies=1572249749875',
    'Host': 'fanyi.youdao.com',
    'Origin': 'http://fanyi.youdao.com',
    'Referer': 'http://fanyi.youdao.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

response = requests.post(base_url, headers=headers, data=data)
print(response.text)
```

案例6:登录人人网(cookie)

```python
import requests

base_url = 'http://www.renren.com/909063513'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
    'Cookie': 'cookie',  # 换成自己登录后抓包得到的Cookie
}
response = requests.get(base_url, headers=headers)
if '死性不改' in response.text:
    print('登录成功')
else:
    print('登录失败')
```

登录人人网之后,个人主页的html里会显示用户名,因此可以通过响应页面中是否出现用户名来判断登录是否成功。


案例7:登录人人网(session)

```python
import requests

base_url = 'http://www.renren.com/PLogin.do'
headers = {
    'Host': 'www.renren.com',
    'Referer': 'http://safe.renren.com/security/account',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
}
data = {
    'email': '邮箱',      # 换成自己的账号
    'password': '密码',   # 换成自己的密码
}
# 创建一个session对象
se = requests.session()
# 用session对象来发送post请求进行登录
se.post(base_url, headers=headers, data=data)
# 登录后用同一个session访问个人主页,验证是否登录成功
response = se.get('http://www.renren.com/971682585')
if '鸣人' in response.text:
    print('登录成功!')
else:
    print(response.text)
    print('登录失败!')
```

案例8:爬取猫眼电影(正则表达式)

爬取目标:爬取前一百个电影的信息

import re, requests, json class Maoyan: def __init__(self, url): self.url = url self.movie_list = [] self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36' } self.parse() def parse(self): # 爬去页面的代码 # 1.发送请求,获取响应 # 分页 for i in range(10): url = self.url + '?offset={}'.format(i * 10) response = requests.get(url, headers=self.headers) ''' 1.电影名称 2、主演 3、上映时间 4、评分 ''' # 用正则筛选数据,有个原则:不断缩小筛选范围。 dl_pattern = re.compile(r'<dl class="board-wrapper">(.*?)</dl>', re.S) dl_content = dl_pattern.search(response.text).group() dd_pattern = re.compile(r'<dd>(.*?)</dd>', re.S) dd_list = dd_pattern.findall(dl_content) # print(dd_list) movie_list = [] for dd in dd_list: print(dd) item = {} # ------------电影名字 movie_pattern = re.compile(r'title="(.*?)" class=', re.S) movie_name = movie_pattern.search(dd).group(1) # print(movie_name) actor_pattern = re.compile(r'<p class="star">(.*?)</p>', re.S) actor = actor_pattern.search(dd).group(1).strip() # print(actor) play_time_pattern = re.compile(r'<p class="releasetime">(.*?):(.*?)</p>', re.S) play_time = play_time_pattern.search(dd).group(2).strip() # print(play_time) # 评分 score_pattern_1 = re.compile(r'<i class="integer">(.*?)</i>', re.S) score_pattern_2 = re.compile(r'<i class="fraction">(.*?)</i>', re.S) score = score_pattern_1.search(dd).group(1).strip() + score_pattern_2.search(dd).group(1).strip() # print(score) item['电影名字:'] = movie_name item['主演:'] = actor item['时间:'] = play_time item['评分:'] = score # print(item) self.movie_list.append(item) # 将电影信息保存到json文件中 with open('movie.json', 'w', encoding='utf-8') as fp: json.dump(self.movie_list, fp) if __name__ == '__main__': base_url = 'https://maoyan.com/board/4' Maoyan(base_url) with open('movie.json', 'r') as fp: movie_list = json.load(fp) print(movie_list)

案例9:爬取股吧(正则表达式)

爬取目标: 爬取前十页的阅读数,评论数,标题,作者,更新时间,详情页url

import json import re import requests class GuBa(object): def __init__(self): self.base_url = 'http://guba.eastmoney.com/default,99_%s.html' self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36' } self.infos = [] self.parse() def parse(self): for i in range(1, 13): response = requests.get(self.base_url % i, headers=self.headers) '''阅读数,评论数,标题,作者,更新时间,详情页url''' ul_pattern = re.compile(r'<ul id="itemSearchList" class="itemSearchList">(.*?)</ul>', re.S) ul_content = ul_pattern.search(response.text) if ul_content: ul_content = ul_content.group() li_pattern = re.compile(r'<li>(.*?)</li>', re.S) li_list = li_pattern.findall(ul_content) # print(li_list) for li in li_list: item = {} reader_pattern = re.compile(r'<cite>(.*?)</cite>', re.S) info_list = reader_pattern.findall(li) # print(info_list) reader_num = '' comment_num = '' if info_list: reader_num = info_list[0].strip() comment_num = info_list[1].strip() print(reader_num, comment_num) title_pattern = re.compile(r'title="(.*?)" class="note">', re.S) title = title_pattern.search(li).group(1) # print(title) author_pattern = re.compile(r'target="_blank"><font>(.*?)</font></a><input type="hidden"', re.S) author = author_pattern.search(li).group(1) # print(author) date_pattern = re.compile(r'<cite class="last">(.*?)</cite>', re.S) date = date_pattern.search(li).group(1) # print(date) detail_pattern = re.compile(r' <a href="(.*?)" title=', re.S) detail_url = detail_pattern.search(li) if detail_url: detail_url = 'http://guba.eastmoney.com' + detail_url.group(1) else: detail_url = '' print(detail_url) item['title'] = title item['author'] = author item['date'] = date item['reader_num'] = reader_num item['comment_num'] = comment_num item['detail_url'] = detail_url self.infos.append(item) with open('guba.json', 'w', encoding='utf-8') as fp: json.dump(self.infos, fp) gb=GuBa()

案例10:爬取某药品网站(正则表达式)

爬取目标:爬取五十页的药品信息

''' 要求:抓取50页 字段:总价,描述,评论数量,详情页链接 用正则爬取。 ''' import requests, re,json class Drugs: def __init__(self): self.url = url = 'https://www.111.com.cn/categories/953710-j%s.html' self.headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36' } self.Drugs_list=[] self.parse() def parse(self): for i in range(51): response = requests.get(self.url % i, headers=self.headers) # print(response.text) # 字段:药名,总价,评论数量,详情页链接 Drugsul_pattern = re.compile('<ul id="itemSearchList" class="itemSearchList">(.*?)</ul>', re.S) Drugsul = Drugsul_pattern.search(response.text).group() # print(Drugsul) Drugsli_list_pattern = re.compile('<li id="producteg(.*?)</li>', re.S) Drugsli_list = Drugsli_list_pattern.findall(Drugsul) Drugsli_list = Drugsli_list # print(Drugsli_list) for drug in Drugsli_list: # ---药名 item={} name_pattern = re.compile('alt="(.*?)"', re.S) name = name_pattern.search(str(drug)).group(1) # print(name) # ---总价 total_pattern = re.compile('<span>(.*?)</span>', re.S) total = total_pattern.search(drug).group(1).strip() # print(total) # ----评论 comment_pattern = re.compile('<em>(.*?)</em>') comment = comment_pattern.search(drug) if comment: comment_group = comment.group(1) else: comment_group = '0' # print(comment_group) # ---详情页链接 href_pattern = re.compile('" href="//(.*?)"') href='https://'+href_pattern.search(drug).group(1).strip() # print(href) item['药名']=name item['总价']=total item['评论']=comment item['链接']=href self.Drugs_list.append(item) drugs = Drugs() print(drugs.Drugs_list)

案例11:使用xpath爬取扇贝英语单词(xpath)

需求:爬取三页单词

```python
import json
import requests
from lxml import etree

base_url = 'https://www.shanbay.com/wordlist/110521/232414/?page=%s'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}

def get_text(value):
    # 防止xpath取不到值时下标越界
    if value:
        return value[0]
    return ''

word_list = []
for i in range(1, 4):
    # 发送请求
    response = requests.get(base_url % i, headers=headers)
    # print(response.text)
    html = etree.HTML(response.text)
    tr_list = html.xpath('//tbody/tr')
    # print(tr_list)
    for tr in tr_list:
        item = {}  # 构造单词字典
        en = get_text(tr.xpath('.//td[@class="span2"]/strong/text()'))
        tra = get_text(tr.xpath('.//td[@class="span10"]/text()'))
        print(en, tra)
        if en:
            item[en] = tra
            word_list.append(item)
```

面向对象:

import requests from lxml import etree class Shanbei(object): def __init__(self): self.base_url = 'https://www.shanbay.com/wordlist/110521/232414/?page=%s' self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36' } self.word_list = [] self.parse() def get_text(self, value): # 防止为空报错 if value: return value[0] return '' def parse(self): for i in range(1, 4): # 发送请求 response = requests.get(self.base_url % i, headers=self.headers) # print(response.text) html = etree.HTML(response.text) tr_list = html.xpath('//tbody/tr') # print(tr_list) for tr in tr_list: item = {} # 构造单词列表 en = self.get_text(tr.xpath('.//td[@class="span2"]/strong/text()')) tra = self.get_text(tr.xpath('.//td[@class="span10"]/text()')) print(en, tra) if en: item[en] = tra self.word_list.append(item) shanbei = Shanbei()

案例12:爬取网易云音乐的所有歌手名字(xpath)

import requests,json from lxml import etree url = 'https://music.163.com/discover/artist' singer_infos = [] # ---------------通过url获取该页面的内容,返回xpath对象 def get_xpath(url): headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36' } response = requests.get(url, headers=headers) return etree.HTML(response.text) # --------------通过get_xpath爬取到页面后,我们获取华宇,华宇男等分类 def parse(): html = get_xpath(url) fenlei_url_list = html.xpath('//ul[@class="nav f-cb"]/li/a/@href') # 获取华宇等分类的url # print(fenlei_url_list) # --------将热门和推荐两栏去掉筛选 new_list = [i for i in fenlei_url_list if 'id' in i] for i in new_list: fenlei_url = 'https://music.163.com' + i parse_fenlei(fenlei_url) # print(fenlei_url) # -------------通过传入的分类url,获取A,B,C页面内容 def parse_fenlei(url): html = get_xpath(url) # 获得字母排序,每个字母的链接 zimu_url_list = html.xpath('//ul[@id="initial-selector"]/li[position()>1]/a/@href') for i in zimu_url_list: zimu_url = 'https://music.163.com' + i parse_singer(zimu_url) # ---------------------传入获得的字母链接,开始爬取歌手内容 def parse_singer(url): html = get_xpath(url) item = {} singer_names = html.xpath('//ul[@id="m-artist-box"]/li/p/a/text()') # --详情页看到页面结构会有两个a标签,所以取第一个 singer_href = html.xpath('//ul[@id="m-artist-box"]/li/p/a[1]/@href') # print(singer_names,singer_href) for i, name in enumerate(singer_names): item['歌手名'] = name item['音乐链接'] = 'https://music.163.com' + singer_href[i].strip() # 获取歌手详情页的链接 url = item['音乐链接'].replace(r'?id', '/desc?id') # print(url) parse_detail(url, item) print(item) # ---------获取详情页url和存着歌手名字和音乐列表的字典,在字典中添加详情页数据 def parse_detail(url, item): html = get_xpath(url) desc_list = html.xpath('//div[@class="n-artdesc"]/p/text()') item['歌手信息'] = desc_list singer_infos.append(item) write_singer(item) # ----------------将数据字典写入歌手文件 def write_singer(item): with open('singer.json', 'a+', encoding='utf-8') as file: json.dump(item,file) if __name__ == '__main__': parse()

面向对象

import json, requests from lxml import etree class Wangyiyun(object): def __init__(self): self.url = 'https://music.163.com/discover/artist' self.singer_infos = [] self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36' } self.parse() # ---------------通过url获取该页面的内容,返回xpath对象 def get_xpath(self, url): response = requests.get(url, headers=self.headers) return etree.HTML(response.text) # --------------通过get_xpath爬取到页面后,我们获取华宇,华宇男等分类 def parse(self): html = self.get_xpath(self.url) fenlei_url_list = html.xpath('//ul[@class="nav f-cb"]/li/a/@href') # 获取华宇等分类的url # print(fenlei_url_list) # --------将热门和推荐两栏去掉筛选 new_list = [i for i in fenlei_url_list if 'id' in i] for i in new_list: fenlei_url = 'https://music.163.com' + i self.parse_fenlei(fenlei_url) # print(fenlei_url) # -------------通过传入的分类url,获取A,B,C页面内容 def parse_fenlei(self, url): html = self.get_xpath(url) # 获得字母排序,每个字母的链接 zimu_url_list = html.xpath('//ul[@id="initial-selector"]/li[position()>1]/a/@href') for i in zimu_url_list: zimu_url = 'https://music.163.com' + i self.parse_singer(zimu_url) # ---------------------传入获得的字母链接,开始爬取歌手内容 def parse_singer(self, url): html = self.get_xpath(url) item = {} singer_names = html.xpath('//ul[@id="m-artist-box"]/li/p/a/text()') # --详情页看到页面结构会有两个a标签,所以取第一个 singer_href = html.xpath('//ul[@id="m-artist-box"]/li/p/a[1]/@href') # print(singer_names,singer_href) for i, name in enumerate(singer_names): item['歌手名'] = name item['音乐链接'] = 'https://music.163.com' + singer_href[i].strip() # 获取歌手详情页的链接 url = item['音乐链接'].replace(r'?id', '/desc?id') # print(url) self.parse_detail(url, item) print(item) # ---------获取详情页url和存着歌手名字和音乐列表的字典,在字典中添加详情页数据 def parse_detail(self, url, item): html = self.get_xpath(url) desc_list = html.xpath('//div[@class="n-artdesc"]/p/text()')[0] item['歌手信息'] = desc_list self.singer_infos.append(item) self.write_singer(item) # ----------------将数据字典写入歌手文件 def write_singer(self, item): with open('sing.json', 'a+', encoding='utf-8') as file: json.dump(item, file) music = Wangyiyun()

案例13:爬取酷狗音乐的歌手和歌单(xpath)

需求:爬取酷狗音乐的歌手、歌单和歌手简介

import json, requests from lxml import etree base_url = 'https://www.kugou.com/yy/singer/index/%s-%s-1.html' # ---------------通过url获取该页面的内容,返回xpath对象 headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36' } # ---------------通过url获取该页面的内容,返回xpath对象 def get_xpath(url, headers): try: response = requests.get(url, headers=headers) return etree.HTML(response.text) except Exception: print(url, '该页面没有相应!') return '' # --------------------通过歌手详情页获取歌手简介 def parse_info(url): html = get_xpath(url, headers) info = html.xpath('//div[@class="intro"]/p/text()') return info # --------------------------写入方法 def write_json(value): with open('kugou.json', 'a+', encoding='utf-8') as file: json.dump(value, file) # -----------------------------用ASCII码值来变换abcd... for j in range(97, 124): # 小写字母为97-122,当等于123的时候我们按歌手名单的其他算,路由为null if j < 123: p = chr(j) else: p = "null" for i in range(1, 6): response = requests.get(base_url % (i, p), headers=headers) # print(response.text) html = etree.HTML(response.text) # 由于数据分两个url,所以需要加起来数据列表 name_list1 = html.xpath('//ul[@id="list_head"]/li/strong/a/text()') sing_list1 = html.xpath('//ul[@id="list_head"]/li/strong/a/@href') name_list2 = html.xpath('//div[@id="list1"]/ul/li/a/text()') sing_list2 = html.xpath('//div[@id="list1"]/ul/li/a/@href') singer_name_list = name_list1 + name_list2 singer_sing_list = sing_list1 + sing_list2 # print(singer_name_list,singer_sing_list) for i, name in enumerate(singer_name_list): item = {} item['名字'] = name item['歌单'] = singer_sing_list[i] # item['歌手信息']=parse_info(singer_sing_list[i])#被封了 write_json(item)

面向对象:

import json, requests from lxml import etree class KuDog(object): def __init__(self): self.base_url = 'https://www.kugou.com/yy/singer/index/%s-%s-1.html' self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36' } self.parse() # ---------------通过url获取该页面的内容,返回xpath对象 def get_xpath(self, url, headers): try: response = requests.get(url, headers=headers) return etree.HTML(response.text) except Exception: print(url, '该页面没有相应!') return '' # --------------------通过歌手详情页获取歌手简介 def parse_info(self, url): html = self.get_xpath(url, self.headers) info = html.xpath('//div[@class="intro"]/p/text()') return info[0] # --------------------------写入方法 def write_json(self, value): with open('kugou.json', 'a+', encoding='utf-8') as file: json.dump(value, file) # -----------------------------用ASCII码值来变换abcd... def parse(self): for j in range(97, 124): # 小写字母为97-122,当等于123的时候我们按歌手名单的其他算,路由为null if j < 123: p = chr(j) else: p = "null" for i in range(1, 6): response = requests.get(self.base_url % (i, p), headers=self.headers) # print(response.text) html = etree.HTML(response.text) # 由于数据分两个url,所以需要加起来数据列表 name_list1 = html.xpath('//ul[@id="list_head"]/li/strong/a/text()') sing_list1 = html.xpath('//ul[@id="list_head"]/li/strong/a/@href') name_list2 = html.xpath('//div[@id="list1"]/ul/li/a/text()') sing_list2 = html.xpath('//div[@id="list1"]/ul/li/a/@href') singer_name_list = name_list1 + name_list2 singer_sing_list = sing_list1 + sing_list2 # print(singer_name_list,singer_sing_list) for i, name in enumerate(singer_name_list): item = {} item['名字'] = name item['歌单'] = singer_sing_list[i] # item['歌手信息']=parse_info(singer_sing_list[i])#被封了 print(item) self.write_json(item) music = KuDog()


案例14:爬取扇贝读书图书信息(selenium+Phantomjs)

由于页面数据是由js动态写入的,不方便再直接用requests模块获取,所以改用selenium+Phantomjs渲染页面后再提取。

import time, json from lxml import etree from selenium import webdriver base_url = 'https://search.douban.com/book/subject_search?search_text=python&cat=1001&start=%s' driver = webdriver.PhantomJS() def get_text(text): if text: return text[0] return '' def parse_page(text): html = etree.HTML(text) div_list = html.xpath('//div[@id="root"]/div/div/div/div/div/div[@class="item-root"]') # print(div_list) for div in div_list: item = {} ''' 图书名称,评分,评价数,详情页链接,作者,出版社,价格,出版日期 ''' name = get_text(div.xpath('.//div[@class="title"]/a/text()')) scores = get_text(div.xpath('.//span[@class="rating_nums"]/text()')) comment_num = get_text(div.xpath('.//span[@class="pl"]/text()')) detail_url = get_text(div.xpath('.//div[@class="title"]/a/@href')) detail = get_text(div.xpath('.//div[@class="meta abstract"]/text()')) if detail: detail_list = detail.split('/') else: detail_list = ['未知', '未知', '未知', '未知'] # print(detail_list) if all([name, detail_url]): # 如果名字和详情链接为true item['书名'] = name item['评分'] = scores item['评论'] = comment_num item['详情链接'] = detail_url item['出版社'] = detail_list[-3] item['价格'] = detail_list[-1] item['出版日期'] = detail_list[-2] author_list = detail_list[:-3] author = '' for aut in author_list: author += aut + ' ' item['作者'] = author print(item) write_singer(item) def write_singer(item): with open('book.json', 'a+', encoding='utf-8') as file: json.dump(item, file) if __name__ == '__main__': for i in range(10): driver.get(base_url % (i * 15)) # 等待 time.sleep(2) html_str = driver.page_source parse_page(html_str)

面向对象:

from lxml import etree from selenium import webdriver from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By from urllib import parse class Douban(object): def __init__(self, url): self.url = url self.driver = webdriver.PhantomJS() self.wait = WebDriverWait(self.driver, 10) self.parse() # 判断数据是否存在,不存在返回空字符 def get_text(self, text): if text: return text[0] return '' def get_content_by_selenium(self, url, xpath): self.driver.get(url) # 等待,locator对象是一个元组,此处获取xpath对应的元素并加载出来 webelement = self.wait.until(EC.presence_of_element_located((By.XPATH, xpath))) return self.driver.page_source def parse(self): html_str = self.get_content_by_selenium(self.url, '//div[@id="root"]/div/div/div/div') html = etree.HTML(html_str) div_list = html.xpath('//div[@id="root"]/div/div/div/div/div') for div in div_list: item = {} '''图书名称+评分+评价数+详情页链接+作者+出版社+价格+出版日期''' name = self.get_text(div.xpath('.//div[@class="title"]/a/text()')) scores = self.get_text(div.xpath('.//span[@class="rating_nums"]/text()')) comment_num = self.get_text(div.xpath('.//span[@class="pl"]/text()')) detail_url = self.get_text(div.xpath('.//div[@class="title"]/a/@href')) detail = self.get_text(div.xpath('.//div[@class="meta abstract"]/text()')) if detail: detail_list = detail.split('/') else: detail_list = ['未知', '未知', '未知', '未知'] if all([name, detail_url]): # 如果列表里的数据为true方可执行 item['书名'] = name item['评分'] = scores item['评论'] = comment_num item['详情链接'] = detail_url item['出版社'] = detail_list[-3] item['价格'] = detail_list[-1] item['出版日期'] = detail_list[-2] author_list = detail_list[:-3] author = '' for aut in author_list: author += aut + ' ' item['作者'] = author print(item) if __name__ == '__main__': kw = 'python' base_url = 'https://search.douban.com/book/subject_search?' for i in range(10): params = { 'search_text': kw, 'cat': '1001', 'start': str(i * 15), } url = base_url + parse.urlencode(params) Douban(url)

案例15:爬取腾讯招聘的招聘信息(selenium+Phantomjs)

import time from lxml import etree from selenium import webdriver driver = webdriver.PhantomJS() base_url = 'https://careers.tencent.com/search.html?index=%s' job=[] def getText(text): if text: return text[0] else: return '' def parse(text): html = etree.HTML(text) div_list = html.xpath('//div[@class="correlation-degree"]/div[@class="recruit-wrap recruit-margin"]/div') # print(div_list) for i in div_list: item = {} job_name = i.xpath('a/h4/text()') # ------职位 job_loc = i.xpath('a/p/span[2]/text()') # --------地点 job_gangwei = i.xpath('a/p/span[3]/text()') # -----岗位 job_time = i.xpath('a/p/span[4]/text()') # -----发布时间 item['职位']=job_name item['地点']=job_loc item['岗位']=job_gangwei item['发布时间']=job_time job.append(item) if __name__ == '__main__': for i in range(1, 11): driver.get(base_url % i) text = driver.page_source # print(text) time.sleep(1) parse(text) print(job)

面向对象:

import json from lxml import etree from selenium import webdriver from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By from urllib import parse class Tencent(object): def __init__(self,url): self.url = url self.driver = webdriver.PhantomJS() self.wait = WebDriverWait(self.driver,10) self.parse() def get_text(self,text): if text: return text[0] return '' def get_content_by_selenium(self,url,xpath): self.driver.get(url) webelement = self.wait.until(EC.presence_of_element_located((By.XPATH,xpath))) return self.driver.page_source def parse(self): html_str = self.get_content_by_selenium(self.url,'//div[@class="correlation-degree"]') html = etree.HTML(html_str) div_list = html.xpath('//div[@class="recruit-wrap recruit-margin"]/div') # print(div_list) for div in div_list: '''title,工作简介,工作地点,发布时间,岗位类别,详情页链接''' job_name = self.get_text(div.xpath('.//h4[@class="recruit-title"]/text()')) job_loc = self.get_text(div.xpath('.//p[@class="recruit-tips"]/span[2]/text()')) job_gangwei = self.get_text(div.xpath('.//p/span[3]/text()') ) # -----岗位 job_time = self.get_text(div.xpath('.//p/span[4]/text()') ) # -----发布时间 item = {} item['职位'] = job_name item['地点'] = job_loc item['岗位'] = job_gangwei item['发布时间'] = job_time print(item) self.write_(item) def write_(self,item): with open('Tencent_job_100page.json', 'a+', encoding='utf-8') as file: json.dump(item, file) if __name__ == '__main__': base_url = 'https://careers.tencent.com/search.html?index=%s' for i in range(1,100): Tencent(base_url %i)

案例16:爬取腾讯招聘(ajax版+多线程版)

通过分析我们发现,腾讯招聘使用的是ajax的数据接口,因此我们直接去寻找ajax的数据接口链接。

import requests, json class Tencent(object): def __init__(self): self.base_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?' self.headers = { 'sec-fetch-mode': 'cors', 'sec-fetch-site': 'same-origin', 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36', 'referer': 'https://careers.tencent.com/search.html' } self.parse() def parse(self): for i in range(1, 3): params = { 'timestamp': '1572850838681', 'countryId': '', 'cityId': '', 'bgIds': '', 'productId': '', 'categoryId': '', 'parentCategoryId': '', 'attrId': '', 'keyword': '', 'pageIndex': str(i), 'pageSize': '10', 'language': 'zh-cn', 'area': 'cn' } response = requests.get(self.base_url, headers=self.headers, params=params) self.parse_json(response.text) def parse_json(self, text): # 将json字符串编程python内置对象 infos = [] json_dict = json.loads(text) for data in json_dict['Data']['Posts']: RecruitPostName = data['RecruitPostName'] CategoryName = data['CategoryName'] Responsibility = data['Responsibility'] LastUpdateTime = data['LastUpdateTime'] detail_url = data['PostURL'] item = {} item['RecruitPostName'] = RecruitPostName item['CategoryName'] = CategoryName item['Responsibility'] = Responsibility item['LastUpdateTime'] = LastUpdateTime item['detail_url'] = detail_url # print(item) infos.append(item) self.write_to_file(infos) def write_to_file(self, list_): for item in list_: with open('infos.txt', 'a+', encoding='utf-8') as fp: fp.writelines(str(item)) if __name__ == '__main__': t = Tencent()

改为多线程版后

import requests, json, threading class Tencent(object): def __init__(self): self.base_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?' self.headers = { 'sec-fetch-mode': 'cors', 'sec-fetch-site': 'same-origin', 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36', 'referer': 'https://careers.tencent.com/search.html' } self.parse() def parse(self): for i in range(1, 3): params = { 'timestamp': '1572850838681', 'countryId': '', 'cityId': '', 'bgIds': '', 'productId': '', 'categoryId': '', 'parentCategoryId': '', 'attrId': '', 'keyword': '', 'pageIndex': str(i), 'pageSize': '10', 'language': 'zh-cn', 'area': 'cn' } response = requests.get(self.base_url, headers=self.headers, params=params) self.parse_json(response.text) def parse_json(self, text): # 将json字符串编程python内置对象 infos = [] json_dict = json.loads(text) for data in json_dict['Data']['Posts']: RecruitPostName = data['RecruitPostName'] CategoryName = data['CategoryName'] Responsibility = data['Responsibility'] LastUpdateTime = data['LastUpdateTime'] detail_url = data['PostURL'] item = {} item['RecruitPostName'] = RecruitPostName item['CategoryName'] = CategoryName item['Responsibility'] = Responsibility item['LastUpdateTime'] = LastUpdateTime item['detail_url'] = detail_url # print(item) infos.append(item) self.write_to_file(infos) def write_to_file(self, list_): for item in list_: with open('infos.txt', 'a+', encoding='utf-8') as fp: fp.writelines(str(item)) if __name__ == '__main__': tencent = Tencent() t = threading.Thread(target=tencent.parse) t.start()

改成多线程版的线程类:

import requests, json, threading class Tencent(threading.Thread): def __init__(self, i): super().__init__() self.i = i self.base_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?' self.headers = { 'sec-fetch-mode': 'cors', 'sec-fetch-site': 'same-origin', 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36', 'referer': 'https://careers.tencent.com/search.html' } def run(self): self.parse() def parse(self): params = { 'timestamp': '1572850838681', 'countryId': '', 'cityId': '', 'bgIds': '', 'productId': '', 'categoryId': '', 'parentCategoryId': '', 'attrId': '', 'keyword': '', 'pageIndex': str(self.i), 'pageSize': '10', 'language': 'zh-cn', 'area': 'cn' } response = requests.get(self.base_url, headers=self.headers, params=params) self.parse_json(response.text) def parse_json(self, text): # 将json字符串编程python内置对象 infos = [] json_dict = json.loads(text) for data in json_dict['Data']['Posts']: RecruitPostName = data['RecruitPostName'] CategoryName = data['CategoryName'] Responsibility = data['Responsibility'] LastUpdateTime = data['LastUpdateTime'] detail_url = data['PostURL'] item = {} item['RecruitPostName'] = RecruitPostName item['CategoryName'] = CategoryName item['Responsibility'] = Responsibility item['LastUpdateTime'] = LastUpdateTime item['detail_url'] = detail_url # print(item) infos.append(item) self.write_to_file(infos) def write_to_file(self, list_): for item in list_: with open('infos.txt', 'a+', encoding='utf-8') as fp: fp.writelines(str(item) + '\n') if __name__ == '__main__': for i in range(1, 50): t = Tencent(i) t.start()

这样写的弊端是:每页都开一个线程,线程数量不受控制,页数一多就会耗尽系统资源甚至把程序拖垮,因此我们改用队列来分发任务,把线程数量固定下来。

import requests,json,time,threading from queue import Queue class Tencent(threading.Thread): def __init__(self,url,headers,name,q): super().__init__() self.url= url self.name = name self.q = q self.headers = headers def run(self): self.parse() def write_to_file(self,list_): with open('infos1.txt', 'a+', encoding='utf-8') as fp: for item in list_: fp.write(str(item)) def parse_json(self,text): #将json字符串编程python内置对象 infos = [] json_dict = json.loads(text) for data in json_dict['Data']['Posts']: RecruitPostName = data['RecruitPostName'] CategoryName = data['CategoryName'] Responsibility = data['Responsibility'] LastUpdateTime = data['LastUpdateTime'] detail_url = data['PostURL'] item = {} item['RecruitPostName'] = RecruitPostName item['CategoryName'] = CategoryName item['Responsibility'] = Responsibility item['LastUpdateTime'] = LastUpdateTime item['detail_url'] = detail_url # print(item) infos.append(item) self.write_to_file(infos) def parse(self): while True: if self.q.empty(): break page = self.q.get() print(f'==================第{page}页==========================in{self.name}') params = { 'timestamp': '1572850797210', 'countryId':'', 'cityId':'', 'bgIds':'', 'productId':'', 'categoryId':'', 'parentCategoryId':'', 'attrId':'', 'keyword':'', 'pageIndex': str(page), 'pageSize': '10', 'language': 'zh-cn', 'area': 'cn' } response = requests.get(self.url,params=params,headers=self.headers) self.parse_json(response.text) if __name__ == '__main__': start = time.time() base_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?' headers= { 'referer': 'https: // careers.tencent.com / search.html', 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36', 'sec-fetch-mode': 'cors', 'sec-fetch-site': 'same-origin' } #1创建任务队列 q = Queue() #2给队列添加任务,任务是每一页的页码 for page in range(1,50): q.put(page) # print(queue) # while not q.empty(): # print(q.get()) #3.创建一个列表 crawl_list = ['aa','bb','cc','dd','ee'] list_ = [] for name in crawl_list: t = Tencent(base_url,headers,name,q) t.start() list_.append(t) for l in list_: l.join() # 3.4171955585479736 print(time.time()-start)

案例17:爬取英雄联盟所有英雄名字和技能(selenium+phantomjs+ajax接口)

from selenium import webdriver from lxml import etree import requests, json driver = webdriver.PhantomJS() base_url = 'https://lol.qq.com/data/info-heros.shtml' driver.get(base_url) html = etree.HTML(driver.page_source) hero_url_list = html.xpath('.//ul[@id="jSearchHeroDiv"]/li/a/@href') hero_list = [] # 存放所有英雄的列表 for hero_url in hero_url_list: id = hero_url.split('=')[-1] # print(id) detail_url = 'https://game.gtimg.cn/images/lol/act/img/js/hero/' + id + '.js' # print(detail_url) headers = { 'Referer': 'https://lol.qq.com/data/info-defail.shtml?id =4', 'Sec-Fetch-Mode': 'cors', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36' } response = requests.get(detail_url, headers=headers) n = json.loads(response.text) hero = [] # 存放单个英雄 item_name = {} item_name['英雄名字'] = n['hero']['name'] + ' ' + n['hero']['title'] hero.append(item_name) for i in n['spells']: # 技能 item_skill = {} item_skill['技能名字'] = i['name'] item_skill['技能描述'] = i['description'] hero.append(item_skill) hero_list.append(hero) # print(hero_list) with open('hero.json','w') as file: json.dump(hero_list,file)

案例18:爬取豆瓣电影(requests+多线程)

需求:获得每个分类里的所有电影

import json import re, requests from lxml import etree # 获取网页的源码 def get_content(url, headers): response = requests.get(url, headers=headers) return response.text # 获取电影指定信息 def get_movie_info(text): text = json.loads(text) item = {} for data in text: score = data['score'] image = data['cover_url'] title = data['title'] actors = data['actors'] detail_url = data['url'] vote_count = data['vote_count'] types = data['types'] item['评分'] = score item['图片'] = image item['电影名'] = title item['演员'] = actors item['详情页链接'] = detail_url item['评价数'] = vote_count item['电影类别'] = types print(item) # 获取电影api数据的 def get_movie(type, url): headers = { 'X-Requested-With': 'XMLHttpRequest', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36', } n = 0 # 获取api数据,并判断分页 while True: text = get_content(url.format(type, n), headers=headers) if text == '[]': break get_movie_info(text) n += 20 # 主方法 def main(): base_url = 'https://movie.douban.com/chart' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36', 'Referer': 'https://movie.douban.com/explore' } html_str = get_content(base_url, headers=headers) # 分类页首页 html = etree.HTML(html_str) movie_urls = html.xpath('//div[@class="types"]/span/a/@href') # 获得每个分类的连接,但是切割type for url in movie_urls: p = re.compile('type=(.*?)&interval_id=') type_ = p.search(url).group(1) ajax_url = 'https://movie.douban.com/j/chart/top_list?type={}&interval_id=100%3A90&action=&start={}&limit=20' get_movie(type_, ajax_url) if __name__ == '__main__': main()

多线程

import json, threading import re, requests from lxml import etree from queue import Queue class DouBan(threading.Thread): def __init__(self, q=None): super().__init__() self.base_url = 'https://movie.douban.com/chart' self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36', 'Referer': 'https://movie.douban.com/explore' } self.q = q self.ajax_url = 'https://movie.douban.com/j/chart/top_list?type={}&interval_id=100%3A90&action=&start={}&limit=20' # 获取网页的源码 def get_content(self, url, headers): response = requests.get(url, headers=headers) return response.text # 获取电影指定信息 def get_movie_info(self, text): text = json.loads(text) item = {} for data in text: score = data['score'] image = data['cover_url'] title = data['title'] actors = data['actors'] detail_url = data['url'] vote_count = data['vote_count'] types = data['types'] item['评分'] = score item['图片'] = image item['电影名'] = title item['演员'] = actors item['详情页链接'] = detail_url item['评价数'] = vote_count item['电影类别'] = types print(item) # 获取电影api数据的 def get_movie(self): headers = { 'X-Requested-With': 'XMLHttpRequest', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36', } # 获取api数据,并判断分页 while True: if self.q.empty(): break n = 0 while True: text = self.get_content(self.ajax_url.format(self.q.get(), n), headers=headers) if text == '[]': break self.get_movie_info(text) n += 20 # 获取所有类型的type——id def get_types(self): html_str = self.get_content(self.base_url, headers=self.headers) # 分类页首页 html = etree.HTML(html_str) types = html.xpath('//div[@class="types"]/span/a/@href') # 获得每个分类的连接,但是切割type # print(types) type_list = [] for i in types: p = re.compile('type=(.*?)&interval_id=') # 筛选id,拼接到api接口的路由 type = p.search(i).group(1) type_list.append(type) return type_list def run(self): self.get_movie() if __name__ == '__main__': # 创建消息队列 q = Queue() # 将任务队列初始化,将我们的type放到消息队列中 t = DouBan() types = t.get_types() for tp in types: q.put(tp[0]) # 创建一个列表,列表的数量就是开启线程的树木 crawl_list = [1, 2, 3, 4] for crawl in crawl_list: # 实例化对象 movie = DouBan(q=q) movie.start()

案例19:爬取瓜子二手车的所有车(requests)

需求:获得每个车类型的所有信息

import json import requests, re from lxml import etree # 获取网页的源码 def get_content(url, headers): response = requests.get(url, headers=headers) return response.text # 获取子页原代码 def get_info(text): item = {} title_list = text.xpath('//ul[@class="carlist clearfix js-top"]/li/a/@title') price_list = text.xpath('//div[@class="t-price"]/p/text()') year_list = text.xpath('//div[@class="t-i"]/text()[1]') millon_list = text.xpath('//div[@class="t-i"]/text()[2]') picture_list = text.xpath('//ul[@class="carlist clearfix js-top"]/li/a/img/@src') details_list = text.xpath('//ul[@class="carlist clearfix js-top"]/li/a/@href') for i, title in enumerate(title_list): item['标题'] = title item['价格'] = price_list[i] + '万' item['公里数'] = millon_list[i] item['年份'] = year_list[i] item['照片链接'] = picture_list[i] item['详情页链接'] = 'https://www.guazi.com' + details_list[i] print(item) # 主函数 def main(): base_url = 'https://www.guazi.com/bj/buy/' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36', 'Cookie': 'track_id=7534369675321344; uuid=c129325e-6fea-4fd0-dea5-3632997e0419; antipas=wL2L859nHt69349594j71850u61; cityDomain=bj; clueSourceCode=10103000312%2300; user_city_id=12; ganji_uuid=6616956591030214317551; sessionid=5f3261c7-27a6-4bd6-e909-f70312d46c39; lg=1; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22pz_baidu%22%2C%22ca_n%22%3A%22tbmkbturl%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%2210103000312%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%227534369675321344%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22c129325e-6fea-4fd0-dea5-3632997e0419%22%2C%22ca_city%22%3A%22bj%22%2C%22sessionid%22%3A%225f3261c7-27a6-4bd6-e909-f70312d46c39%22%7D; preTime=%7B%22last%22%3A1572951901%2C%22this%22%3A1572951534%2C%22pre%22%3A1572951534%7D', } html = etree.HTML(get_content(base_url, headers)) brand_url_list = html.xpath('//div[@class="dd-all clearfix js-brand js-option-hid-info"]/ul/li/p/a/@href') for url in brand_url_list: headers = { 'Referer': 'https://www.guazi.com/bj/buy/', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36', 'Cookie': 'track_id=7534369675321344; uuid=c129325e-6fea-4fd0-dea5-3632997e0419; antipas=wL2L859nHt69349594j71850u61; cityDomain=bj; clueSourceCode=10103000312%2300; user_city_id=12; ganji_uuid=6616956591030214317551; sessionid=5f3261c7-27a6-4bd6-e909-f70312d46c39; lg=1; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22pz_baidu%22%2C%22ca_n%22%3A%22tbmkbturl%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%2210103000312%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%227534369675321344%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22c129325e-6fea-4fd0-dea5-3632997e0419%22%2C%22ca_city%22%3A%22bj%22%2C%22sessionid%22%3A%225f3261c7-27a6-4bd6-e909-f70312d46c39%22%7D; 
preTime=%7B%22last%22%3A1572953403%2C%22this%22%3A1572951534%2C%22pre%22%3A1572951534%7D', } brand_url = 'https://www.guazi.com' + url.split('/#')[0] + '/o%s/#bread' # 拼接每个品牌汽车的url for i in range(1, 3): html = etree.HTML(get_content(brand_url % i, headers=headers)) get_info(html) if __name__ == '__main__': main()

多线程:

import requests, threading from lxml import etree from queue import Queue class Guazi(threading.Thread): def __init__(self, list_=None): super().__init__() self.base_url = 'https://www.guazi.com/bj/buy/' self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36', 'Cookie': 'track_id=7534369675321344; uuid=c129325e-6fea-4fd0-dea5-3632997e0419; antipas=wL2L859nHt69349594j71850u61; cityDomain=bj; clueSourceCode=10103000312%2300; user_city_id=12; ganji_uuid=6616956591030214317551; sessionid=5f3261c7-27a6-4bd6-e909-f70312d46c39; lg=1; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22pz_baidu%22%2C%22ca_n%22%3A%22tbmkbturl%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%2210103000312%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%227534369675321344%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22c129325e-6fea-4fd0-dea5-3632997e0419%22%2C%22ca_city%22%3A%22bj%22%2C%22sessionid%22%3A%225f3261c7-27a6-4bd6-e909-f70312d46c39%22%7D; preTime=%7B%22last%22%3A1572951901%2C%22this%22%3A1572951534%2C%22pre%22%3A1572951534%7D', } self.list_ = list_ # 获取网页的源码 def get_content(self, url, headers): response = requests.get(url, headers=headers) return response.text # 获取子页原代码 def get_info(self, text): item = {} title_list = text.xpath('//ul[@class="carlist clearfix js-top"]/li/a/@title') price_list = text.xpath('//div[@class="t-price"]/p/text()') year_list = text.xpath('//div[@class="t-i"]/text()[1]') millon_list = text.xpath('//div[@class="t-i"]/text()[2]') picture_list = text.xpath('//ul[@class="carlist clearfix js-top"]/li/a/img/@src') details_list = text.xpath('//ul[@class="carlist clearfix js-top"]/li/a/@href') for i, title in enumerate(title_list): item['标题'] = title item['价格'] = price_list[i] + '万' item['公里数'] = millon_list[i] item['年份'] = year_list[i] item['照片链接'] = picture_list[i] item['详情页链接'] = 'https://www.guazi.com' + details_list[i] print(item) # 获取汽车链接列表 def get_carsurl(self): html = etree.HTML(self.get_content(self.base_url, self.headers)) brand_url_list = html.xpath('//div[@class="dd-all clearfix js-brand js-option-hid-info"]/ul/li/p/a/@href') brand_url_list = ['https://www.guazi.com' + url.split('/#')[0] + '/o%s/#bread' for url in brand_url_list] return brand_url_list def run(self): while True: if self.list_.empty(): break url = self.list_.get() for i in range(1, 3): html = etree.HTML(self.get_content(url % i, headers=self.headers)) self.get_info(html) if __name__ == '__main__': q = Queue() gz = Guazi() cars_url = gz.get_carsurl() for car in cars_url: q.put(car) # 创建一个列表,列表的数量就是开启线程的树木 crawl_list = [1, 2, 3, 4] for crawl in crawl_list: # 实例化对象 car = Guazi(list_=q) car.start()

结果:


案例20:爬取链家网北京每个区域的所有房子(selenium+Phantomjs+多线程)

#爬取链家二手房信息。 # 要求: # 1.爬取的字段: # 名称,房间规模、价格,建设时间,朝向,详情页链接 # 2.写三个文件: # 1.简单py 2.面向对象 3.改成多线程 from selenium import webdriver from lxml import etree def get_element(url): driver.get(url) html = etree.HTML(driver.page_source) return html lis = [] # 存放所有区域包括房子 driver = webdriver.PhantomJS() html = get_element('https://bj.lianjia.com/ershoufang/') city_list = html.xpath('//div[@data-role="ershoufang"]/div/a/@href') city_name_list = html.xpath('//div[@data-role="ershoufang"]/div/a/text()') for num, city in enumerate(city_list): item = {} # 存放一个区域 sum_house = [] # 存放每个区域的房子 item['区域'] = city_name_list[num] # 城区名字 for page in range(1, 3): city_url = 'https://bj.lianjia.com' + city + 'pg' + str(page) html = get_element(city_url) '''名称, 房间规模,建设时间, 朝向, 详情页链接''' title_list = html.xpath('//div[@class="info clear"]/div/a/text()') # 所有标题 detail_url_list = html.xpath('//div[@class="info clear"]/div/a/@href') # 所有详情页 detail_list = html.xpath('//div[@class="houseInfo"]/text()') # 该页所有的房子信息列表, city_price_list = html.xpath('//div[@class="totalPrice"]/span/text()') for i, content in enumerate(title_list): house = {} detail = detail_list[i].split('|') house['名称'] = content # 名称 house['价格']=city_price_list[i]+'万'#价格 house['规模'] = detail[0] + detail[1] # 规模 house['建设时间'] = detail[-2] # 建设时间 house['朝向'] = detail[2] # 朝向 house['详情链接'] = detail_url_list[i] # 详情链接 sum_house.append(house) item['二手房'] = sum_house print(item) lis.append(item)

面向对象+多线程:

import json, threading from selenium import webdriver from lxml import etree from queue import Queue class Lianjia(threading.Thread): def __init__(self, city_list=None, city_name_list=None): super().__init__() self.driver = webdriver.PhantomJS() self.city_name_list = city_name_list self.city_list = city_list def get_element(self, url): # 获取element对象的 self.driver.get(url) html = etree.HTML(self.driver.page_source) return html def get_city(self): html = self.get_element('https://bj.lianjia.com/ershoufang/') city_list = html.xpath('//div[@data-role="ershoufang"]/div/a/@href') city_list = ['https://bj.lianjia.com' + url + 'pg%s' for url in city_list] city_name_list = html.xpath('//div[@data-role="ershoufang"]/div/a/text()') return city_list, city_name_list def run(self): lis = [] # 存放所有区域包括房子 while True: if self.city_name_list.empty() and self.city_list.empty(): break item = {} # 存放一个区域 sum_house = [] # 存放每个区域的房子 item['区域'] = self.city_name_list.get() # 城区名字 for page in range(1, 3): # print(self.city_list.get()) html = self.get_element(self.city_list.get() % page) '''名称, 房间规模,建设时间, 朝向, 详情页链接''' title_list = html.xpath('//div[@class="info clear"]/div/a/text()') # 所有标题 detail_url_list = html.xpath('//div[@class="info clear"]/div/a/@href') # 所有详情页 detail_list = html.xpath('//div[@class="houseInfo"]/text()') # 该页所有的房子信息列表, for i, content in enumerate(title_list): house = {} detail = detail_list[i].split('|') house['名称'] = content # 名称 house['规模'] = detail[0] + detail[1] # 规模 house['建设时间'] = detail[-2] # 建设时间 house['朝向'] = detail[2] # 朝向 house['详情链接'] = detail_url_list[i] # 详情链接 sum_house.append(house) item['二手房'] = sum_house lis.append(item) print(item) if __name__ == '__main__': q1 = Queue()#路由 q2 = Queue()#名字 lj = Lianjia() city_url, city_name = lj.get_city() for c in city_url: q1.put(c) for c in city_name: q2.put(c) # 创建一个列表,列表的数量就是开启线程的数量 crawl_list = [1, 2, 3, 4, 5] for crawl in crawl_list: # 实例化对象 LJ = Lianjia(city_name_list=q2,city_list=q1) LJ.start()

结果:


案例21:爬取笔趣阁的所有小说(requests)

import requests from lxml import etree headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36', 'Referer': 'http://www.xbiquge.la/7/7931/', 'Cookie': '_abcde_qweasd=0; BAIDU_SSP_lcr=https://www.baidu.com/link?url=jUBgtRGIR19uAr-RE9YV9eHokjmGaII9Ivfp8FJIwV7&wd=&eqid=9ecb04b9000cdd69000000035dc3f80e; Hm_lvt_169609146ffe5972484b0957bd1b46d6=1573124137; _abcde_qweasd=0; bdshare_firstime=1573124137783; Hm_lpvt_169609146ffe5972484b0957bd1b46d6=1573125463', 'Accept-Encoding': 'gzip, deflate' } # 获取网站源码 def get_text(url, headers): response = requests.get(url, headers=headers) response.encoding = 'utf-8' return response.text # 获取小说的信息 def get_novelinfo(list1, name_list): for i, url in enumerate(list1): html = etree.HTML(get_text(url, headers)) name = name_list[i] # 书名 title_url = html.xpath('//div[@id="list"]/dl/dd/a/@href') title_url = ['http://www.xbiquge.la' + i for i in title_url] # 章节地址 titlename_list = html.xpath('//div[@id="list"]/dl/dd/a/text()') # 章节名字列表 get_content(title_url, titlename_list, name) # # 获取小说每章节的内容 def get_content(url_list, title_list, name): for i, url in enumerate(url_list): item = {} html = etree.HTML(get_text(url, headers)) content_list = html.xpath('//div[@id="content"]/text()') content = ''.join(content_list) content=content+'\n' item['title'] = title_list[i] item['content'] = content.replace('\r\r', '\n').replace('\xa0', ' ') print(item) with open(name + '.txt', 'a+',encoding='utf-8') as file: file.write(item['title']+'\n') file.write(item['content']) def main(): base_url = 'http://www.xbiquge.la/xiaoshuodaquan/' html = etree.HTML(get_text(base_url, headers)) novelurl_list = html.xpath('//div[@class="novellist"]/ul/li/a/@href') name_list = html.xpath('//div[@class="novellist"]/ul/li/a/text()') get_novelinfo(novelurl_list, name_list) if __name__ == '__main__': main()

多线程

import requests, threading from lxml import etree from queue import Queue class Novel(threading.Thread): def __init__(self, novelurl_list=None, name_list=None): super().__init__() self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36', 'Referer': 'http://www.xbiquge.la/7/7931/', 'Cookie': '_abcde_qweasd=0; BAIDU_SSP_lcr=https://www.baidu.com/link?url=jUBgtRGIR19uAr-RE9YV9eHokjmGaII9Ivfp8FJIwV7&wd=&eqid=9ecb04b9000cdd69000000035dc3f80e; Hm_lvt_169609146ffe5972484b0957bd1b46d6=1573124137; _abcde_qweasd=0; bdshare_firstime=1573124137783; Hm_lpvt_169609146ffe5972484b0957bd1b46d6=1573125463', 'Accept-Encoding': 'gzip, deflate' } self.novelurl_list = novelurl_list self.name_list = name_list # 获取网站源码 def get_text(self, url): response = requests.get(url, headers=self.headers) response.encoding = 'utf-8' return response.text # 获取小说的信息 def get_novelinfo(self): while True: if self.name_list.empty() and self.novelurl_list.empty(): break url = self.novelurl_list.get() # print(url) html = etree.HTML(self.get_text(url)) name = self.name_list.get() # 书名 # print(name) title_url = html.xpath('//div[@id="list"]/dl/dd/a/@href') title_url = ['http://www.xbiquge.la' + i for i in title_url] # 章节地址 titlename_list = html.xpath('//div[@id="list"]/dl/dd/a/text()') # 章节名字列表 self.get_content(title_url, titlename_list, name) # # 获取小说每章节的内容 def get_content(self, url_list, title_list, name): for i, url in enumerate(url_list): item = {} html = etree.HTML(self.get_text(url)) content_list = html.xpath('//div[@id="content"]/text()') content = ''.join(content_list) content = content + '\n' item['title'] = title_list[i] item['content'] = content.replace('\r\r', '\n').replace('\xa0', ' ') print(item) with open(name + '.txt', 'a+', encoding='utf-8') as file: file.write(item['title'] + '\n') file.write(item['content']) #------------------通过多线程,返回每本书的名字和每本书的连接 def get_name_url(self): base_url = 'http://www.xbiquge.la/xiaoshuodaquan/' html = etree.HTML(self.get_text(base_url)) novelurl_list = html.xpath('//div[@class="novellist"]/ul/li/a/@href') name_list = html.xpath('//div[@class="novellist"]/ul/li/a/text()') return novelurl_list, name_list def run(self): self.get_novelinfo() if __name__ == '__main__': n = Novel() url_list, name_list = n.get_name_url() name_queue = Queue() url_queue = Queue() for url in url_list: url_queue.put(url) for name in name_list: name_queue.put(name) crawl_list = [1, 2, 3, 4, 5] # 定义五个线程 for crawl in crawl_list: # 实例化对象 novel = Novel(name_list=name_queue, novelurl_list=url_queue) novel.start()

结果:


案例22:爬取菜鸟教程的python100例

```python
import requests
from lxml import etree

base_url = 'https://www.runoob.com/python/python-exercise-example%s.html'

def get_element(url):
    headers = {
        'cookie': '__gads=Test; Hm_lvt_3eec0b7da6548cf07db3bc477ea905ee=1573454862,1573470948,1573478656,1573713819; Hm_lpvt_3eec0b7da6548cf07db3bc477ea905ee=1573714018; SERVERID=fb669a01438a4693a180d7ad8d474adb|1573713997|1573713863',
        'referer': 'https://www.runoob.com/python/python-100-examples.html',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    return etree.HTML(response.text)

def write_py(i, text):
    with open('练习实例%s.py' % i, 'w', encoding='utf-8') as file:
        file.write(text)

def main():
    for i in range(1, 101):
        html = get_element(base_url % i)
        content = '题目:' + html.xpath('//div[@id="content"]/p[2]/text()')[0] + '\n'
        fenxi = html.xpath('//div[@id="content"]/p[position()>=2]/text()')[0]
        daima = ''.join(html.xpath('//div[@class="hl-main"]/span/text()')) + '\n'
        haha = '"""\n' + content + fenxi + daima + '\n"""'
        write_py(i, haha)
        print(fenxi)

if __name__ == '__main__':
    main()
```


案例23:爬取新浪微博头条前20页(ajax+mysql)

```python
import requests, pymysql
from lxml import etree


def get_element(i):
    base_url = 'https://weibo.com/a/aj/transform/loadingmoreunlogin?'
    headers = {
        'Referer': 'https://weibo.com/?category=1760',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    params = {
        'ajwvr': '6',
        'category': '1760',
        'page': i,
        'lefnav': '0',
        'cursor': '',
        '__rnd': '1573735870072',
    }
    response = requests.get(base_url, headers=headers, params=params)
    response.encoding = 'utf-8'
    info = response.json()
    return etree.HTML(info['data'])


def main():
    for i in range(1, 21):  # 前20页
        html = get_element(i)
        # 标题,发布人,发布时间,详情链接
        title = html.xpath('//a[@class="S_txt1"]/text()')
        author_time = html.xpath('//span[@class]/text()')
        author = [author_time[i] for i in range(len(author_time)) if i % 2 == 0]
        time = [author_time[i] for i in range(len(author_time)) if i % 2 == 1]
        url = html.xpath('//a[@class="S_txt1"]/@href')
        for j, tit in enumerate(title):
            title1 = tit
            time1 = time[j]
            url1 = url[j]
            author1 = author[j]
            # print(title1, url1, time1, author1)
            connect_mysql(title1, time1, author1, url1)


def connect_mysql(title, time, author, url):
    db = pymysql.connect(host='localhost', user='root', password='123456', database='news')
    cursor = db.cursor()
    sql = 'insert into sina_news(title,send_time,author,url) values("' + title + '","' + time + '","' + author + '","' + url + '")'
    print(sql)
    cursor.execute(sql)
    db.commit()
    cursor.close()
    db.close()


if __name__ == '__main__':
    main()
```

提前创建数据库 news 和表 sina_news:

```sql
create table sina_news(
    id int not null auto_increment primary key,
    title varchar(100),
    send_time varchar(100),
    author varchar(20),
    url varchar(100)
);
```
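顺带一提,案例23里的 insert 语句是靠字符串拼接出来的,标题里一旦出现双引号就会让 SQL 报错,也有注入风险。如果想更稳一些,可以用 pymysql 的参数化查询,下面是一个示意写法(表结构沿用上面的 sina_news,函数名为假设):

```python
import pymysql


# 示意:用参数化查询代替字符串拼接,标题里含引号也不会破坏 SQL
def save_news(title, send_time, author, url):
    db = pymysql.connect(host='localhost', user='root', password='123456', database='news')
    try:
        with db.cursor() as cursor:
            sql = 'insert into sina_news(title, send_time, author, url) values (%s, %s, %s, %s)'
            cursor.execute(sql, (title, send_time, author, url))
        db.commit()
    finally:
        db.close()
```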


案例24:爬取搜狗指定图片(requests+多线程)

```python
import requests, json, threading, time, os
from queue import Queue


class Picture(threading.Thread):
    # 初始化
    def __init__(self, num, search, url_queue=None):
        super().__init__()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
        }
        self.num = num
        self.search = search
        self.url_queue = url_queue

    # 拼接要爬取的每一页图片接口url
    def get_url(self):
        url_list = []
        for start in range(self.num):
            url = 'https://pic.sogou.com/pics?query=' + self.search + '&mode=1&start=' + str(
                start * 48) + '&reqType=ajax&reqFrom=result&tn=0'
            url_list.append(url)
        return url_list

    # 获取每页的接口资源详情
    def get_page(self, url):
        response = requests.get(url, headers=self.headers)
        return response.text

    def run(self):
        while True:
            # 如果队列为空代表指定页数爬取完毕
            if self.url_queue.empty():
                break
            url = self.url_queue.get()  # 本页接口地址
            data = json.loads(self.get_page(url))  # 本页图片接口资源
            try:
                # 每页最多48张图片
                for i in range(1, 49):
                    pic = data['items'][i]['pic_url']
                    response = requests.get(pic)
                    # 如果文件夹不存在,则创建
                    if not os.path.exists(r'C:/Users/Administrator/Desktop/' + self.search):
                        os.mkdir(r'C:/Users/Administrator/Desktop/' + self.search)
                    with open(r'C:/Users/Administrator/Desktop/' + self.search + '/%s.jpg' % (
                            str(time.time()).replace('.', '_')), 'wb') as f:
                        f.write(response.content)
                    print('下载成功!')
            except Exception:
                # 不足48张或下载失败时结束本页
                print('该页图片保存完毕')


if __name__ == '__main__':
    # 1.获取初始化的爬取url
    num = int(input('请输入爬取页数(每页48张):'))
    content = input('请输入爬取内容:')
    pic = Picture(num, content)
    url_list = pic.get_url()
    # 2.创建队列
    url_queue = Queue()
    for i in url_list:
        url_queue.put(i)
    # 3.创建线程任务
    crawl = [1, 2, 3, 4, 5]
    for i in crawl:
        pic = Picture(num, content, url_queue=url_queue)
        pic.start()
```

案例25:爬取链家网北京所有房子(requests+多线程)

链家:https://bj.fang.lianjia.com/loupan/

1、获取所有区域的拼音
2、根据拼音去拼接url,获取所有的数据。
3、列表页:楼盘名称,均价,建筑面积,区域,商圈
   详情页:户型([“8室5厅8卫”, “4室2厅3卫”, “5室2厅2卫”]),朝向,图片(列表),用户点评(选爬)

难点1:当该区没有房子时,“猜你喜欢”模块会和有房子时的列表块使用同样的class,因此需要额外判断。
难点2:每个区的页数被js隐藏,无法直接取到。列表页url形如 https://bj.fang.lianjia.com/loupan/区拼音/pg页数,可以发现一个规律:明明只有三页,写pg5时会跳回第一页。因此用while循环翻页,当已收集到的房子链接数等于该区的最大房源数时,代表该区爬取完毕(见下面的示意和完整代码)。
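把难点2的终止逻辑单独抽出来看会更清楚。下面是一个最小示意(在下文完整代码的基础上简化,区拼音、请求头等均为假设):

```python
import requests
from lxml import etree

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # 示意用,实际请求头见下文完整代码


def crawl_district(url_tpl, max_house):
    """url_tpl 形如 'https://bj.fang.lianjia.com/loupan/chaoyang/pg%s'(区拼音为假设),
    max_house 是第一页上取到的该区房源总数。"""
    house_url, i = [], 1
    while True:
        html = etree.HTML(requests.get(url_tpl % i, headers=HEADERS).text)
        house_url += html.xpath('//ul[@class="resblock-list-wrapper"]/li/a/@href')
        i += 1
        # 超过最大页数时网站会跳回第一页,所以不能靠"本页为空"来停,
        # 而是等已收集的链接数追上该区房源总数
        if len(house_url) >= int(max_house):
            break
    return house_url
```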

完整代码:

```python
import requests
from lxml import etree


# 获取网页源码
def get_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    return response.text


# 获取城市拼音列表
def get_city_url():
    url = 'https://bj.fang.lianjia.com/loupan/'
    html = etree.HTML(get_html(url))
    city = html.xpath('//div[@class="filter-by-area-container"]/ul/li/@data-district-spell')
    city_url = ['https://bj.fang.lianjia.com/loupan/{}/pg%s'.format(i) for i in city]
    return city_url


# 爬取对应区的所有房子url
def get_detail(url):
    # 使用第一页来判断是否有分页
    html = etree.HTML(get_html(url % (1)))
    empty = html.xpath('//div[@class="no-result-wrapper hide"]')
    if len(empty) != 0:  # 不存在此标签代表没有猜你喜欢
        i = 1
        max_house = html.xpath('//span[@class="value"]/text()')[0]
        house_url = []
        while True:  # 分页
            html = etree.HTML(get_html(url % (i)))
            house_url += html.xpath('//ul[@class="resblock-list-wrapper"]/li/a/@href')
            i += 1
            if len(house_url) == int(max_house):
                break
        detail_url = ['https://bj.fang.lianjia.com/' + i for i in house_url]  # 该区所有房子的url
        info(detail_url)


# 获取每个房子的详细信息
def info(url):
    for i in url:
        item = {}
        page = etree.HTML(get_html(i))
        item['name'] = page.xpath('//h2[@class="DATA-PROJECT-NAME"]/text()')[0]
        item['price_num'] = page.xpath('//span[@class="price-number"]/text()')[0] + page.xpath(
            '//span[@class="price-unit"]/text()')[0]
        detail_page = etree.HTML(get_html(i + 'xiangqing'))
        item['type'] = detail_page.xpath('//ul[@class="x-box"]/li[1]/span[2]/text()')[0]
        item['address'] = detail_page.xpath('//ul[@class="x-box"]/li[5]/span[2]/text()')[0]
        item['shop_address'] = detail_page.xpath('//ul[@class="x-box"]/li[6]/span[2]/text()')[0]
        print(item)


def main():
    # 1、获取所有的城市的拼音
    city = get_city_url()
    # 2、根据拼音去拼接url,获取所有的数据。
    for url in city:
        get_detail(url)


if __name__ == '__main__':
    main()
```

多线程版:

```python
import requests, threading
from lxml import etree
from queue import Queue
import pymongo


class House(threading.Thread):
    def __init__(self, q=None):
        super().__init__()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
        }
        self.q = q

    # 获取网页源码
    def get_html(self, url):
        response = requests.get(url, headers=self.headers)
        return response.text

    # 获取城市拼音列表
    def get_city_url(self):
        url = 'https://bj.fang.lianjia.com/loupan/'
        html = etree.HTML(self.get_html(url))
        city = html.xpath('//div[@class="filter-by-area-container"]/ul/li/@data-district-spell')
        city_url = ['https://bj.fang.lianjia.com/loupan/{}/pg%s'.format(i) for i in city]
        return city_url

    # 爬取对应区的所有房子url
    def get_detail(self, url):
        # 使用第一页来判断是否有分页
        html = etree.HTML(self.get_html(url % (1)))
        empty = html.xpath('//div[@class="no-result-wrapper hide"]')
        if len(empty) != 0:  # 不存在此标签代表没有猜你喜欢
            i = 1
            max_house = html.xpath('//span[@class="value"]/text()')[0]
            house_url = []
            while True:  # 分页
                html = etree.HTML(self.get_html(url % (i)))
                house_url += html.xpath('//ul[@class="resblock-list-wrapper"]/li/a/@href')
                i += 1
                if len(house_url) == int(max_house):
                    break
            detail_url = ['https://bj.fang.lianjia.com/' + i for i in house_url]  # 该区所有房子的url
            self.info(detail_url)

    # 获取每个房子的详细信息
    def info(self, url):
        for i in url:
            item = {}
            page = etree.HTML(self.get_html(i))
            item['name'] = page.xpath('//h2[@class="DATA-PROJECT-NAME"]/text()')[0]
            item['price_num'] = page.xpath('//span[@class="price-number"]/text()')[0] + page.xpath(
                '//span[@class="price-unit"]/text()')[0]
            detail_page = etree.HTML(self.get_html(i + 'xiangqing'))
            item['type'] = detail_page.xpath('//ul[@class="x-box"]/li[1]/span[2]/text()')[0]
            item['address'] = detail_page.xpath('//ul[@class="x-box"]/li[5]/span[2]/text()')[0]
            item['shop_address'] = detail_page.xpath('//ul[@class="x-box"]/li[6]/span[2]/text()')[0]
            print(item)

    def run(self):
        # 1、获取所有的城市的拼音
        # city = self.get_city_url()
        # 2、根据拼音去拼接url,获取所有的数据。
        while True:
            if self.q.empty():
                break
            self.get_detail(self.q.get())


if __name__ == '__main__':
    # 1.先获取区列表
    house = House()
    city_list = house.get_city_url()
    # 2.将区加入队列
    q = Queue()
    for i in city_list:
        q.put(i)
    # 3.创建线程任务
    a = [1, 2, 3, 4]
    for i in a:
        p = House(q)
        p.start()
```
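多线程版里导入了 pymongo 但实际并没有用到。如果想把抓到的 item 存进 MongoDB 而不是只 print,可以参考下面的示意(数据库名 lianjia、集合名 house 都是假设的):

```python
import pymongo

# 示意:数据库名 lianjia、集合名 house 均为假设
client = pymongo.MongoClient('mongodb://localhost:27017/')
collection = client['lianjia']['house']


def save_item(item: dict):
    # MongoClient 本身是线程安全的,多个爬虫线程可以共用同一个 client
    collection.insert_one(item)
```

在 `info` 方法里把 `print(item)` 换成(或加上)`save_item(item)` 即可落库。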