Python爬虫基础--爬取车模照片

mac2022-06-30  119

import os
import urllib
from urllib import request, parse

from lxml import etree


class CarModel:
    """Crawl a Baidu Tieba forum and save the images posted in its threads.

    The crawl walks the forum's listing pages, follows every thread link
    found on each listing page, and downloads every ``<img>`` whose class is
    ``BDE_Image`` into ``../image/``.

    Args:
        search_name: Forum keyword sent as the ``kw`` query parameter.
        search_page: Last listing page to crawl (inclusive, 1-based).
        begin_page: First listing page to crawl (1-based).
    """

    def __init__(self, search_name='车模', search_page=50, begin_page=1):
        self.name = search_name
        self.url = 'https://tieba.baidu.com/f?'
        self.search_page = search_page
        self.begin_page = begin_page
        self.tie_ba_list = []
        self.number = 0
        # The key must be spelled "User-Agent". urllib only capitalizes the
        # key it is given, so "User_agent" would go out as an unrelated
        # header and the default "Python-urllib" agent would still be sent.
        self.header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

    def _fetch(self, link):
        """Return the raw response body of ``link`` as bytes.

        The request carries ``self.header`` and the response is closed as
        soon as the body has been read.
        """
        req = urllib.request.Request(link, headers=self.header)
        with urllib.request.urlopen(req) as resp:
            return resp.read()

    def download_img(self, link, page_num, index01, index02):
        """Download one image and store it under ``../image/``.

        Args:
            link: URL of the image.
            page_num: Listing-page index, first part of the file name.
            index01: Thread index within the page, second part of the name.
            index02: Image index within the thread, third part of the name.
        """
        img_data = self._fetch(link)
        path = '../image/{0}_{1}_{2}.jpg'.format(page_num, index01, index02)
        # Create the target directory on demand so a fresh checkout does not
        # fail with FileNotFoundError on the first image.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'wb') as img_file:
            img_file.write(img_data)

    def find_image(self, link, page_num, index01):
        """Download every ``BDE_Image`` image found in one thread page.

        Args:
            link: URL of the thread page.
            page_num: Listing-page index, forwarded to ``download_img``.
            index01: Thread index within the page, forwarded as well.
        """
        html = self._fetch(link)  # fetch the thread's HTML
        new_html = etree.HTML(html)  # parse it into an element tree
        # Extract the image URLs with XPath.
        image_link = new_html.xpath('//img[@class="BDE_Image"]/@src')
        # Images are numbered from 1 within the thread.
        for img_index, img_url in enumerate(image_link, start=1):
            self.download_img(img_url, page_num, index01, img_index)

    def find_link(self, link, page_num):
        """Visit every thread linked from one forum listing page.

        Args:
            link: URL of the listing page.
            page_num: Listing-page index, forwarded to ``find_image``.
        """
        html = self._fetch(link).decode('utf-8')
        new_html = etree.HTML(html)
        # Locate the hyperlinks that lead to the individual threads.
        link_list = new_html.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
        # Threads are numbered from 1 within the listing page; the hrefs are
        # site-relative, so the host is prepended.
        for thread_index, href in enumerate(link_list, start=1):
            tmp_link = 'https://tieba.baidu.com{0}'.format(href)
            self.find_image(tmp_link, page_num, thread_index)

    def begin(self):
        """Crawl listing pages ``begin_page`` through ``search_page``."""
        for page in range(self.begin_page, self.search_page + 1):
            # The ``pn`` parameter advances in steps of 50 per listing page
            # (presumably 50 threads per page -- confirm against the site).
            tmp_pn = (page - 1) * 50
            words_01 = urllib.parse.urlencode({'kw': self.name})
            words_02 = urllib.parse.urlencode({'pn': tmp_pn})
            tmp_url = '{0}{1}&ie=utf-8&{2}'.format(self.url, words_01, words_02)
            # Pass the zero-based page index as an int. ``tmp_pn / 50`` is
            # true division in Python 3 and would put "0.0" in file names.
            self.find_link(tmp_url, page - 1)


if __name__ == '__main__':
    car = CarModel()
    car.begin()

最终爬取效果

 

转载于:https://www.cnblogs.com/LLWH134/p/10376302.html

相关资源:JAVA上百实例源码以及开源项目
最新回复(0)