import os
import urllib
from urllib import request, parse

from lxml import etree
class CarModel:
    """Crawl a Baidu Tieba forum and save the images found in its threads.

    The crawl walks the forum's listing pages (50 threads per page offset),
    follows every thread link found on each listing page, and downloads every
    ``<img class="BDE_Image">`` found in each thread into ``../image/``.
    """

    def __init__(self, search_name='车模', search_page=50, begin_page=1):
        """Store the crawl configuration.

        Args:
            search_name: Forum keyword sent as the ``kw`` query parameter.
            search_page: Last listing page to crawl (inclusive, 1-based).
            begin_page: First listing page to crawl (1-based).
        """
        self.name = search_name
        self.url = 'https://tieba.baidu.com/f?'
        self.search_page = search_page
        self.begin_page = begin_page
        # The two attributes below are initialised but not used by any
        # method of this class; they are kept for backward compatibility.
        self.tie_ba_list = []
        self.number = 0
        # The header name must be the hyphenated 'User-Agent'; the former
        # 'User_agent' key was sent as an unrelated header, so the server
        # still saw urllib's default user agent.
        self.header = {
            'User-Agent':
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

    def _fetch(self, link, headers=None):
        """Return the raw response body of *link* as bytes.

        The response is closed even if reading fails.
        """
        req = urllib.request.Request(link, headers=headers or {})
        with urllib.request.urlopen(req) as response:
            return response.read()

    def download_img(self, link, page_num, index01, index02):
        """Download one image and save it under ``../image/``.

        Args:
            link: URL of the image.
            page_num: Listing-page index, first part of the file name.
            index01: Thread index within the page, second part of the name.
            index02: Image index within the thread, third part of the name.
        """
        img_data = self._fetch(link)
        path = '../image/{0}_{1}_{2}.jpg'.format(page_num, index01, index02)
        # Create the output directory on demand so the first download does
        # not fail when it is missing.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'wb') as file:
            file.write(img_data)

    def find_image(self, link, page_num, index01):
        """Download every ``BDE_Image`` image found in the thread at *link*.

        Args:
            link: URL of the thread page.
            page_num: Listing-page index, forwarded to ``download_img``.
            index01: Thread index within the page, forwarded likewise.
        """
        # Fetch the thread's HTML and parse it.
        tree = etree.HTML(self._fetch(link, self.header))
        # Extract the image URLs with XPath.
        image_links = tree.xpath('//img[@class="BDE_Image"]/@src')
        # Images are numbered from 1 within the thread.
        for image_num, image_link in enumerate(image_links, start=1):
            self.download_img(image_link, page_num, index01, image_num)

    def find_link(self, link, page_num):
        """Visit every thread linked from the listing page at *link*.

        Args:
            link: URL of the forum listing page.
            page_num: Listing-page index, forwarded to ``find_image``.
        """
        html = self._fetch(link, self.header).decode('utf-8')
        tree = etree.HTML(html)
        # Collect the thread hyperlinks (site-relative hrefs).
        thread_paths = tree.xpath(
            '//div[@class="threadlist_lz clearfix"]/div/a/@href')
        # Threads are numbered from 1 within the listing page.
        for thread_num, thread_path in enumerate(thread_paths, start=1):
            thread_url = 'https://tieba.baidu.com{0}'.format(thread_path)
            self.find_image(thread_url, page_num, thread_num)

    def begin(self):
        """Crawl listing pages ``begin_page`` through ``search_page``."""
        for page in range(self.begin_page, self.search_page + 1):
            # Zero-based page index; kept as an int so that file names read
            # "0_1_1.jpg" rather than "0.0_1_1.jpg" (true division of the
            # offset by 50 would produce a float).
            page_index = page - 1
            query = urllib.parse.urlencode(
                {'kw': self.name, 'ie': 'utf-8', 'pn': page_index * 50})
            self.find_link(self.url + query, page_index)
if __name__ == '__main__':
    # Script entry point: crawl with the default keyword and page range.
    CarModel().begin()
# Final crawl results
# Reposted from: https://www.cnblogs.com/LLWH134/p/10376302.html
# Related resources: hundreds of Java example sources and open-source projects