多线程爬取图片

mac2026-01-09  8

""" 多线程彭于晏 """ import requests from urllib import request import os import re from queue import Queue import threading from urllib.parse import urlencode class Procuder(threading.Thread): #生产者 headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36', 'X-Requested-With': 'XMLHttpRequest' } def __init__(self,page_queue,imag_queue,*args,**kwargs): #初始化 两个队列 super().__init__(*args,**kwargs) self.page_queue = page_queue self.imag_queue = imag_queue def run(self): while True: if self.page_queue.empty(): #判断 如果队列为空 就退出循环 break url = self.page_queue.get() self.pares_page(url) def pares_page(self,url): response = requests.get(url,headers = self.headers) response.encoding = 'utf-8' if response.status_code == 200: response = response.json() data = response.get('data') if data: for item in data: image = item.get('hoverURL') name = item.get('fromPageTitleEnc') if image != None: # name = re.sub(r'[\??\.,。\!\*()/>]', '', name) reg = "[^0-9A-Za-z\u4e00-\u9fa5]" # 使用删除字符串里的符号 ?!,。之类的 保留允许存在的命名的字符串 name = re.sub(reg, '', name) suffix = os.path.splitext(image)[1] filename = name + suffix self.imag_queue.put((image, filename)) #将图片URL和名字放入队列(元组) request.urlretrieve(image, 'images/' + filename) # 下载保存 print(filename + '下载完成') else: print('None') # html = etree.HTML(text) # imgs = html.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]') class Consumer(threading.Thread): def __init__(self,page_queue,imag_queue,*args,**kwargs): super().__init__(*args,**kwargs) self.page_queue = page_queue self.imag_queue = imag_queue def run(self): while True: if self.page_queue.empty() and self.imag_queue.empty(): break image, filename = self.imag_queue.get() # 将图片URL和名字从队列取出 元组的解开方式 request.urlretrieve(image, 'images/' + filename) # 下载保存 print(filename + '下载完成') def main(): page_queue = Queue(200) # 页面队列容量 imag_queue = Queue(1000) # 图片URL队列容量 pn = 30 for x in range(30): pn = pn*x data = { "tn": "resultjson_com", # 这里不能有空格 否则会打印的是其他的网页源码 "ipn": " rj", "ct": "201326592", "queryWord": "彭于晏图片", "ie": "utf-8", "oe": "utf-8", "adpicid": "", "copyright": "", "word": " 彭于晏图片", "pn": pn, } url = "https://image.baidu.com/search/acjson?" + urlencode(data) page_queue.put(url) for i in range(10): #生产者线程 t = Procuder(page_queue,imag_queue) t.start() for i in range(10): #消费者线程 t = Consumer(page_queue, imag_queue) t.start() if __name__ == '__main__': main()
最新回复(0)