Using a queue with aiohttp

Fetch Baidu search results, then follow each of Baidu's long redirect links to obtain the real URL.

import time
import asyncio
from asyncio import Queue
from itertools import product

import aiofiles
import aiohttp
import async_timeout
from lxml import etree

MAX_THREADS = 50  # number of concurrent worker coroutines


class BaiduSpider:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}
        self.q = Queue()    # search-result-page URLs to fetch
        self.q2 = Queue()   # [title, redirect link] pairs to resolve
        self.seen = set()   # de-duplicate resolved URLs in memory

    def url_generator(self):
        # One search URL per (keyword, page) pair; pn is the result offset.
        with open('keyword.txt', 'r', encoding='utf8') as f:
            for key in product(f, range(0, 5)):
                yield f"https://www.baidu.com/s?wd={key[0].strip()}&pn={key[1]}"

    async def fetch(self, session, url):
        try:
            async with async_timeout.timeout(1):
                async with session.get(url, headers=self.headers) as resp:
                    if resp.status in [200, 201]:
                        return await resp.text()
        except Exception:
            pass

    async def work(self, session):
        # Worker: keep pulling search URLs until the queue is drained.
        while not self.q.empty():
            url = await self.q.get()
            html = await self.fetch(session, url)
            await self.parser(session, html)
            self.q.task_done()

    async def parser(self, session, html):
        if html:
            tree = etree.HTML(html)
            for data in tree.xpath('//h3[@class="t"]/a'):
                title = data.xpath('string(.)')
                link = data.xpath('@href')[0]
                if title:
                    self.q2.put_nowait([title, link])
            await self.work2(session)

    async def work2(self, session):
        # Follow each Baidu redirect link; resp2.url holds the real URL.
        while not self.q2.empty():
            data = await self.q2.get()
            try:
                async with async_timeout.timeout(1):
                    async with session.get(data[1], headers=self.headers) as resp2:
                        print(resp2.url, data[0])
                        # Skip URLs already written. (The original checked
                        # membership in the filename string, which never worked.)
                        if str(resp2.url) not in self.seen:
                            self.seen.add(str(resp2.url))
                            async with aiofiles.open('links.txt', 'a', encoding='utf-8') as fd:
                                await fd.write(f"{data[0]},{resp2.url}\n")
            except Exception:
                pass
            finally:
                self.q2.task_done()

    async def download(self):
        conn = aiohttp.TCPConnector(ssl=False)  # avoid SSL certificate errors
        for url in self.url_generator():
            self.q.put_nowait(url)
        async with aiohttp.ClientSession(connector=conn) as session:
            tasks = [asyncio.ensure_future(self.work(session))
                     for _ in range(MAX_THREADS)]
            await asyncio.wait(tasks)

    def run(self):
        start_time = time.time()
        loop = asyncio.get_event_loop()
        loop.run_until_complete(self.download())
        print(f'Total time: {time.time() - start_time} seconds')


if __name__ == '__main__':
    baidu = BaiduSpider()
    baidu.run()
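The redirect-resolution step above relies on aiohttp's default behavior: session.get() follows HTTP redirects, so resp.url is the final destination rather than the www.baidu.com/link?... wrapper that Baidu puts in search results. A minimal standalone sketch of just that step (the commented-out link is a hypothetical placeholder):

import asyncio
import aiohttp

async def resolve(url: str) -> str:
    # aiohttp follows redirects by default (allow_redirects=True),
    # so resp.url is the URL after the final hop, not the one requested.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            return str(resp.url)

# Hypothetical Baidu redirect link; any URL that answers with a 302 works:
# print(asyncio.run(resolve("https://www.baidu.com/link?url=...")))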

Reposted from: https://www.cnblogs.com/c-x-a/p/10668977.html
