An example of using async for in a web crawler

The snippet below turns two ordinary blocking generators, one that fetches pages with urlopen and one that extracts anchor hrefs with a regex, into asynchronous generators, so both can be consumed with async for while the blocking work runs on a thread-pool executor.


import asyncio
import re
import typing
from concurrent.futures import Executor, ThreadPoolExecutor
from urllib.request import urlopen

DEFAULT_EXECUTOR = ThreadPoolExecutor(4)
ANCHOR_TAG_PATTERN = re.compile(
    b"<a.+?href=[\"\'](.*?)[\"\'].*?>",
    re.RegexFlag.MULTILINE | re.RegexFlag.IGNORECASE,
)


async def wrap_async(generator: typing.Generator,
                     executor: Executor = DEFAULT_EXECUTOR,
                     sentinel=None,
                     *,
                     loop: typing.Optional[asyncio.AbstractEventLoop] = None):
    """
    Wrap a blocking generator and return an asynchronous generator instead.

    :param generator: the blocking generator to wrap
    :param executor: the executor that runs next() calls off the event loop
    :param sentinel: the value that signals exhaustion
    :param loop: the event loop to use (defaults to the running loop)
    :return: an asynchronous generator yielding the generator's items
    """
    if not loop:
        loop = asyncio.get_running_loop()
    while True:
        # Equivalent to calling next(generator, sentinel), but on a worker thread
        result = await loop.run_in_executor(executor, next, generator, sentinel)
        if result == sentinel:
            # The sentinel came back, so the generator is exhausted
            break
        yield result


def follow(*links):
    """
    Fetch each link with a blocking urlopen() call.

    :param links: URLs to fetch
    :return: a generator of (url, response body) tuples
    """
    return ((link, urlopen(link).read()) for link in links)


def get_links(text: bytes):
    """
    Get back an iterator that yields all the links in a text iteratively and safely.

    :param text: the HTML body to scan
    :return: a generator of href values
    """
    # Always grab the last match, because that is how a smart HTTP parser
    # would interpret a malformed anchor tag
    return (match.groups()[-1]
            for match in ANCHOR_TAG_PATTERN.finditer(text)
            # This portion is a safeguard against None matches and zero href matches
            if hasattr(match, "groups") and len(match.groups()))


async def main(*links):
    async for current, body in wrap_async(follow(*links)):
        print("Current url:", current)
        print("Content:", body)
        async for link in wrap_async(get_links(body)):
            print(link)


asyncio.run(main("https://www.cnblogs.com/c-x-a"))
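The same wrap_async helper works for any blocking generator, not just network calls. Below is a minimal sketch (not from the original post) that drives a trivial local generator with async for; the names numbers and demo are made up for illustration, and the sketch assumes the wrap_async definition above is already in scope:

import asyncio

def numbers():
    # An ordinary blocking generator; a hypothetical stand-in for follow()
    for i in range(3):
        yield i

async def demo():
    # wrap_async calls next() on a worker thread, so the event loop
    # stays responsive while we iterate with async for
    async for n in wrap_async(numbers()):
        print(n)  # prints 0, 1, 2

asyncio.run(demo())

The design point is that next() itself may block (for example inside urlopen), so pushing each next() call onto an executor is what keeps the event loop free while the crawler iterates.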

Reposted from: https://www.cnblogs.com/c-x-a/p/11028456.html
