福利爬虫妹子图之获取种子url

mac · 2022-06-30 18

"""Seed-URL crawler for mzitu.com.

Walks the listing pages, follows each album, computes the URL of every image
in the album, and bulk-inserts seed records (status "0" = not downloaded yet)
into MongoDB for a separate downloader to consume.
"""
import os
import uuid
import logging
import datetime

import aiofiles
from lxml import html
from ruia import Spider, Request
from ruia_ua import middleware

from aiohttp探究.db import MotorBase

# Listing-page URL template; {} is the 1-based page number.
demo = "https://www.mzitu.com/page/{}/"


class BaiduImgSpider(Spider):
    start_urls = []
    img_path = 'data/'  # base directory for downloaded images

    async def parse(self, res):
        """Parse one listing page: extract album links/titles, follow each album.

        :param res: ruia response for a listing page (res.html is the page text)
        """
        # Bind the DB handle lazily, on first response.
        self.mongo_db = MotorBase().get_db('img_data')
        root = html.fromstring(res.html)
        url_list = root.xpath("//ul[@id='pins']/li/a/@href")
        name_list = root.xpath("//ul[@id='pins']/li/a/img/@alt")
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'referer': 'https://www.mzitu.com/mm/',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        }
        # zip pairs each album title with its URL (the intermediate copy loop
        # in the original added nothing).
        for name, url in zip(name_list, url_list):
            yield Request(url, headers=headers, callback=self.next_page,
                          metadata={"name": name}, res_type='text')

    async def next_page(self, res):
        """Parse an album page and insert one seed record per image in the album.

        :param res: ruia response for an album page; res.metadata carries "name"
        """
        root = html.fromstring(res.html)
        name = res.metadata.get("name")
        refere_url = res.url
        # xpath selecting the last page number of the album's pager
        max_page_list = "//div[@class='pagenavi']/a[last()-1]/span/text()"
        _max_page_num = root.xpath(max_page_list)
        max_page_num = _max_page_num[0] if _max_page_num else None
        img_url_node = root.xpath("//div[@class='main-image']/p/a/img/@src")
        img_url = img_url_node[0] if img_url_node else None
        if img_url is None:
            # Layout changed or the request was blocked — nothing to record.
            return
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'if-modified-since': 'Thu, 15 Nov 2018 04:24:11 GMT',
            'if-none-match': '"5becf4eb-1b7d4"',
            'referer': refere_url,
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        }
        datas = []
        # BUG FIX: every record gets its OWN copy of the headers dict. The
        # original stored the same dict in every record and then mutated its
        # "referer" in the loop below, so all previously appended records
        # ended up with the last page's referer.
        data1 = {'url': img_url, "status": "0", 'title': name, "img_id": "1",
                 "headers": dict(headers),
                 "crawler_date": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
        datas.append(data1)
        # BUG FIX: int(None) raised TypeError when the pager xpath matched
        # nothing; treat a missing pager as a single-image album.
        last_page = int(max_page_num) if max_page_num else 1
        for page in range(2, last_page + 1):
            page_headers = dict(headers)
            page_headers["referer"] = f"{refere_url}{str(page).zfill(2)}"
            # Image files are numbered 01.jpg, 02.jpg, ... within an album.
            next_img_url = img_url.replace("01.", f"{str(page).zfill(2)}.")
            data2 = {'url': next_img_url, "status": "0", 'title': name,
                     "img_id": page, "headers": page_headers,
                     "crawler_date": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
            datas.append(data2)
        await self.mongo_db.mzitu2.insert_many(datas)

    async def save_img(self, res):
        """Write downloaded image bytes to <img_path>/<album name>/<img_id>.<ext>.

        :param res: ruia response whose .html holds raw bytes (res_type='bytes')
        """
        url = res.metadata.get("url")
        _parts = url.rsplit(".", 1)
        # BUG FIX: rsplit always returns a non-empty list, so the original
        # truthiness guard never triggered and a dot-less URL raised
        # IndexError; check the split actually produced an extension.
        img_type = _parts[1] if len(_parts) == 2 else None
        name = res.metadata.get("name")
        img_id = res.metadata.get("id")
        img_all_path = f"{self.img_path}{name}/"
        if not os.path.exists(img_all_path):
            os.makedirs(img_all_path)
        img_name = f"{img_id}.{img_type}"
        async with aiofiles.open(img_all_path + img_name, 'wb') as fp:
            await fp.write(res.html)
        logging.info('Img downloaded successfully in {dir}'.format(dir=img_all_path + img_name))


if __name__ == '__main__':
    word = '妹子图'  # directory name
    pages = 201  # number of listing pages
    BaiduImgSpider.img_path = word + "/"
    # BUG FIX: listing pages are 1-based; range(pages) generated the invalid
    # page 0 URL. range(1, pages + 1) crawls pages 1..pages.
    BaiduImgSpider.start_urls = [demo.format(page) for page in range(1, pages + 1)]
    BaiduImgSpider.start(middleware=middleware)

db.py — MongoDB (Motor) connection helper used by the crawler above

"""Small helper that hands out cached Motor (async MongoDB) database handles."""
import asyncio

from motor.motor_asyncio import AsyncIOMotorClient


class MotorBase:
    """Cache-backed factory for Motor database instances.

    About motor's doc: https://github.com/mongodb/motor
    """

    # Class-level caches: every MotorBase instance shares these, so each
    # database name is connected at most once per process.
    _db = {}
    _collection = {}

    def __init__(self, loop=None):
        """Remember the event loop to bind clients to (default: current loop)."""
        self.motor_uri = ''
        self.loop = loop or asyncio.get_event_loop()

    def client(self, db):
        """Build a fresh AsyncIOMotorClient pointed at *db* on localhost:27017."""
        self.motor_uri = f"mongodb://localhost:27017/{db}"
        return AsyncIOMotorClient(self.motor_uri, io_loop=self.loop)

    def get_db(self, db='test'):
        """Return the motor db instance named *db*, creating and caching it on first use.

        :param db: database name
        :return: the motor db instance
        """
        if db not in self._db:
            self._db[db] = self.client(db)[db]
        return self._db[db]

转载于:https://www.cnblogs.com/c-x-a/p/10014425.html

相关资源:BT种子清洗神器
最新回复(0)