贪心学院 Scrapy Crawler


Generating a spider

scrapy genspider <spider_name> <domain>
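For this project the spider would presumably have been generated as follows (the spider name and domain are taken from stock.py below):

scrapy genspider stock pycs.greedyai.com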

Opening a shell for debugging

scrapy shell <url>
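The shell is handy for trying XPath expressions before committing them to the spider; for example, using this project's start URL and the expressions from stock.py:

scrapy shell http://pycs.greedyai.com/
>>> response.xpath("//a/@href").extract()
>>> response.xpath('//td[@class="tc name"]/a/text()').extract()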

Spider body: stock.py

# -*- coding: utf-8 -*-
import re
from urllib import parse

import scrapy

from stock_spider.items import StockItem


class StockSpider(scrapy.Spider):
    name = 'stock'
    allowed_domains = ['pycs.greedyai.com']  # domain (no trailing slash)
    start_urls = ['http://pycs.greedyai.com/']  # start URL

    def parse(self, response):
        # collect the sub-page links
        post_urls = response.xpath("//a/@href").extract()
        for post_url in post_urls:
            # join each relative href into an absolute, reachable URL
            yield scrapy.Request(url=parse.urljoin(response.url, post_url),
                                 callback=self.parse_detail, dont_filter=True)

    def parse_detail(self, response):
        stock_item = StockItem()
        # board members
        stock_item['names'] = self.get_name(response)
        # gender; disabled because some members have no gender info,
        # which made the list index go out of range later
        # stock_item['sexs'] = self.get_sex(response)
        # stock code
        stock_item['codes'] = self.get_code(response)
        # member positions
        stock_item['positions'] = self.get_position(response)
        yield stock_item

    def get_name(self, response):
        name = response.xpath('//td[@class="tc name"]/a/text()').extract()
        return name

    def get_sex(self, response):
        sex_temp = response.xpath('//td[@class="intro"]/text()').extract()
        sex_list = []
        for sex_info in sex_temp:
            try:
                sex = re.findall(r"男|女", sex_info)[0]
                sex_list.append(sex)
            except IndexError:
                # on this exception just keep reading: per the course video,
                # the useful data is surrounded by some useless escape characters
                continue
        return sex_list

    def get_code(self, response):
        code_temp = response.xpath("/html/body/div[3]/div[1]/div[2]/div[1]/h1/a/@title").extract()
        code = []
        for code_info in code_temp:
            code = re.findall(r"\d+", code_info)
        return code

    def get_position(self, response):
        position = response.xpath('//td[@class="tl"]/text()').extract()
        return position
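parse.urljoin is what makes the extracted hrefs reachable: relative links are resolved against the current page URL, while absolute links pass through unchanged. A quick illustration (the hr.html path is hypothetical):

from urllib import parse

parse.urljoin("http://pycs.greedyai.com/", "hr.html")
# -> 'http://pycs.greedyai.com/hr.html'
parse.urljoin("http://pycs.greedyai.com/", "http://example.com/x")
# -> 'http://example.com/x' (absolute URLs pass through)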

main.py

from scrapy.cmdline import execute  # for debugging: run the spider from inside the IDE
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "stock"])
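Running main.py inside an IDE (so breakpoints work) is equivalent to running the crawl command from the project root; Scrapy's -o option can additionally export the scraped items to a file:

scrapy crawl stock
scrapy crawl stock -o stock.json    # also export the items as JSON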

items.py

# -*- coding: utf-8 -*-
import scrapy


class StockSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class StockItem(scrapy.Item):  # newly added
    names = scrapy.Field()
    # sexs = scrapy.Field()
    codes = scrapy.Field()
    positions = scrapy.Field()
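A StockItem behaves like a dict whose keys are restricted to the declared fields; a minimal sketch of how the spider fills it (the values here are made up):

from stock_spider.items import StockItem

item = StockItem()
item['names'] = ['张三', '李四']        # hypothetical values
item['codes'] = ['000001']
item['positions'] = ['董事长', '董事']
print(dict(item))                       # Items convert cleanly to plain dicts
# item['age'] = 50  # would raise KeyError: 'age' is not a declared field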

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os


class StockSpiderPipeline(object):
    def process_item(self, item, spider):
        return item


class StockPipeline(object):  # newly added
    def __init__(self):
        # the file is created once, when the pipeline class is instantiated;
        # a+ appends if the file exists and creates it otherwise
        self.file = open("executive_prep.csv", "a+")

    def process_item(self, item, spider):
        # if the file is still empty, first write the header:
        # 姓名,性别,股票代码,职位 (name, gender, stock code, position)
        if not os.path.getsize("executive_prep.csv"):
            self.file.write("姓名,性别,股票代码,职位\n")
        self.write_content(item)
        self.file.flush()
        return item

    def write_content(self, item):
        names = item['names']
        # sexs = item['sexs']
        codes = item['codes']
        positions = item['positions']
        for i in range(len(names)):
            # the gender column is left empty because sexs is disabled
            result = names[i] + ",," + codes[0] + "," + positions[i] + "\n"
            self.file.write(result)
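The hand-rolled concatenation above breaks if a name or position ever contains a comma, and the file handle is never closed. A sketch of an alternative using the standard csv module and Scrapy's open_spider/close_spider hooks (the class name StockCsvPipeline is made up; it is not part of the original project):

import csv
import os


class StockCsvPipeline(object):
    def open_spider(self, spider):
        # remember whether the header still needs to be written
        need_header = (not os.path.exists("executive_prep.csv")
                       or os.path.getsize("executive_prep.csv") == 0)
        self.file = open("executive_prep.csv", "a+", newline="", encoding="utf-8")
        self.writer = csv.writer(self.file)  # handles quoting of embedded commas
        if need_header:
            self.writer.writerow(["姓名", "性别", "股票代码", "职位"])

    def process_item(self, item, spider):
        code = item['codes'][0] if item['codes'] else ""
        for name, position in zip(item['names'], item['positions']):
            self.writer.writerow([name, "", code, position])  # gender left empty
        return item

    def close_spider(self, spider):
        self.file.close()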

settings.py

# -*- coding: utf-8 -*-

BOT_NAME = 'stock_spider'

SPIDER_MODULES = ['stock_spider.spiders']
NEWSPIDER_MODULE = 'stock_spider.spiders'

# newly added
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'stock_spider.pipelines.StockSpiderPipeline': 300,
    'stock_spider.pipelines.StockPipeline': 300,  # newly added
}
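ITEM_PIPELINES values range from 0 to 1000 and items flow through the pipelines in ascending order. Since both pipelines above share the value 300, their relative order is an implementation detail; a variant with distinct values (the numbers are arbitrary) makes the order explicit:

ITEM_PIPELINES = {
    'stock_spider.pipelines.StockSpiderPipeline': 300,  # runs first (lower value)
    'stock_spider.pipelines.StockPipeline': 400,        # runs second
}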

Reposted from: https://www.cnblogs.com/j-c-y/p/11461677.html
