内涵段子所有段子Spider

mac2024-04-20  42

代码不是很完美, 抓取的数据中少数几个带有html标签! ╮(╯▽╰)╭ 只怪自己太菜, 还没有想好怎么把正则表达式写得再完美一点。

抓取代码

# encoding=utf-8
"""Spider that scrapes every joke page from www.neihanshu.net into neihan.txt."""
import json
import re


class NeihanSpider:
    """Crawl the paginated joke listing and append each joke to ``neihan.txt``.

    NOTE(review): a few scraped items may still contain stray HTML tags —
    the extraction regex is best-effort, as the original author noted.
    """

    def __init__(self):
        # Pretend to be a desktop Chrome browser so the site serves normal pages.
        self.headers = {"user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"}
        # Page URL template: page 1 is "index.html", page N >= 2 is "index_N.html".
        self.url = "http://www.neihanshu.net/text/index{}{}.html"
        self.sum = 0  # running count of jokes written across all pages
        # Compile once and reuse; re.S lets '.' cross newlines inside the div.
        self._content_re = re.compile(r"<div class=\"article-body\".*?<p>([^&quot;].*?)</p>", re.S)

    def get_url_list(self, url, last_page=134):
        """Build the full list of listing-page URLs.

        Args:
            url: unused; kept only for backward compatibility with callers
                that pass ``self.url`` (as ``run`` does).
            last_page: number of the final listing page. Defaults to 134,
                the site's page count when this spider was written.

        Returns:
            list[str]: page 1 first, then pages 2..last_page.
        """
        url_list = [self.url.format("", "")]
        url_list.extend(self.url.format("_", page) for page in range(2, last_page + 1))
        return url_list

    def parse_url(self, url):
        """GET *url* with the browser headers and return the decoded body text."""
        # Local import: the third-party dependency is only needed when actually
        # crawling, so the module stays importable without requests installed.
        import requests
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, html_str):
        """Extract joke texts (first <p> of each article body) from *html_str*."""
        return self._content_re.findall(html_str)

    def save_content_list(self, content_list, page_number):
        """Append *content_list* to ``neihan.txt``, numbering each joke globally.

        *page_number* is zero-based; the progress message prints it one-based.
        """
        with open("neihan.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                self.sum += 1  # global counter: which joke this is overall
                f.write(str(self.sum) + " " + json.dumps(content, ensure_ascii=False) + "\n\n")
        print("第{}页保存成功".format(page_number + 1))

    def run(self):
        """Crawl every listing page: fetch, extract, save."""
        # Build the URL list once.
        url_list = self.get_url_list(self.url)
        # enumerate gives the page index directly; the original used
        # url_list.index(url), an O(n) scan on every iteration.
        for page_index, url in enumerate(url_list):
            html_str = self.parse_url(url)
            content_list = self.get_content_list(html_str)
            self.save_content_list(content_list, page_index)


if __name__ == "__main__":
    neihanspider = NeihanSpider()
    neihanspider.run()
最新回复(0)