Crawler: Scraping the Entire QQ说说乐园 (qqssly.com) Site

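The script below crawls the 说说 (status-post) site www.qqssly.com end to end, in three stages: Show_Download_List scrapes the category menu from the front page and asks the user to pick one; Pages_Collect walks every listing page of that category by repeatedly following the 下一页 ("next page") link, recording each post's URL and title; Pages_Download then fetches every collected post, extracts its text, and saves it under E:/段子/, ten posts per numbered sub-folder.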

import requests
from bs4 import BeautifulSoup
import re
import os

# ------------------------------------------------------------------
# Page collection
# ------------------------------------------------------------------

# Fetch a page and return a parsed BeautifulSoup tree (None on failure)
def GetHTML(url):
    headers = {'User-Agent': "Mozilla/5.0"}
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        # Round-trip through GB2312 to drop characters the site's encoding cannot represent
        html = r.text.encode(r.encoding, "ignore").decode("gb2312", "ignore")
        return BeautifulSoup(html, "lxml")
    except:
        print("Fetch failed:", url)

# Collect the post links (an <a> inside each <dt>) on one listing page
def PageCollect(url):
    global PageList
    soup = GetHTML(url)
    for dt in soup.findAll("dt"):
        a = dt.find("a")
        PageList.append([a.attrs["href"], a.attrs["title"]])

# Follow the 下一页 ("next page") link, collect that page, and return its URL
def handover_Collect(url):
    soup = GetHTML(url)
    a = soup.find("a", text="下一页")  # raises AttributeError when no next page exists
    url = base + a.attrs["href"]       # base is the category URL chosen at the bottom of the script
    print(url)
    PageCollect(url)
    return url

# Walk every listing page of a category
def Pages_Collect(base):
    PageCollect(base)  # first listing page
    url = base
    while 1:
        try:
            url = handover_Collect(url)  # raises once no 下一页 link is left
        except:
            break

# ------------------------------------------------------------------
# Single-page download
# ------------------------------------------------------------------

# Extract the post body from <ul class="arzw"> and split it into lines
def GetTEXT(soup):
    content = soup.find("ul", class_="arzw").text
    content = re.sub("\n\n\n\r\n\r\n ", "", content)
    return content.split("\n\r\n")

# Write one post to E:/段子/<num>.<category>/<title>.doc
def SaveTEXT(TextList, title, num):
    global filename
    file_savePath = "E:/段子/"
    filedir = file_savePath + str(num) + "." + filename + "/"
    if not os.path.exists(filedir):
        os.makedirs(filedir)  # makedirs also creates E:/段子/ itself on the first run
    with open(filedir + title + ".doc", mode="w", encoding="utf-8") as f:
        for line in TextList:
            f.write(line + "\n")
    print(title, "download complete")

def PageDownload(url, title, num):
    soup = GetHTML(url)
    TextList = GetTEXT(soup)
    SaveTEXT(TextList, title, num)

# ------------------------------------------------------------------
# Multi-page download
# ------------------------------------------------------------------

def Pages_Download():
    global PageList
    base = "http://www.qqssly.com"
    for i, (href, title) in enumerate(PageList):
        num = (i // 10) + 1  # group every ten posts into a numbered sub-folder
        PageDownload(base + href, title, num)

# ------------------------------------------------------------------
# Choose what to download
# ------------------------------------------------------------------

# Scrape the category menu (<ul class="dht2">) from the front page and print the choices
def Show_Download_List():
    global Download_Option
    soup = GetHTML("http://www.qqssly.com/")
    menu = soup.find("ul", class_="dht2")
    for a in menu.findAll("a"):
        Download_Option.append([a.attrs["href"], a.text])
    print("Which category do you want to download?")
    for i in range(len(Download_Option) - 2):  # the last two menu links are not categories
        print(i + 1, Download_Option[i][1])

# ------------------------------------------------------------------

Download_Option = []
PageList = []
Show_Download_List()
try:
    Number = int(input("Enter the category number: "))
    base = Download_Option[Number - 1][0]      # category URL, read as a global by handover_Collect
    filename = Download_Option[Number - 1][1]  # category name, read as a global by SaveTEXT
    Pages_Collect(base)
    Pages_Download()
except:
    print("Invalid input")
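Two design choices are worth noting: pagination terminates through the bare except in Pages_Collect (once no 下一页 anchor exists, handover_Collect raises and the loop breaks), and GetHTML sends requests with no timeout, so one stalled connection can hang the whole crawl. Below is a minimal sketch of a more defensive fetch; GetHTML_safe is a hypothetical name, and the retry count and timeout are assumptions rather than values from the original script:

import time
import requests
from bs4 import BeautifulSoup

def GetHTML_safe(url, retries=3, timeout=10):
    # Fetch url with a timeout and a simple retry loop; return a soup or None.
    # retries/timeout are assumed defaults, not taken from the original script.
    headers = {'User-Agent': "Mozilla/5.0"}
    for attempt in range(retries):
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return BeautifulSoup(r.text, "lxml")
        except requests.RequestException:
            time.sleep(1)  # brief pause before retrying
    print("Fetch failed:", url)
    return None

Callers should check for None before using the returned soup; the original GetHTML also returns None on failure, which is why the downstream bare except blocks are load-bearing.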