python爬虫 小说下载

mac2024-07-24  63

笔趣阁的网页结构比较简单,但也有点乱,需要注意细节。

# TODO: add a run log (duration, time of run, saved/missing/new chapters).
# -*- coding: utf8 -*-
# Download novels from https://www.xbiquge.cc/
#   catalog page example: https://www.xbiquge.cc/book/9860/
#   chapter page example: https://www.xbiquge.cc/book/9860/7063460.html
# Vocabulary: catalog = 目录, chapter = 章节.
# Regex notes:
#   r'[\u4e00-\u9fa5]+'  matches one or more Chinese characters
#   r'\d{1,10}'          chapter link number; links live in the 2nd div of class box_con
import requests
import json
import re
import time
import os
import sys
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Cm

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}

url = input('please input url:')
if len(url) < 24:
    # Anything shorter than a full catalog url falls back to a default book
    # (convenient for testing).
    # url = 'https://www.xbiquge.cc/book/9860/'
    url = 'https://www.xbiquge.cc/book/14779/'

rootPath = r'C:\Users\QQ\Desktop\ls\py\{}'
# Novel name; overwritten by saveCatalog() when that runs.  Exactly one of the
# hard-coded name below and the saveCatalog() call must be active.
# name = '我的微信连三界 狼烟新书'
name = '一世兵王 我本疯狂新书'


def getCatalog():
    """Load the cached catalog page and extract every chapter's url and name.

    Populates the globals ``cul`` (chapter url list) and ``cnl`` (chapter name
    list).  ``saveCatalog`` only needs to run once per novel to cache the
    catalog html to disk; afterwards ``findAllChapter`` works from the cache.
    """

    def saveCatalog():
        # Download the catalog page and cache it to <root>/<name>/目录.txt.
        rep = requests.get(url, headers=headers)
        print(rep.text[:10])
        rep.encoding = 'gbk'  # the site serves gbk-encoded pages
        soup = BeautifulSoup(rep.text, 'lxml')
        title = soup.title.contents[0]
        print(title)
        global name
        # Page title looks like '<novel> <author>_<...>_<...>'.
        name = re.findall('(.+?) ', title)[0] + ' ' + re.findall('_(.+?)_', title)[0]
        print(name)
        mkDir(path=rootPath.format(name))  # folder for every file of this novel
        f1 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name, '目录')
        # Explicit encoding: the decoded text may contain characters the
        # Windows locale codec cannot represent.
        with open(f1, 'w', encoding='utf-8') as f:
            f.write(rep.text)

    # saveCatalog()  # only needs to run once per novel

    def findAllChapter():
        f1 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name, '目录')
        f2 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name, '章节链接')
        with open(f1, 'r', encoding='utf-8') as f:
            rep = f.read()
        # Chapter links live inside the element with id="list".
        s = str(BeautifulSoup(rep, 'lxml').find(id='list'))
        anchors = BeautifulSoup(s, 'lxml').findAll('a')
        global cul, cnl
        cul = re.findall(r'\d{7,8}\.html', s)  # chapter url list (dot escaped)
        # Chapter name list: '第', the space and '章' are all optional so that
        # malformed entries like '010 章 …' or '137章' (no title) still match.
        cnl = re.findall(r'>(第?\d{1,4} ?章? ?.*?)<', s)
        print(len(anchors), len(cul), len(cnl))
        print('len(cul):', len(cul), 'len(cnl):', len(cnl))
        # Sanity-check the regexes against each <a> tag.  Bounded by the real
        # list lengths (the original hard-coded range(0, 1588)).
        for i in range(min(len(anchors), len(cul), len(cnl))):
            c = str(anchors[i])
            cu = re.search(r'\d{7,8}\.html', c).group()
            cn = c[c.index('.html') + 7:-4]
            if cu != cul[i] or cn != cnl[i]:
                print(cu, cul[i], cu == cul[i], cn, cnl[i], cn == cnl[i])
                break
        if len(cul) == len(cnl):
            # Persist '<url><name>' per line.  Written exactly once (the
            # original wrote f2 unconditionally and then again here).
            with open(f2, 'w', encoding='utf-8') as f:
                for u, n in zip(cul, cnl):
                    f.write(u + n + '\n')
            print('All url and name of chapters from source have been saved in this file:{}'.format(f2))
        else:
            # The regular expression no longer matches the page layout.
            print('Rules require changes the regular expression')

    findAllChapter()


def mkDir(path):
    """Create *path* (including parents) if it does not exist yet."""
    if not os.path.exists(path):
        os.makedirs(path)


def missingChapter():
    """Parse chapter numbers out of ``cnl`` and report gaps in the sequence.

    Returns the list of parsed chapter numbers with a leading sentinel 0.
    NOTE(review): chapters numbered in Chinese (e.g. 第十章) are not matched
    by the \\d pattern — known limitation.
    """
    nl = [0]  # chapter number list; sentinel 0 so the first diff works
    ml = []   # missing chapter numbers
    for cn in cnl:
        nl.append(int(re.search(r'\d{1,4}', cn).group()))
        # Compare the number just appended with its predecessor.  The original
        # used nl[i] - nl[i-1], which at i == 0 compared against nl[-1] (the
        # element just appended) — an off-by-one that broke gap detection.
        d = nl[-1] - nl[-2] - 1
        while d > 0:
            ml.append(nl[-1] - d)
            print("missing chapters' number:{}!!!".format(ml[-1]), d)
            d -= 1
    return nl


def saveChapter():
    """Download every not-yet-saved chapter and append it to <name>.txt."""
    f3 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name, name)
    # Resume position.  Computed once up front — the original called modify()
    # twice inside zip(), re-reading the whole novel file both times.
    start = modify()
    with open(f3, 'a', encoding='utf-8') as f:
        for cu, cn in zip(cul[start:], cnl[start:]):
            rep = requests.get(url + cu, headers=headers)
            rep.encoding = 'gbk'
            content = ''
            # Story paragraphs are the lines indented with four &nbsp; entities.
            for line in rep.text.splitlines():
                m = re.findall(r'&nbsp;&nbsp;&nbsp;&nbsp;(.+)<', line)
                if m:
                    content += m[0] + '\n'
            if len(content) > 1200:
                f.write(content)
                f.write('\n')
                print('contents has been writen to file which from : {} {}'.format(cu, cn))
            else:
                # Chapters shorter than 1200 chars are treated as broken pages;
                # stop so the run can be resumed from here next time.
                f.write('\n')
                print(content)
                print('There are problems in this chapter : {} {} !!!'.format(cu, cn))
                break


def runlog():
    # TODO: record run duration, run time, saved chapters, missing chapters,
    # newly added chapters, etc.
    pass


def modify():
    """Post-process the saved novel text and return the resume index.

    Copies <name>.txt to modify.txt, re-inserting a chapter heading (from
    ``cnl``) after any blank separator line that is not followed by one, and
    counts the blank separators to learn how many chapters are already saved.
    Also checks the text for ads / stray characters / empty chapters via cc().
    """
    f3 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name, name)
    f4 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name, 'modify')
    if not os.path.exists(f3):
        return 0  # nothing saved yet — start from the first chapter
    with open(f3, 'r', encoding='utf-8') as f, open(f4, 'w', encoding='utf-8') as fs:
        cc(f)  # prints character statistics and rewinds the file
        c = 0
        li = f.readlines()
        for n, line in enumerate(li):
            fs.write(line)
            if line == '\n' and n < len(li) - 1:
                c += 1
                # A blank separator not followed by a heading means the
                # chapter title is missing — re-insert it from cnl.
                if '第' not in li[n + 1] and '章' not in li[n + 1]:
                    fs.write(cnl[c] + '\n')
    if c < len(cnl):  # guard: c == len(cnl) once every chapter is saved
        print('c :', c, 'cnl[c] :', cnl[c])
    return c


def cc(file):
    """Count characters in *file* by class and print statistics (cc = count characters).

    Any character outside the known classes is dumped to 'other characters.txt'
    for inspection — if that file is non-empty the text is probably mojibake.
    Rewinds *file* to the start when done.
    """
    f00 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name, 'other characters')
    hs0 = {
        3: '·、【】!@¥—~……();‘’:“”《》,。?、',
        4: ''' `~!@#$%^&*()_+-={}|:%"<>?[]\;',./×''',
    }
    # 1: Chinese chars, 2: latin letters, 3: Chinese punctuation,
    # 4: latin punctuation, 5: digits, 6: newlines, 7: Chinese-char ratio.
    hs = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0}
    string = file.read()
    with open(f00, 'w', encoding='utf-8') as f:
        for ch in string:
            code = ord(ch)
            if 19968 <= code <= 40869:  # CJK unified ideographs range
                hs[1] += 1
            elif 65 <= code <= 90 or 97 <= code <= 122:
                hs[2] += 1
            elif ch in hs0[3]:
                hs[3] += 1
            elif ch in hs0[4]:
                hs[4] += 1
            elif 48 <= code <= 57:
                hs[5] += 1
            elif ch == '\n':
                hs[6] += 1
            else:
                f.write(ch)  # unexpected character — inspect for mojibake
    hs[7] = hs[1] / (len(string) + 1)  # +1 avoids ZeroDivisionError on empty file
    file.seek(0)
    labels = ['中文', 'english letter', '中文标点符号',
              'english punctuation marks', '数字', '行数', '中文字数占总字符数的比例']
    for i in range(7):
        if i == 6:
            print('{} : {:.2%}'.format(labels[i], hs[i + 1]))
        else:
            print('{} : {:.2f}万'.format(labels[i], hs[i + 1] / 10000))


def main():
    """Run the whole pipeline and report the elapsed wall-clock time."""
    start = time.perf_counter()
    getCatalog()
    missingChapter()
    saveChapter()
    modify()
    end = time.perf_counter()
    print('total time consuming : ', (end - start) // 60, 'minutes', (end - start) % 60, 'seconds')


main()

需要改进的地方:

1. 逐一访问各章节非常耗时;2. 没有完全避开广告信息;3. 笔趣阁网页内容经常缺失,正则表达式未完全适应所有情况;4. 章节序号为中文时,无法匹配。
最新回复(0)