第一个 Python 脚本，用于从笔趣阁小说网站提取一部小说，输出格式上还有点问题。
# -*- coding:UTF-8 -*-
"""Download the chapters of one novel from biqukan.com into local text files.

The script fetches the novel's index page, follows every link whose href
contains ``html`` and saves each linked chapter through :func:`saveText`.
"""
import os
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests

# Default directory the chapter files are written to.
SAVE_DIR = 'C:\\Users\\Administrator\\Desktop\\小说\\'
# Suffix stripped from the page <title> so that only the chapter name remains.
TITLE_SUFFIX = '_一念永恒_修真小说_笔趣阁'
# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 10
# Characters that cannot appear in a Windows file name.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')

# A single chapter page, handy for trying saveText(url) on its own.
url = 'https://www.biqukan.com/1_1094/5428081.html'


def _fetch_soup(page_url):
    """Download *page_url*, decode it as GBK and return the parsed document."""
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Force GBK rather than relying on the encoding requests guesses.
    response.encoding = 'gbk'
    return BeautifulSoup(response.text, 'html.parser')


def _chapter_text(soup):
    """Return the chapter body, one paragraph per line, or None if absent."""
    content = soup.find('div', id='content', class_='showtxt')
    if content is None:
        return None
    lines = []
    for node in content.descendants:
        if isinstance(node, str):
            # Text node: normalise non-breaking spaces (GBK cannot encode
            # them) and drop the indentation/blank lines around paragraphs.
            line = node.replace('\xa0', ' ').strip()
            if line:
                lines.append(line)
        elif node.name == 'script':
            # Keep the original cutoff: nothing from the first <script>
            # onward is saved.
            break
    return '\n'.join(lines)


def _chapter_file_name(soup):
    """Build a file-system-safe ``<chapter title>.txt`` name from the page."""
    title_tag = soup.title
    raw_title = title_tag.get_text() if title_tag is not None else ''
    name = raw_title.replace(TITLE_SUFFIX, '').strip()
    name = _INVALID_FILENAME_CHARS.sub('_', name)
    return (name or 'untitled') + '.txt'


def saveText(url, save_dir=SAVE_DIR):
    """Download one chapter page and save its text as ``<chapter title>.txt``.

    Args:
        url: Address of the chapter page.
        save_dir: Directory the file is written to; created when missing.

    Returns:
        The path of the written file, or ``None`` when the page contains no
        chapter content block and nothing was saved.

    Raises:
        requests.RequestException: If the page cannot be downloaded.
        OSError: If the output file cannot be created or written.
    """
    soup = _fetch_soup(url)
    text = _chapter_text(soup)
    if text is None:
        print('skipped (no chapter content): ' + url)
        return None

    os.makedirs(save_dir, exist_ok=True)
    path = os.path.join(save_dir, _chapter_file_name(soup))
    print(path)
    # GBK cannot represent every character; substitute the few it cannot
    # encode instead of silently ending up with an empty file.
    with open(path, mode='w', encoding='gbk', errors='replace') as f:
        f.write(text)
    return path


# NOTE: the crawl runs at module level, as in the original script, so
# importing this module starts the download.
target = 'https://www.biqukan.com/1_1094/'
soup = _fetch_soup(target)
seen = set()
for link in soup.find_all('a'):
    href = link.get('href')
    if not href or 'html' not in href:
        continue
    # Resolve against the index page so relative and absolute hrefs both work.
    chapter_url = urljoin(target, href)
    if chapter_url in seen:
        # The same chapter can be linked more than once; fetch it only once.
        continue
    seen.add(chapter_url)
    print(chapter_url)
    try:
        saveText(chapter_url)
    except requests.RequestException as exc:
        # One unreachable chapter should not abort the rest of the crawl.
        print('failed to download {}: {}'.format(chapter_url, exc))