初学python
这几天在学习 Python 的用法，于是通过写一个爬虫来熟悉 Python。我爬取的是起点中文网的小说。
爬虫基本步骤
示意图 ①首先模拟浏览器向目标网址发送 GET 请求，让服务器返回 HTML 源码 ②然后通过正则表达式把小说的名字和各章节的内容从 HTML 源码中提取出来 ③把提取出来的内容用 replace 函数清洗干净 ④把内容写入文件
代码如下
import requests
import re
# Scrape one novel from qidian.com: fetch the catalogue page, pull the book
# title and the chapter links out of the HTML with regular expressions, then
# download every chapter and append its cleaned text to "<title>.txt".

url = 'https://book.qidian.com/info/1015056750#Catalog'

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10


def fetch_html(page_url):
    """Return the body of *page_url* decoded as UTF-8."""
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    return response.content.decode('utf-8')


def extract_title(html):
    """Return the text between 《 and 》 that follows the opening <title> tag.

    Raises:
        ValueError: if *html* contains no such title, so that the script does
            not silently create a file with a meaningless name.
    """
    match = re.search(r'<title>《(.*?)》', html)
    if match is None:
        raise ValueError('book title not found in catalogue page')
    return match.group(1)


def extract_chapters(html):
    """Return a list of (chapter_title, chapter_href) pairs from *html*.

    Only the contents of ``<ul class="cf">`` lists are searched. Titles are
    the text between 】 and the next ``</a>``; hrefs are every ``href="..."``
    value. The two sequences are paired by position.
    """
    titles = []
    hrefs = []
    for section in re.findall(r'<ul class="cf">(.*?)</ul>', html, re.S):
        titles.extend(re.findall(r'】(.*?)</a>', section, re.S))
        hrefs.extend(re.findall(r'href="(.*?)"', section, re.S))
    # NOTE(review): positional pairing assumes every link in the list has a
    # title containing 】 -- confirm against the live page markup.
    return list(zip(titles, hrefs))


def clean_chapter_content(chapter_html):
    """Return the chapter text from *chapter_html* with layout noise removed.

    The text is taken from the ``read-content j_readContent`` div. ASCII
    spaces, ideographic spaces (U+3000), newlines and ``<p>`` tags are
    stripped from the text itself rather than from the ``str()`` of a list,
    so digits such as "3000" in the prose survive and no list-repr residue
    ends up in the output.
    """
    matches = re.findall(
        r'<div class="read-content j_readContent">(.*?)</div>',
        chapter_html,
        re.S,
    )
    text = ''.join(matches)
    for junk in (' ', '\u3000', '\n', '<p>'):
        text = text.replace(junk, '')
    return text


def main():
    """Download the whole novel at ``url`` into ``<title>.txt``.

    Prints the book title first and then each chapter title as it is written.
    """
    html = fetch_html(url)
    title = extract_title(html)
    print(title)
    # The context manager guarantees the file is flushed and closed even if a
    # chapter download fails half-way through.
    with open('%s.txt' % title, 'w', encoding='utf-8') as f:
        for chapter_title, chapter_url in extract_chapters(html):
            # The hrefs on the catalogue page are used without a scheme, so
            # one is prepended here.
            chapter_url = "http:%s" % chapter_url
            chapter_content = clean_chapter_content(fetch_html(chapter_url))
            f.write(chapter_title)
            f.write('\n')
            f.write(chapter_content)
            f.write('\n')
            print(chapter_title)


if __name__ == '__main__':
    main()
运行结果