"""Download the pictures from a range of qiushibaike.com picture pages.

The script asks for a first and last page number, fetches each listing
page, extracts every thumbnail's image URL and ``alt`` text with a regular
expression, and saves each image into the ``qiutu`` directory using the
``alt`` text as the file name.
"""
import os
import re
import time
import urllib.parse
import urllib.request

# Listing-page URL template; ``{}`` is replaced with the page number.
url = 'https://www.qiushibaike.com/pic/page/{}/'
# Browser-like User-Agent sent with every listing-page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}
# Directory (relative to the working directory) that receives the images.
dirname = 'qiutu'
# Compiled once instead of once per page.  re.S lets ``.`` match newlines,
# which is required because each thumbnail block spans several lines.
# Group 1 is the image ``src``, group 2 is its ``alt`` text.
pattern = re.compile(
    r'<div class="thumb">.*?<img src="(.*?)" alt="(.*?)" />.*?</div>', re.S
)

# Characters that are illegal in file names on at least one major platform
# (path separators, Windows-reserved punctuation, control characters).
_UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')
# Keeps multi-byte (e.g. CJK) names below the usual 255-byte name limit.
MAX_NAME_LENGTH = 80


def extract_images(content):
    """Return ``(image_url, image_name)`` pairs found in a listing page.

    ``content`` is the decoded HTML of one listing page.  Protocol-relative
    ``src`` values (``//host/path``) are completed with ``https:``; values
    that already carry a scheme are returned unchanged.
    """
    images = []
    for src, alt in pattern.findall(content):
        image_url = 'https:' + src if src.startswith('//') else src
        images.append((image_url, alt))
    return images


def build_filename(image_name, image_url):
    """Build a file-system-safe file name for a downloaded image.

    The stem is ``image_name`` with unsafe characters replaced by ``_``,
    stripped of surrounding spaces and dots, and truncated to
    ``MAX_NAME_LENGTH`` characters.  If nothing usable remains, the stem of
    the URL's last path component is used instead.  The extension is taken
    from the URL *path*, so a query string never leaks into the name.
    """
    url_path = urllib.parse.urlsplit(image_url).path
    url_stem, extension = os.path.splitext(os.path.basename(url_path))
    safe_name = _UNSAFE_CHARS.sub('_', image_name).strip(' .')
    safe_name = safe_name[:MAX_NAME_LENGTH] or url_stem or 'image'
    return safe_name + extension


def download_page(page):
    """Fetch listing page number ``page`` and download all of its images."""
    page_url = url.format(page)
    request = urllib.request.Request(url=page_url, headers=headers)
    # The context manager closes the connection even if decoding fails.
    with urllib.request.urlopen(request) as response:
        print(response)
        content = response.read().decode('utf8')

    for image_url, image_name in extract_images(content):
        filename = build_filename(image_name, image_url)
        print('正在下载--%s--...' % filename)
        filepath = os.path.join(dirname, filename)
        urllib.request.urlretrieve(image_url, filepath)
        print('结束下载--%s--' % filename)
        # Pause between downloads to avoid hammering the server.
        time.sleep(2)


def main():
    """Prompt for the page range and crawl every page in it (inclusive)."""
    start_page = int(input('请输入起始页码-'))
    end_page = int(input('请输入结束页码-'))

    # urlretrieve does not create missing directories.
    os.makedirs(dirname, exist_ok=True)

    for page in range(start_page, end_page + 1):
        print('正在爬取第--%s--页......' % page)
        download_page(page)
        print('结束爬取第--%s--页...' % page)
        time.sleep(2)


if __name__ == '__main__':
    main()
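# Single-line (DOTALL) mode: the text being extracted contains line breaks,
# so re.S is required; without it findall may return an empty list.
# `.*?` matches and discards text that is not needed; `(.*?)` captures the
# text that should be kept.
# Note: the markup shown by the browser's right-click "Inspect" view may
# differ from the raw page source, which makes the pattern fail to match --
# for example a tag may end with a `/` in the source that Inspect does not show.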
# Please cite the original source when reposting: https://mac.8miu.com/read-494978.html