文章目录
前言 · 爬虫概要 · 解析 · 代码示例 · 数据存储
Github地址:https://github.com/pasca520/Python3SpiderSet
前言
关于整理日常练习的一些爬虫小练习,可用作学习使用。
爬取项目以学习为主,尽可能使用更多的模块进行练习,而不是最优解。
爬虫概要
示例python 库
爬取模块:requests;解析模块:BeautifulSoup;存储类型:list(方便存入数据库)
解析
BeautifulSoup参数我整理的一篇文章:https://blog.csdn.net/qinglianchen0851/article/details/102860741
代码示例
import requests
from requests.exceptions import ReadTimeout, ConnectionError, RequestException
from bs4 import BeautifulSoup
def get_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Page URL to fetch.
        timeout: Seconds before the request is aborted. New keyword with a
            default, so existing callers are unaffected; without it the
            ``ReadTimeout`` handler below could never actually fire.

    Returns:
        The response text on success, or ``None`` when a network error
        occurs (the error is printed instead of being raised).
    """
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        # NOTE(review): Referer points at maoyan.com although this script
        # scrapes movie.douban.com — looks copied from another spider; the
        # value is kept as-is to preserve behavior, but confirm it is wanted.
        'Referer': 'https://maoyan.com/board',
    }
    try:
        return requests.get(url=url, headers=headers, timeout=timeout).text
    except ReadTimeout:
        print('Timeout')
    except ConnectionError:
        print('Connect error')
    except RequestException:
        print('Error')
def parse_page(html):
    """Parse one Douban Top250 HTML page and accumulate movie records.

    Each movie found on the page is converted to a dict and its ``str()``
    form is appended to the module-level ``douBanList`` (defined in the
    ``__main__`` block), which is also returned.

    Args:
        html: HTML source of one Top250 page (25 movies).

    Returns:
        The shared ``douBanList`` with this page's movies appended.
    """
    soup = BeautifulSoup(html, 'lxml')
    grid = soup.find(name="ol", attrs={"class": "grid_view"})
    movie_list = grid.find_all("li")
    for movie in movie_list:
        rank = movie.find(name="em").getText()
        name = movie.find(name="span", attrs={"class": "title"}).getText()
        rating_num = movie.find(name="span", attrs={"class": "rating_num"}).getText()
        # The <p> element packs director / actor / year / country / genre
        # into one blob; normalize the separators to '\n' and split.
        bd = (movie.find(name="p").getText()
              .strip()
              .replace(' ', '\n')
              .replace('...\n ', '...\n')
              .replace(' / ', '\n')
              .split('\n'))
        if len(bd) == 4:
            # Actor field was truncated away ('...'); pad so indices line up.
            bd.insert(1, '没爬到')
        inq = movie.find(name="span", attrs={"class": "inq"})
        if not inq:
            inq = "暂无"
        else:
            inq = inq.getText()
        # Build a fresh dict per movie instead of mutating one shared global
        # dict: reusing a single dict risks stale keys leaking between
        # iterations. The stored str(...) output is identical either way.
        record = {}
        record['rank'] = rank
        record['name'] = name
        record['director'] = bd[0]
        record['actor'] = bd[1]
        record['release_time'] = bd[2].strip()
        record['country'] = bd[3]
        record['movie_types'] = bd[4]
        record['rating_num'] = rating_num
        record['inq'] = inq
        douBanList.append(str(record))
    return douBanList
if __name__ == '__main__':
    # Module-level accumulators mutated by parse_page().
    douBanList = []
    douBanDict = {}
    # Douban Top250 is paginated 25 movies per page: start = 0, 25, ..., 225.
    for start in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
        html = get_page(url)
        douBanList = parse_page(html)
    print(douBanList)
数据存储
结果为列表格式,其中每个元素是包含单部电影信息的字典(以字符串形式存入列表)。
done!