Workflow for scraping a website (a minimal sketch follows the notes below):
1. Identify which URL on the site actually serves the data.
2. Briefly analyze the site structure and locate where the data lives.
3. Check whether the listing is paginated, and work out how to handle the pagination.
4. Send a request and check whether response.text contains the data we want.
5. If the data is there, extract it and save it.
Notes:
When you are just starting out with scraper projects, do not wrap everything in a class. Focus first on solving the core problems, such as finding the data source; worry about encapsulation and code structure later.
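To make the workflow concrete, here is a minimal sketch of those five steps; the URL, the page parameter, and the XPath expression are placeholders rather than a real site:

import requests
from lxml import etree

# Placeholder listing URL with a page parameter (steps 1-3).
base_url = 'https://example.com/words?page=%s'
headers = {'user-agent': 'Mozilla/5.0'}

rows = []
for page in range(1, 4):                                       # step 3: walk the pagination
    response = requests.get(base_url % page, headers=headers)  # step 4: send the request
    html = etree.HTML(response.text)
    rows.extend(html.xpath('//tbody/tr/td/text()'))            # step 5: extract the data
print(rows)                                                    # ...and save it (here we just print)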
I. XPath in practice
(1) Shanbay words project
import requests
from lxml import etree

base_url = 'https://www.shanbay.com/wordlist/110521/232414/?page=%s'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}

def get_value(value):
    # xpath() returns a list; take the first element, or '' if it is empty.
    if value:
        return value[0]
    return ''

result = []
for i in range(1, 4):    # the word list spans three pages
    response = requests.get(base_url % i, headers=headers)
    html = etree.HTML(response.text)
    tr_list = html.xpath('//tbody/tr[@class="row"]')
    for tr in tr_list:
        item = {}
        word = get_value(tr.xpath('.//td[@class="span2"]/strong/text()'))
        mean = get_value(tr.xpath('.//td[@class="span10"]/text()'))
        item[word] = mean
        result.append(item)
print(result)
Encapsulated version:
import requests, json
from lxml import etree

class Shanbei(object):
    def __init__(self, url, headers):
        self.url = url
        self.headers = headers
        self.result = []
        # Scrape and save as soon as the object is constructed.
        self.word_mean()
        self.save_data()

    def get_value(self, value):
        # xpath() returns a list; take the first element, or '' if it is empty.
        if value:
            return value[0]
        return ''

    def word_mean(self):
        for i in range(1, 4):    # the word list spans three pages
            response = requests.get(self.url % i, headers=self.headers)
            html = etree.HTML(response.text)
            tr_list = html.xpath('//tbody/tr[@class="row"]')
            for tr in tr_list:
                item = {}
                word = self.get_value(tr.xpath('.//td[@class="span2"]/strong/text()'))
                mean = self.get_value(tr.xpath('.//td[@class="span10"]/text()'))
                item[word] = mean
                self.result.append(item)

    def save_data(self):
        with open('shanbei_word.json', 'w', encoding='utf-8') as fp:
            json.dump(self.result, fp)

if __name__ == '__main__':
    base_url = 'https://www.shanbay.com/wordlist/110521/232414/?page=%s'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    Shanbei(base_url, headers)
    # Read the file back to verify the dump worked.
    with open('shanbei_word.json', 'r') as fp:
        result = json.load(fp)
    print(result)
(2) NetEase Cloud Music project
Iterable: an object with an __iter__ method.
Iterator: an object with a __next__ method.
Can one be turned into the other?
iter(iterable) returns an iterator.
Which objects are iterable? list, dict, tuple, str, bytes, set, iterators, generators, and file objects. (See the sketch below.)
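To see the conversion in action, a short sketch with a plain list:

nums = [1, 2, 3]
print(hasattr(nums, '__iter__'))    # True: a list is iterable
print(hasattr(nums, '__next__'))    # False: a list is not an iterator

it = iter(nums)                     # iter() turns the iterable into an iterator
print(hasattr(it, '__next__'))      # True
print(next(it))                     # 1
print(next(it))                     # 2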
Printing a file together with line numbers (a file object is itself an iterator, which is why the for loop works):
fp = open('shanbei_word.py', 'r', encoding='utf-8')
print(fp)    # a file object -- itself an iterator
for i, content in enumerate(fp, 1):    # enumerate(fp, 1) numbers the lines from 1
    print(i, content)
fp.close()
Code:
import requests
from lxml import etree

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}

def get_xpath(url):
    # Fetch a page and return it as an lxml element tree.
    response = requests.get(url, headers=headers)
    return etree.HTML(response.text)

def get_info(url, item):
    # Scrape the artist's description page and record the finished item.
    html = get_xpath(url)
    introduce_list = html.xpath('//div[@class="n-artdesc"]/p/text()')
    introduce = ''.join(introduce_list)
    item['introduce'] = introduce
    result.append(item)    # result is the module-level list created under __main__

def get_single(url):
    # Collect every artist name and link on one index page.
    html = get_xpath(url)
    single_names = html.xpath('//ul[@id="m-artist-box"]/li/p/a[1]/text()|//ul[@id="m-artist-box"]/li/a[1]/text()')
    single_urls = html.xpath('//ul[@id="m-artist-box"]/li/p/a[1]/@href|//ul[@id="m-artist-box"]/li/a[1]/@href')
    for i, name in enumerate(single_names):
        item = {}
        item['name'] = name
        item['url'] = 'https://music.163.com' + single_urls[i].replace(' ', '')
        # '/artist?id=...' -> '/artist/desc?id=...' is the description page.
        url = item['url'].replace('?', '/desc?')
        get_info(url, item)

def get_type_page(url):
    # Walk the initial (A-Z) selector of one artist category, skipping the first entry.
    html = get_xpath(url)
    nametype_url_list = html.xpath('//ul[@id="initial-selector"]/li[position()>1]/a/@href')
    for one in nametype_url_list:
        url = 'https://music.163.com' + one
        get_single(url)

base_url = 'https://music.163.com/discover/artist'

def get_type():
    # Walk every artist category in the navigation bar.
    html = get_xpath(base_url)
    localtype_url_list = html.xpath('//ul[@class="nav f-cb"]/li/a[contains(@href,"id")]/@href')
    for one in localtype_url_list:
        url = 'https://music.163.com' + one
        get_type_page(url)

if __name__ == '__main__':
    result = []
    get_type()
    print(result)
Encapsulated version:
import requests, json
from lxml import etree

class Music(object):
    def __init__(self, base_url):
        self.base_url = base_url
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        self.result = []
        self.get_type()

    def get_xpath(self, url):
        response = requests.get(url, headers=self.headers)
        return etree.HTML(response.text)

    def get_type(self):
        html = self.get_xpath(self.base_url)
        localtype_url_list = html.xpath('//ul[@class="nav f-cb"]/li/a[contains(@href,"id")]/@href')
        for one in localtype_url_list:
            url = 'https://music.163.com' + one
            self.get_type_page(url)

    def get_type_page(self, url):
        html = self.get_xpath(url)
        nametype_url_list = html.xpath('//ul[@id="initial-selector"]/li[position()>1]/a/@href')
        for one in nametype_url_list:
            url = 'https://music.163.com' + one
            self.get_single(url)

    def get_single(self, url):
        html = self.get_xpath(url)
        single_names = html.xpath('//ul[@id="m-artist-box"]/li/p/a[1]/text()|//ul[@id="m-artist-box"]/li/a[1]/text()')
        single_urls = html.xpath('//ul[@id="m-artist-box"]/li/p/a[1]/@href|//ul[@id="m-artist-box"]/li/a[1]/@href')
        for i, name in enumerate(single_names):
            item = {}
            item['name'] = name
            item['url'] = 'https://music.163.com' + single_urls[i].replace(' ', '')
            url = item['url'].replace('?', '/desc?')
            self.get_info(url, item)

    def get_info(self, url, item):
        html = self.get_xpath(url)
        introduce_list = html.xpath('//div[@class="n-artdesc"]/p/text()')
        introduce = ''.join(introduce_list)
        item['introduce'] = introduce
        self.result.append(item)

if __name__ == '__main__':
    base_url = 'https://music.163.com/discover/artist'
    m = Music(base_url)
    with open('singer.json', 'w', encoding='utf-8') as fp:
        json.dump(m.result, fp)
II. Anti-scraping measures and countermeasures
Anti-scraping strategies:
1. Identifying the client by its user-agent string.
Countermeasure: put a browser user-agent in the request headers.
2. Judging by access frequency.
Countermeasure: pause for a random interval between requests:
import random, time

n = random.randint(1, 5)    # randint takes both bounds; sleep 1-5 seconds
time.sleep(n)
3. Banning the client's IP.
Countermeasure: route requests through proxy IPs.
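A minimal sketch using the proxies parameter of requests; the proxy address below is a placeholder, not a working proxy:

import requests

# Placeholder proxy address -- substitute a working proxy.
proxies = {
    'http': 'http://10.0.0.1:8888',
    'https': 'http://10.0.0.1:8888',
}
response = requests.get('https://httpbin.org/ip', proxies=proxies)
print(response.text)    # should report the proxy's IP, not ours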
4. The data cannot be read from the page directly because the page is all JS code.
Countermeasure: use selenium + phantomjs to obtain the rendered page data.
selenium + phantomjs
selenium: a tool for automated testing of web applications.
phantomjs: a headless browser; because it actually executes the JS, it can hand us the rendered page data.
Used together, they solve the problem of extracting data from pages built by JS.
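A minimal sketch of that combination, written against the selenium 2.x API installed below (webdriver.PhantomJS was removed from later selenium releases), and assuming the phantomjs executable is on the PATH:

from selenium import webdriver

driver = webdriver.PhantomJS()    # assumes phantomjs is on the PATH
driver.get('https://music.163.com/discover/artist')
print(driver.page_source[:200])   # the page source after the JS has run
driver.quit()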
Download and installation: copy the downloaded phantomjs executable into anaconda's Scripts directory, then run
pip install selenium==2.48.0
For the test tool to drive a real browser you also need ChromeDriver: check your Chrome version, choose the driver whose major version matches and whose minor version is closest, and after unzipping place it in anaconda's Scripts directory as well.
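With ChromeDriver in place, the same idea works against a real Chrome. A sketch assuming chromedriver is on the PATH and your Chrome build supports headless mode (drop the flag otherwise); chrome_options is the keyword used by selenium 2.x:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')    # run Chrome without opening a window
driver = webdriver.Chrome(chrome_options=options)    # chromedriver must be on the PATH
driver.get('https://music.163.com/discover/artist')
print(driver.title)
driver.quit()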
III. Handling dynamic HTML pages
(1) Common page technologies
1. JS
HTML is a page's skeleton, CSS is its decoration, and JS is its soul.
2. jQuery
jQuery is a library that makes JS code much more concise.
3. AJAX
A technique for making asynchronous requests from a web page (see the sketch at the end of this section).
4. DHTML
DHTML is short for Dynamic HTML. It is not a standalone language; in practice, any method that changes a page dynamically can be called DHTML.
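When a page loads its data via AJAX, the scraper can often skip rendering entirely and call the JSON endpoint itself. A minimal sketch; the endpoint URL and the 'artists' field are hypothetical:

import requests

# Hypothetical AJAX endpoint -- find the real one in the browser's Network panel.
api_url = 'https://example.com/api/artists?page=1'
response = requests.get(api_url, headers={'user-agent': 'Mozilla/5.0'})
data = response.json()                 # AJAX endpoints usually return JSON directly
for row in data.get('artists', []):    # 'artists' is an assumed field name
    print(row)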