requests
requests模块在处理爬虫时更加高效、快捷。
基于requests发起get请求
需求:爬取搜狗首页的数据
import requests

# 1. Specify the target URL (the Sogou homepage).
url = 'https://www.sogou.com/'

# 2. Send a GET request; `get` returns the response object.
#    A timeout is required -- without one, requests can block forever
#    if the server never answers.
response = requests.get(url=url, timeout=10)

# 3. `text` gives the response body decoded as a string.
page_data = response.text
# print(page_data)

# 4. Persist the page to disk.
with open('./sougou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_data)
response对象中其他重要属性
import requests

# Specify the target URL.
url = 'https://www.sogou.com/'

# Send a GET request; the timeout keeps a dead server from hanging the script.
response = requests.get(url=url, timeout=10)

# `content` would return the raw binary body instead of decoded text:
# page_data = response.content
# print(page_data)

# HTTP status code of the response (200 on success).
print(response.status_code)
# Response headers.
print(response.headers)
# The URL that was actually requested.
print(response.url)
requests模块如何处理带参数的get请求
方式一 需求:指定一个词条,获取搜狗搜索结果所对应的页面数据
import requests

# Chinese characters in the query string need no manual URL-encoding;
# requests encodes them automatically.
url = 'https://www.sogou.com/web?query=周杰伦&ie=utf8'

# Timeout added so a non-responsive server cannot block forever.
response = requests.get(url=url, timeout=10)

page_data = response.text
# print(page_data)

# Persist the result page to disk.
with open('./zhou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_data)
方式二
import requests

url = 'https://www.sogou.com/web'

# Pack the query parameters into a dict; requests appends and
# URL-encodes them for us.
params = {
    'query': '周杰伦',
    'ie': 'utf-8',
}

response = requests.get(url=url, params=params, timeout=10)

# BUG FIX: the original evaluated `response.status_code` as a bare
# expression and discarded the value; print it so it is visible.
print(response.status_code)
print(response.text)
自定义请求头信息
import requests

url = 'https://www.sogou.com/web'

# Query parameters, URL-encoded by requests.
params = {
    'query': '周杰伦',
    'ie': 'utf-8',
}

# Custom request headers: sending a browser User-Agent instead of the
# default requests UA avoids trivial anti-bot filtering.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

response = requests.get(url=url, params=params, headers=headers, timeout=10)
print(response.status_code)
基于requests发起post请求
- 登录豆瓣 获取登陆成功之后的页面数据
import requests

# url = 'https://accounts.douban.com/login'
url = "https://www.douban.com/accounts/login"

# Form payload for the POST login request.
# NOTE(review): "sourse" looks like a typo of "source", but it is the field
# name the target form presumably expects -- confirm before renaming.
data = {
    "sourse": "movie",
    "redir": "https://movie.douban.com/",
    "form_email": "15027900535",
    "form_password": "bobo@15027900535",
    "login": "登录",
}

# Browser User-Agent to get past basic anti-bot checks.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

# Send the POST request; timeout prevents an indefinite hang.
response = requests.post(url=url, data=data, headers=headers, timeout=10)

# Page data after the login attempt.
page_text = response.text
# print(page_text)

# Persist to disk.
with open('./douban.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
基于ajax的get请求
需求:抓取豆瓣电影上电影详情的数据
import requests

url = 'https://movie.douban.com/j/chart/top_list?'

# Parameters for the AJAX endpoint.
params = {
    "type": "24",
    "interval_id": "100:90",
    "action": "",
    "start": "0",   # offset to start from
    "limit": "10",  # number of records to fetch
}

# Browser User-Agent to get past basic anti-bot checks.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

# This is an AJAX GET request (the original comment mislabeled it as POST).
response = requests.get(url=url, params=params, headers=headers, timeout=10)

# The endpoint returns plain-text JSON, not an HTML page.
page_text = response.text
print(page_text)

# BUG FIX: the body is UTF-8 text; the original wrote it with
# encoding='GBK', which produced the mojibake its own comment
# complained about. Write UTF-8 instead.
with open('./douban1.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print('ok')
基于ajax的post请求
需求:爬取肯德基
import requests

url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'

# Form fields expected by the store-list endpoint.
data = {
    "cname": '',
    'pid': '',
    'keyword': '上海',
    'pageIndex': '1',
    'pageSize': '10',
}

# AJAX POST request; timeout prevents an indefinite hang.
response = requests.post(url=url, data=data, timeout=10)
print(response.text)
综合项目实战
- 需求:爬取搜狗知乎某一词条对应一定页码范围表示的页面数据
import requests
import os

# Create the output folder once, if it does not exist yet.
if not os.path.exists('./pages'):
    os.mkdir('./pages')

# Search term and inclusive page range from the user.
word = input('enter a key-word:')
start_num = int(input('enter start num:').strip())
end_num = int(input('enter end num:').strip())

url = 'http://zhihu.sogou.com/zhihu?'

# Browser User-Agent to get past basic anti-bot checks.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

for page in range(start_num, end_num + 1):
    params = {
        'query': word,
        'ie': 'utf-8',
        'page': page,
    }
    # Timeout so one dead request cannot hang the whole crawl.
    response = requests.get(url=url, params=params, headers=headers, timeout=10)
    page_text = response.text

    # os.path.join is portable, unlike manual '/'-concatenation.
    file_name = word + str(page) + '.html'
    file_path = os.path.join('pages', file_name)
    with open(file_path, 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print('第%d页写入成功' % page)
转载于:https://www.cnblogs.com/yuliangkaiyue/p/9993095.html
相关资源:python requests模块下载