First Steps with Python Web Scraping


This is just a record of my own process of learning web scraping, so it may be a bit unstructured. I used Python 3.x, with PyCharm as the IDE.

Here is the collection of code. I'm keeping it so that if I forget something later I can look it up easily.

import requests

# Send requests with different HTTP methods to get a response
r = requests.get('https://api.github.com/events')
r = requests.post('http://httpbin.org/post', data={'key': 'value'})
r = requests.put('http://httpbin.org/put', data={'key': 'value'})
r = requests.delete('http://httpbin.org/delete')
r = requests.head('http://httpbin.org/get')
r = requests.options('http://httpbin.org/get')

payload = {'key1': 'value1', 'key2': 'value2'}
r = requests.get('http://httpbin.org/get', params=payload)
# The params argument takes a dict and adds it to the query string
# Keys whose value is None are not added to the query string

# Pass a list of items as a value
payload = {'key1': 'value1', 'key2': ['value2', 'value3']}
r = requests.get('http://httpbin.org/get', params=payload)
print(r.url)
# Output: http://httpbin.org/get?key1=value1&key2=value2&key2=value3

r.text      # The text, decoded with the encoding guessed by Requests
r.encoding  # Find out what encoding Requests is using;
            # you can change it whenever you need a different one
r.content   # Inspect the raw bytes to work out what the encoding should be
r.encoding  # Set the encoding as you need
r.text      # Then r.text is decoded with the correct encoding
r.content   # Access the response body as bytes (binary response content)
r.json()
# It should be noted that a successful call to r.json() does not indicate
# the success of the response. To check that a request is successful,
# use r.raise_for_status() or check that r.status_code is what you expect.

r = requests.get('https://api.github.com/events', stream=True)
# If you'd like to get the raw socket response, make sure you have set
# the stream parameter to True
r.raw           # The raw socket response
r.raw.read(10)

# To save what is being streamed to a file
with open(filename, 'wb') as fd:
    for chunk in r.iter_content(chunk_size):
        fd.write(chunk)

# We didn't specify our user-agent in the previous example.
# If you'd like to add HTTP headers to a request,
# simply pass in a dict to the headers parameter
url = 'https://api.github.com/some/endpoint'
headers = {'user-agent': 'my-app/0.0.1'}
r = requests.get(url, headers=headers)
# Note: all header values must be a string, bytestring, or unicode.
# While permitted, it's advised to avoid passing unicode header values.

# To send some form-encoded data,
payload = {'key1': 'value1', 'key2': 'value2'}  # the form
r = requests.post('http://httpbin.org/post', data=payload)
# set the data parameter to the dict you defined
>>> print(r.text)
{
  ...
  "form": {
    "key2": "value2",
    "key1": "value1"
  },
  ...
}
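The streaming example above leaves filename and chunk_size as placeholders. Here is a minimal, self-contained sketch of the same idea; the output name events.json and the 8192-byte chunk size are my own assumptions, not values from the original code.

import requests

# Sketch of streaming a response to disk; the output file name and the
# chunk size are arbitrary example choices.
url = 'https://api.github.com/events'
with requests.get(url, stream=True) as r:
    r.raise_for_status()                        # fail early on a non-2xx status
    with open('events.json', 'wb') as fd:
        for chunk in r.iter_content(chunk_size=8192):
            fd.write(chunk)                     # write each chunk as it arrives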
# The GitHub API v3 accepts JSON-encoded POST/PATCH data
import json

url = 'https://api.github.com/some/endpoint'
payload = {'some': 'data'}
r = requests.post(url, data=json.dumps(payload))

# Download an image
from urllib import request

url = 'http://imgpoobbs.b0.upaiyun.com/uploadfile/photo/2016/8/201608051206091841435218.jpg!photo.middle.jpg'
Request = request.urlopen(url)  # Send the request
Response = Request.read()       # Read the returned result
f = open('1.png', 'wb')         # Create an image file
f.write(Response)               # Write the response into the file f
f.close()                       # Close the file

# The data parameter of urlopen
import urllib.request
import urllib.parse

data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
# bytes() converts a string to a byte stream: the first argument is a string
# (here urllib.parse.urlencode() turns the dict into a string), and the
# second argument is the encoding
response = urllib.request.urlopen('http://www.python.org', data=data)
# Passing the extra data parameter makes the request a POST
# print(type(response))
print(response.read())

# The timeout parameter of urlopen
import urllib.request
import urllib.error
import socket

try:
    # try...except lets us skip a page that takes too long to respond
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
    # If the server has not responded by the time the timeout expires,
    # a URLError is raised
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')

# Signature of urllib.request.Request:
# class urllib.request.Request(url, data=None, headers={},
#                              origin_req_host=None, unverifiable=False,
#                              method=None)
# The headers={} parameter holds the request headers. Their most common use
# is to spoof a browser by changing the User-Agent. The default User-Agent is
# Python-urllib; to pretend to be Firefox, for example, you can set it to
# Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11

# Using urllib.request.Request
from urllib import request, parse

url = 'http://httpbin.org/post'
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
           'Host': 'httpbin.org'}
dict = {'name': 'Germey'}
data = bytes(parse.urlencode(dict), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
# request.Request() accepts more parameters than urlopen does
response = request.urlopen(req)  # Then send it with urlopen() to get the response
print(response.read().decode('utf-8'))

# Handler
import urllib.request

auth_handler = urllib.request.HTTPBasicAuthHandler()
# Instantiate an HTTPBasicAuthHandler object
auth_handler.add_password(realm='PDQ Application',
                          uri='http://mahler:8092/site-updates.py',
                          user='klem',
                          passwd='kadidd!ehopper')
# Add the username and password to this object, which effectively builds a
# handler that takes care of authentication
opener = urllib.request.build_opener(auth_handler)
# build_opener uses this handler to build an Opener; requests sent through
# this Opener then carry the authentication
urllib.request.install_opener(opener)
# Install the opener so the authentication is applied
urllib.request.urlopen('http://www.example.com/login.html')

# Adding a proxy
import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://218.202.111.10:80',
    'https': 'http://180.250.163.34:8888'
})  # The argument is a dict: keys are protocols, values are proxy URLs
opener = urllib.request.build_opener(proxy_handler)
# Build an Opener from this Handler
response = opener.open('https://www.baidu.com')
# Send the request
print(response.read())
# 1. Handler  2. build_opener  3. open

# Exception handling
# error.URLError
from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.URLError as e:
    # Exceptions raised by request are caught by URLError
    print(e.reason)  # The reason attribute of URLError gives the cause of the error

# error.HTTPError
from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    # Three attributes: code is the status code, headers is the response headers
    print(e.reason, e.code, e.headers)
except error.URLError as e:
    # URLError is the parent class of HTTPError, so catch the subclass first
    # and only fall back to URLError if the error is not an HTTPError
    print(e.reason)
else:
    print('Request Successfully')

from urllib import request, error
import socket

try:
    response = request.urlopen('http://www.baidu.com', timeout=0.01)
except error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
# Output: <class 'socket.timeout'>
#         TIME OUT
# reason does not always return a string; here e.reason is a socket.timeout
# object rather than a string
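Putting the error-handling pattern above into a small helper makes it reusable across a crawl. The following is only a sketch of my own; the function name fetch and the 10-second default timeout are arbitrary choices, not part of the original notes.

from urllib import request, error
import socket


def fetch(url, timeout=10):
    # Sketch of a reusable wrapper around urlopen using the pattern above;
    # the name and default timeout are arbitrary assumptions.
    try:
        response = request.urlopen(url, timeout=timeout)
        return response.read()
    except error.HTTPError as e:
        # Catch the subclass first: HTTP status errors carry code and headers
        print('HTTP error:', e.code, e.reason)
    except error.URLError as e:
        # Fall back to the parent class for network-level failures
        if isinstance(e.reason, socket.timeout):
            print('TIME OUT')
        else:
            print('URL error:', e.reason)
    return None


# Example use
html = fetch('http://httpbin.org/get')
if html is not None:
    print(len(html), 'bytes')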
# Parsing URLs
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
# API of urlparse: urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)
print(type(result), result)
# Output: <class 'urllib.parse.ParseResult'>
# ParseResult is a tuple:
# ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html',
#             params='user', query='id=5', fragment='comment')
# The standard URL format is scheme://netloc/path;parameters?query#fragment

# urlunparse
from urllib.parse import urlunparse

data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))
# The argument is an iterable of 6 items (the six parts of a standard URL)

# urlsplit
from urllib.parse import urlsplit

result = urlsplit('http://www.baidu.com/index.html;user?id=5#comment')
print(result)
# Output: SplitResult(scheme='http', netloc='www.baidu.com',
#                     path='/index.html;user', query='id=5', fragment='comment')
# Same as urlparse except that it does not return params separately and
# folds it into path instead
# urlunsplit works like urlunparse, except that it takes only 5 parts (no params)

# Joining URLs with urljoin
from urllib.parse import urljoin

print(urljoin('http://www.baidu.com', 'FAQ.html'))
print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'http://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html?question=2'))
print(urljoin('http://www.baidu.com?wd=abc', 'https://cuiqingcai.com/index.php'))
print(urljoin('http://www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu,com', '?categoty=2#comment'))
print(urljoin('www.baidu.com#comment', '?category=2'))
# Given two URLs, the first is the base_url, which supplies scheme, netloc
# and path; whichever of these three parts the second URL lacks is filled in
# from the base_url
# Output: http://www.baidu.com/FAQ.html
# https://cuiqingcai.com/FAQ.html
# http://cuiqingcai.com/FAQ.html
# https://cuiqingcai.com/FAQ.html?question=2
# https://cuiqingcai.com/index.php
# http://www.baidu.com?category=2#comment
# www.baidu,com?categoty=2#comment
# www.baidu.com?category=2

# The robots protocol
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()  # Create an instance of the class
rp.set_url('http://www.jianshu.com/robots.txt')
rp.read()
# Read robots.txt and parse it; this method must be called, otherwise
# nothing is read!
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
# Check whether the URL can be fetched
print(rp.can_fetch('*', 'http://www.jianshu.com/search?q=python&page=1&type=collections'))
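To show how these utilities combine in a tiny crawl step, here is a sketch of my own (not part of the original notes): it resolves URLs with urljoin, checks robots.txt with RobotFileParser, and only then downloads the page. The site, the paths, and the 10-second timeout are arbitrary example choices.

from urllib import request
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

# Sketch only: the site, the page path and the timeout are arbitrary examples
base = 'http://www.jianshu.com'

rp = RobotFileParser()
rp.set_url(urljoin(base, '/robots.txt'))   # resolve the robots.txt location
rp.read()                                  # fetch and parse robots.txt

page = urljoin(base, '/p/b67554025d7d')    # resolve a relative path against the base
if rp.can_fetch('*', page):                # honour the robots rules before fetching
    html = request.urlopen(page, timeout=10).read()
    print(len(html), 'bytes downloaded from', page)
else:
    print('Disallowed by robots.txt:', page)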

Before all of this I read up on some basics of the HTTP protocol, which I won't go over here.

The material here follows Python3WebSpider, which should also be easy to find online.

If you need to get in touch, email dadozsama@163.com.

Reposted from: https://www.cnblogs.com/Tessiedoupu/p/5805951.html
