爬虫技术：爬取今日头条数据-崔庆才思路

mac2022-06-30 91

爬虫技术：爬取今日头条数据-崔庆才思路

一. urllib库中将字典转化为url的查询参数

二.请求异常的处理，以及内部的判断逻辑

　　1.返回的json数据为空：原因是requests的请求对象没有加请求头和cookies

　　2.正常获得数据

import requests
from urllib.parse import urlencode

# Request headers and cookies sent with every call. They live at module level
# so get_page_index() also works when this file is imported, not only when it
# is run as a script.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
}
cookies = {
    "Cookie": "tt_webid=6719272225969096196; WEATHER_CITY=北京; tt_webid=6719272225969096196; csrftoken=b28e41c77cd4f268af393de7d3e9d47a; UM_distinctid=16c4159a9ae7e3-04be696c185f6c-3f385c06-1fa400-16c4159a9afa94; CNZZDATA1259612802=1303724616-1564459685-https%3A%2F%2Fwww.toutiao.com%2F|1564459685; WIN_WH=1536_710; s_v_web_id=e588fb5c6570d79a16b67e84decce3d8; __tasessionId=y99fyeyyt1568882979794"
}


def get_page_index():
    """Request the first page of Toutiao search results for "街拍" and print it.

    The query parameters are turned into a URL query string with
    ``urlencode`` and appended to the search API endpoint. The response body
    is printed only when the server answers with HTTP 200; nothing is
    returned.
    """
    data = {
        "aid": "24",
        "app_name": "web_search",
        "offset": "0",
        "format": "json",
        "keyword": "街拍",
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1568883030289",
    }
    url = "https://www.toutiao.com/api/search/content/?" + urlencode(data)
    response = requests.get(url, headers=headers, cookies=cookies)
    if response.status_code == 200:
        print(response.content.decode("utf-8"))


if __name__ == '__main__':
    get_page_index()

# Result:
# {"count":20,"return_count":20,"query_id":"6537385837821170952","has_more":1,"request_id":"20190919170856010017031149086E0FC","search_id":"20190919170856010017031149086E0FC","cur_ts":1568884136,"offset":20,"message":"success","pd":"synthesis","show_tabs":1,"keyword":"街拍","city":"西安","tokens":["街拍"],"log_pb":{"impr_id":"20190919170856010017031149086E0FC"},"data":[{"ala_src":"user","app_info":{"query_type":"AladdinRpcQueryType"},"cell_type ... (remainder omitted)

四：

图片地址位置定位：要先请求这个网址，获得响应后解析出对应的 image_url

　　解析报错：SyntaxError: Non-UTF-8 code starting with '\xe5'，在程序上方添加 # -*- coding:utf-8 -*-

　　json中的键值对，期望用双引号而不是单引号。原因是正则错误：

五：完整的代码

# -*- coding:utf-8 -*-
"""Crawl Toutiao search results for a keyword.

For every article found, the gallery images are downloaded into the current
working directory and the gallery metadata is stored in MongoDB.
"""
import json
import os
import re
from hashlib import md5
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

# Expected to provide MONGO_URL, MONGO_DB, MONGO_TABLE, START_PAGE, END_PAGE.
from config import *

_SEARCH_API = "https://www.toutiao.com/api/search/content/?"
_REQUEST_TIMEOUT = 10  # seconds; a stalled connection must not hang the crawl
# Key point 6: re.S lets "." match newlines, so the capture may span lines.
_GALLERY_PATTERN = re.compile(r'gallery: JSON\.parse\("(.*?)"\),', re.S)

# Placeholders: fill in a real User-Agent and cookie string before running.
# Defined at module level so the functions also work when imported.
headers = {"User-Agent": "xx"}
cookies = {"Cookie": "xx"}

# Connection object for the database server.
client = pymongo.MongoClient(MONGO_URL)
# Handle of the database that receives the results.
db = client[MONGO_DB]


def _fetch(url):
    """Return the response for *url*, or None on a request error or non-200 status."""
    try:
        # Key points 2 and 4: RequestException is the base class of every
        # requests error (timeouts included), so one handler covers them all.
        response = requests.get(
            url, headers=headers, cookies=cookies, timeout=_REQUEST_TIMEOUT
        )
    except RequestException:
        print("请求出错")
        return None
    if response.status_code != 200:
        return None
    return response


def _fetch_text(url):
    """Return the UTF-8 decoded body of *url*, or None when the fetch failed."""
    response = _fetch(url)
    if response is None:
        return None
    # Undecodable bytes are replaced instead of aborting the whole crawl.
    return response.content.decode("utf-8", errors="replace")


def get_page_index(offset, keyword):
    """Fetch one page of search results.

    Args:
        offset: Result offset sent to the search API.
        keyword: Search keyword.

    Returns:
        The response body as text, or None when the request failed or the
        status code was not 200.
    """
    data = {
        "aid": "24",
        "app_name": "web_search",
        "offset": offset,
        "format": "json",
        "keyword": keyword,
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1568883030289",
    }
    # Key point 1: urlencode() turns {"a": "1", "b": "2"} into "a=1&b=2".
    return _fetch_text(_SEARCH_API + urlencode(data))


def parse_page_index(html):
    """Yield the article URL of every search result in *html*.

    Args:
        html: JSON text as returned by get_page_index().

    Yields:
        Each non-empty "article_url" found under the top-level "data" list.
        Nothing is yielded when the text is not valid JSON or has no results.
    """
    try:
        data = json.loads(html)
    except (json.JSONDecodeError, TypeError):
        return
    if not isinstance(data, dict):
        return
    # "data" may be missing or null; treat both as "no results".
    for item in data.get("data") or []:
        # Key point 3: dict.get() returns None instead of raising KeyError.
        url = item.get("article_url") if isinstance(item, dict) else None
        if url:
            yield url


def get_page_detial(url):
    """Fetch an article page.

    Returns:
        The page body as text, or None when the request failed or the status
        code was not 200.
    """
    return _fetch_text(url)


def _decode_gallery(raw):
    """Decode the escaped text captured from ``JSON.parse("...")``.

    Returns the parsed object, or None when the text cannot be decoded.
    """
    try:
        # The capture is the body of a string literal that itself holds JSON:
        # undo the string-literal escaping first, then parse the JSON text.
        return json.loads(json.loads('"' + raw + '"'))
    except json.JSONDecodeError:
        pass
    try:
        # Fallback: the cruder clean-up of dropping every backslash and
        # restoring the escaped slashes.
        return json.loads(raw.replace("\\", "").replace("u002F", "/"))
    except json.JSONDecodeError:
        return None


def parse_page_detial(html, url):
    """Extract the gallery of an article page and download its images.

    Args:
        html: Page text as returned by get_page_detial().
        url: Address of the page, stored alongside the result.

    Returns:
        A dict with "title", "images" (list of image URLs) and "url", or
        None when the page contains no decodable gallery.
    """
    soup = BeautifulSoup(html, "lxml")
    # Key point 5: CSS selectors on the soup; a page without <title> gets "".
    title_tag = soup.select_one("title")
    title = title_tag.get_text() if title_tag is not None else ""

    match = _GALLERY_PATTERN.search(html)
    if match is None:
        return None
    gallery = _decode_gallery(match.group(1))
    if not isinstance(gallery, dict) or "sub_images" not in gallery:
        return None

    images = [
        item.get("url")
        for item in gallery.get("sub_images") or []
        if isinstance(item, dict) and item.get("url")
    ]
    for img in images:
        download(img)
    return {"title": title, "images": images, "url": url}


def save_to_mongo(ret_dict):
    """Insert *ret_dict* into the configured collection.

    Returns:
        True when the document was inserted, False otherwise.
    """
    try:
        # Key point 8: connection settings come from the config module.
        # insert_one() replaces Collection.insert(), which PyMongo 4 removed.
        result = db[MONGO_TABLE].insert_one(ret_dict)
    except pymongo.errors.PyMongoError:
        print("插入数据到数据库失败")
        return False
    if result.inserted_id is not None:
        print("插入数据到数据库成功", ret_dict["title"])
        return True
    return False


def download(url):
    """Download the image at *url* and save it through saveimg()."""
    print("正在下载图片", url)
    response = _fetch(url)
    if response is not None:
        saveimg(response.content)
    return None


def saveimg(content):
    """Write image bytes to "<cwd>/<md5 of content>.jpg" unless it already exists."""
    # Key point 9: naming the file after the MD5 of its bytes means identical
    # images map to the same path and are written only once.
    file_path = "{0}/{1}.{2}".format(os.getcwd(), md5(content).hexdigest(), "jpg")
    # Key point 10: os.path.exists() guards against rewriting the same file.
    if not os.path.exists(file_path):
        with open(file_path, "wb") as f:
            f.write(content)


def main():
    """Crawl every result page from START_PAGE to END_PAGE in steps of 20."""
    keyword = "街拍"
    for offset in range(START_PAGE, END_PAGE, 20):
        index_html = get_page_index(offset, keyword)
        if not index_html:
            continue
        for url in parse_page_index(index_html):
            detail_html = get_page_detial(url)
            if not detail_html:
                continue
            ret = parse_page_detial(detail_html, url)
            if ret:
                save_to_mongo(ret)


if __name__ == '__main__':
    main()

# Trial-run note from the write-up: crawling every "街拍" result raised
# json.decoder.JSONDecodeError, so the code needed hardening against bad
# payloads; the JSON parsing in this listing is guarded for that reason.

六：知识点总结

urlencode是从urllib.parse中的一个方法：将字典变成url的查询参数

from urllib.parse import urlencode

# urlencode() serialises a dict into "key=value" pairs joined by "&".
data = {"a": 1, "b": 2}
url = "http://www.baidu.com/?"
print(url + urlencode(data))
# Output:
# http://www.baidu.com/?a=1&b=2

md5加密的不一致问题

一直以来都是用 hashlib 中的 md5 进行加密：md5.update(二进制)、md5.hexdigest()。可能会出现对相同的字符串进行加密、加密结果却不一样的问题，看来是 update 方法造成的。

from hashlib import md5

# One hash object shared by every iteration: this reproduces the pitfall.
fp = md5()
demo = ["1", "1", "3", "3"]
for i in demo:
    fp.update(i.encode("utf-8"))
    print(fp.hexdigest())
# Output:
# c4ca4238a0b923820dcc509a6f75849b
# 6512bd43d9caa6e02c990b0a82652dca
# 73278a4a86960eeb576a8fd4c9ec6997
# fd06b8ea02fe5b1c2496fe1700e9d16c
#
# Why: md5.update() appends the new bytes to everything fed in before, so the
# digests above are those of "1", "11", "113" and "1133" -- a different input
# each time, hence a different result each time. To hash equal content to an
# equal digest, create a fresh md5 object before each item. This never showed
# up earlier because the hashing used to sit inside a function, and every call
# built a new md5 object; only the loop exposes it. The code above can be
# rewritten as follows.

from hashlib import md5

demo = ["1", "1", "3", "3"]

for i in demo:
    # A new hash object per item, so each digest covers that item alone.
    fp = md5()
    fp.update(i.encode("utf-8"))
    print(fp.hexdigest())

# 结果为：

c4ca4238a0b923820dcc509a6f75849b
c4ca4238a0b923820dcc509a6f75849b
eccbc87e4b5ce2fe28308fd9f2a7baf3
eccbc87e4b5ce2fe28308fd9f2a7baf3

for i in demo:
    # This form works too: md5(...) builds a new hash object on every pass.
    print(md5(i.encode("utf-8")).hexdigest())
# Output:
# c4ca4238a0b923820dcc509a6f75849b
# c4ca4238a0b923820dcc509a6f75849b
# eccbc87e4b5ce2fe28308fd9f2a7baf3
# eccbc87e4b5ce2fe28308fd9f2a7baf3
#
# Reading the source did not make the real intent of update() clear to the
# author; it only says the object is updated with the string. To be revisited.

os模块的使用方法

os的基本用法

1. os.getcwd()：查看当前所在路径。

import os
current_path = os.getcwd()
print(current_path)
# 运行结果
# C:\Users\Administrator\AppData\Roaming\Sublime Text 3\Packages\User

2. os.listdir(path)：列举目录下的所有文件，返回的是列表类型。

dir_list = os.listdir(current_path)
print(dir_list)
# 运行结果
# ['11.py', 'cuiqingcai.py', 'Localization.sublime-settings', 'oscrypto-ca-bundle.crt', 'Package Control.cache', 'Package Control.last-run', 'Package Control.merged-ca-bundle', 'Package Control.sublime-settings', 'Package Control.user-ca-bundle', 'Preferences.sublime-settings', 'reids分布式锁', 'sha1.py', 'test.py', 'untitled.sublime-build']

具体用法见：https://www.cnblogs.com/yufeihlf/p/6179547.html

Mongo数据库与python的交互

import pymongo  # 交互模块

# 第一步，建立客户端，连接 mongo 服务器（ip 和 port）
from pymongo import MongoClient
client = MongoClient(host, port)
collection = client[db名][集合名]  # db名——相当于数据库的名称；集合名——相当于表名称

# 第二步，添加数据
ret = collection.insert_one({"name": "test10010", "age": 33})
print(ret)
# 通过返回的数据进行判断
if ret:
    xxxx

示例：

import pymongo
client = pymongo.MongoClient("localhost")
# 连接指定数据库中的指定集合，不存在就新建
collection = client["test"]["new"]
ret = collection.insert({"new": "python"})
print(ret)
# 结果：5d85ce978a808f42364b045c

插入前：

插入后：

正则表达式知识点回顾：

import re
pattern = re.compile("匹配规则", re.S)

re.compile() 返回的就是一个匹配规则，配合 search、find、match 等方法使用。

import re
a = """aaaaaaabbbbbbbb
111111ccccc"""
pattern1 = re.compile("aaaaaaa(.*?)cccc")
print(re.search(pattern1, a))
# None

re.S 可以匹配全部文本，不担心换行问题：

pattern2 = re.compile("aaaaaaa(.*?)cccc", re.S)
print(re.search(pattern2, a))
# <re.Match object; span=(0, 26), match='aaaaaaabbbbbbbb\n111111cccc'>

posted on 2019-09-19 17:18 张京墨阅读( ...) 评论( ...) 编辑收藏

转载于:https://www.cnblogs.com/meloncodezhang/p/11551139.html

最新回复(0)