I've been learning Python recently and wrote a small piece of code for practice.
1. Fetching the page data (HTML)
url is the page address; headers is the request-header information.
Here are a few commonly used User-Agent strings:
headers = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
]

Generally we fetch the page through urllib2, picking one of these User-Agents at random for each request:
''' Fetch the HTML page '''
def get_html(url, headers):
    req = urllib2.Request(url)
    # pick a random User-Agent so successive requests don't all look identical
    header = random.choice(headers)
    req.add_header('User-Agent', header)
    req.add_header('Sec-Fetch-User', '?1')
    req.add_header('Sec-Fetch-Site', 'none')
    req.add_header('Sec-Fetch-Mode', 'navigate')
    html = urllib2.urlopen(req).read()
    return html
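Note that get_html assumes every request succeeds. If you want the crawl to survive an occasional failure, here's a minimal sketch; get_html_safe is a name I made up for illustration, and urllib2.URLError is the base exception urlopen raises on network errors:

''' Hedged sketch (not part of the original code): tolerate request failures '''
def get_html_safe(url, headers):
    try:
        return get_html(url, headers)
    except urllib2.URLError as e:
        # skip this page instead of crashing the whole crawl
        print 'request failed: {} ({})'.format(url, e)
        return None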
2. Turning the fetched HTML into usable data (using XPath)

It's worth installing the XPath extension for Google Chrome; it makes working out XPath expressions much easier.
soup.xpath('//div[@class="witkey-item-top "]') selects the elements whose data you want to extract; change the expression to match whatever your target page needs. A small runnable demo follows, then the real function.
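To make the text() and @href patterns concrete, here is a tiny self-contained demo; the HTML snippet and the "item" class name are invented for illustration:

# -*- coding:utf-8 -*-
from lxml import etree

sample_html = '''
<div class="item"><a href="http://example.com/1">First title</a></div>
<div class="item"><a href="http://example.com/2">Second title</a></div>
'''

soup = etree.HTML(sample_html)
for div in soup.xpath('//div[@class="item"]'):
    title = div.xpath('./a/text()')[0]  # text inside the <a> tag
    link = div.xpath('./a/@href')[0]    # value of the href attribute
    print title, link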
''' Extract the data from the HTML page '''
def get_page_data(html):
    soup = etree.HTML(html)
    # replace "class" with the class attribute of the elements you want
    div_list = soup.xpath('//div[@class="class"]')
    # append rows; the CSV file itself is created beforehand (see section 3)
    with open(r'C:\Users\Administrator\Desktop\onelinux\os\ces.csv', 'a') as f:
        for div in div_list:
            title = div.xpath('.//div[@class="class"]/text()')[0]
            f.write('{}\n'.format(title))
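One caveat: xpath() returns a list, so the [0] index raises IndexError whenever a div has no match. A small hedged helper (safe_first is my own name, not from the post) that returns None instead:

def safe_first(node, expr):
    ''' Return the first XPath match under node, or None if nothing matched '''
    matches = node.xpath(expr)
    return matches[0] if matches else None

# usage inside the loop above:
# title = safe_first(div, './/div[@class="class"]/text()')
# if title is not None:
#     f.write('{}\n'.format(title))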
3. Creating the CSV file

Be sure to create the CSV file first, before writing any data into it.

''' Create the CSV '''
def creat_csv():
    csv_headers = ['Title']
    # on Python 2 the csv module wants the file opened in 'wb' mode;
    # the with block closes the file, so no explicit f.close() is needed
    with open(r'C:\Users\Administrator\Desktop\onelinux\os\ces.csv', 'wb') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(csv_headers)
        # f_csv.writerows() can write several rows at once
4. Remember to import the modules you use

import urllib2
import random
import csv
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
from lxml import etree
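These imports are Python 2 only (urllib2 and reload(sys) don't exist in Python 3). If you are on Python 3, a rough sketch of the equivalents, purely as an assumption-labeled aside and not part of the original post, would look like this; reload(sys)/setdefaultencoding is unnecessary there:

# Python 3 sketch of the same imports and request call
import random
import csv
from urllib import request, parse
from lxml import etree

def get_html(url, headers):
    req = request.Request(url)
    req.add_header('User-Agent', random.choice(headers))
    return request.urlopen(req).read()

# urllib2.quote(keyword) becomes parse.quote(keyword), and the CSV file
# is opened with open(path, 'w', newline='') instead of 'wb'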
5. Here is the complete code; remember to replace the parameters with your own.

# -*- coding:utf-8 -*-
import urllib2
import random
import csv
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
from lxml import etree

# a few common User-Agent strings; one is chosen at random per request
headers = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
]

''' Fetch the HTML page '''
def get_html(url, headers):
    req = urllib2.Request(url)
    header = random.choice(headers)
    req.add_header('User-Agent', header)
    req.add_header('Sec-Fetch-User', '?1')
    req.add_header('Sec-Fetch-Site', 'none')
    req.add_header('Sec-Fetch-Mode', 'navigate')
    html = urllib2.urlopen(req).read()
    return html

''' Create the CSV with its header row '''
def creat_csv():
    csv_headers = ['Name', 'URL']
    with open(r'C:\Users\Administrator\Desktop\onelinux\os\ces.csv', 'wb') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(csv_headers)
        # f_csv.writerows() can write several rows at once

''' Extract the data from the HTML page '''
def get_page_data(html):
    soup = etree.HTML(html)
    div_list = soup.xpath('//div[@class="your-class-name"]')
    with open(r'C:\Users\Administrator\Desktop\onelinux\os\ces.csv', 'a') as f:
        for div in div_list:
            title = div.xpath('.//div[@class="your-class-name"]/a/text()')[0]
            link = div.xpath('.//div[@class="your-class-name"]/a/@href')[0]  # @href, not @herf
            f.write('{},{}\n'.format(title, link))

''' Main function '''
def main():
    # raw_input is safer than input() on Python 2, which eval()s what you type
    num = int(raw_input('How many pages do you want to crawl? '))
    keyword = raw_input('Enter the keyword to crawl: ')
    keyword = urllib2.quote(keyword)
    for i in range(num):
        # page-offset formula for my target site; adjust it for yours
        page = (i - 1) * 5 + i * 65
        if page < 0:
            page = 0
        url = 'your-site-url?page={}&key={}'.format(page, keyword)
        html = get_html(url, headers)
        get_page_data(html)

creat_csv()  # create the CSV (header row) first
main()       # then crawl and append the data

6. If you have a better approach or cleaner logic, feel free to leave a comment; I'm happy to discuss and learn.