#!/usr/bin/env python
# encoding: utf-8
from functools import wraps
import requests
from lxml import html
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import random
# Components of the fake Chrome version number. They are drawn once, when the
# module is imported, so every User-Agent built in this process reports the
# same Chrome version.
first_num = random.randint(55, 62)
third_num = random.randint(0, 3200)
fourth_num = random.randint(0, 140)


class FakeChromeUA:
    """Builds Chrome-style User-Agent strings with a random platform token."""

    # Platform tokens; get_ua() picks one at random on every call.
    os_type = [
        '(Windows NT 6.1; WOW64)',
        '(Windows NT 10.0; WOW64)',
        '(X11; Linux x86_64)',
        '(Macintosh; Intel Mac OS X 10_12_6)',
    ]

    # Version token, evaluated once when the class body runs.
    chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num)

    @classmethod
    def get_ua(cls):
        """Return a User-Agent string.

        The platform token is chosen at random from ``os_type``; the Chrome
        version token is the class-level ``chrome_version``.
        """
        platform_token = random.choice(cls.os_type)
        pieces = (
            'Mozilla/5.0',
            platform_token,
            'AppleWebKit/537.36',
            '(KHTML, like Gecko)',
            cls.chrome_version,
            'Safari/537.36',
        )
        return ' '.join(pieces)
# Headers sent with every plain HTTP request. The User-Agent is generated
# once, when this module is imported.
HEADERS = {
    'User-Agent': FakeChromeUA.get_ua(),
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Connection': 'keep-alive',
}

URL = "https://www.taobao.com/"
MAX_RETRY = 3  # maximum number of attempts
XPATH = "//div[@class='cat-title']"  # XPath that has to be checked
def request(url, timeout=10):
    """Fetch *url* with ``requests`` and return the decoded response body.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response text, decoded with the encoding detected from the body,
        when the status code equals ``requests.codes.ok``; otherwise ``None``.

    Raises:
        requests.RequestException: Propagated unchanged on connection errors
            or when the timeout expires.
    """
    # The context manager releases the session's pooled connections even when
    # the request raises.
    with requests.Session() as session:
        response = session.get(url, headers=HEADERS, timeout=timeout)
        if response.status_code != requests.codes.ok:
            return None
        # Decode with the encoding sniffed from the content rather than the
        # one guessed from the HTTP headers.
        response.encoding = response.apparent_encoding
        return response.text
def getdriver(url):
    """Load *url* in headless Chrome and return the rendered page source.

    Args:
        url: Address to open in the browser.

    Returns:
        The value of ``driver.page_source`` read three seconds after
        navigation.

    Raises:
        selenium.common.exceptions.WebDriverException: If Chrome cannot be
            started or the navigation fails. The browser is shut down before
            the exception propagates.
    """
    co = Options()
    # Content-setting value 2 disables image loading.
    prefs = {
        'profile.default_content_setting_values': {
            'images': 2,
        },
    }
    co.add_experimental_option('prefs', prefs)
    co.add_argument('lang=zh_CN.UTF-8')
    co.add_argument('--headless')
    # '--nogpu' is not a Chrome switch; '--disable-gpu' is the real one.
    co.add_argument('--disable-gpu')
    # 'options' replaces the deprecated 'chrome_options' keyword.
    driver = webdriver.Chrome(options=co)
    try:
        driver.get(url)
        # Fixed wait to give scripts on the page time to build the DOM.
        time.sleep(3)
        return driver.page_source
    finally:
        print("关闭chrome浏览器")
        # quit() ends the whole browser session and the driver process;
        # close() would only close the current window.
        driver.quit()
def newdecorator(url, retry, check_xpath):
    """Decorator factory that downloads *url* and feeds its HTML to a parser.

    The wrapper first tries a plain HTTP request, up to *retry* times. If the
    downloaded document has no node matching *check_xpath*, the page is
    fetched again through headless Chrome. The resulting HTML is passed to
    the decorated function as its first positional argument.

    Args:
        url: Address of the page to download.
        retry: Maximum number of plain HTTP attempts.
        check_xpath: XPath expression that must match at least one node for
            the plain HTTP response to be used as-is.

    Returns:
        A decorator. The wrapped function returns whatever the decorated
        function returns, or ``None`` when no content could be downloaded or
        when any exception was raised (the exception's args are printed).
    """
    def decorator(func):
        @wraps(func)
        def log(*args, **kwargs):
            try:
                source = None
                # Honour the caller-supplied attempt limit instead of a
                # hard-coded bound kept in a module-level global.
                for _ in range(retry):
                    source = request(url)
                    if source:
                        break
                if not source:
                    # Every attempt came back empty; do not hand None to the
                    # parser.
                    print("requests请求失败,已达到最大尝试次数")
                    return None

                print("开启requests模块")
                print("=" * 50)
                root = html.fromstring(source)
                if not root.xpath(check_xpath):
                    print("该网站为ajax生成的网页,开始启用chrome模式")
                    try:
                        source = getdriver(url)
                    except Exception:
                        # One more browser attempt; a second failure is
                        # reported by the outer handler.
                        print("获取内容失败,再次启动谷歌浏览器")
                        source = getdriver(url)
                return func(source, *args, **kwargs)
            except Exception as e:
                # Best-effort boundary: report the failure and return None.
                print(e.args)
        return log
    return decorator
@newdecorator(url=URL, retry=MAX_RETRY, check_xpath=XPATH)
def getitem(source):
    """Parse *source* and print the text of each node matching ``XPATH``.

    Args:
        source: HTML document as a string. It is supplied by the decorator,
            so callers invoke ``getitem()`` without arguments.
    """
    root = html.fromstring(source)
    nodes = root.xpath(XPATH)
    print("=" * 50)
    print("开始解析网页")
    print("=" * 50)
    print("获取商品分类")
    for item in nodes:
        name = item.xpath(".//text()")
        # The second text node is the one printed; skip nodes that have
        # fewer than two so one odd node does not abort the whole listing.
        if len(name) > 1:
            print(name[1])
# Run the scraper only when the file is executed as a script, not on import.
if __name__ == '__main__':
    getitem()
# Reposted from: https://www.cnblogs.com/c-x-a/p/9106064.html