python访问网站

mac2022-06-30 26

#!/usr/bin/env python
# encoding: utf-8
"""Fetch a web page with requests, falling back to headless Chrome.

The page is first downloaded over plain HTTP.  If the download fails or the
expected XPath is missing from the static HTML (for example because the
content is rendered client-side), the page is loaded in headless Chrome via
selenium and the rendered source is parsed instead.
"""
from functools import wraps
import random
import time

import requests
from lxml import html
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options

# Components of the fake Chrome version; drawn once when the module loads.
first_num = random.randint(55, 62)
third_num = random.randint(0, 3200)
fourth_num = random.randint(0, 140)


class FakeChromeUA:
    """Builds a desktop-Chrome-looking User-Agent string."""

    # Platform tokens from which one is picked at random per get_ua() call.
    os_type = [
        '(Windows NT 6.1; WOW64)',
        '(Windows NT 10.0; WOW64)',
        '(X11; Linux x86_64)',
        '(Macintosh; Intel Mac OS X 10_12_6)',
    ]

    # Evaluated once at class creation, so the version is fixed per process.
    chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num)

    @classmethod
    def get_ua(cls):
        """Return a User-Agent string with a randomly chosen platform token."""
        return ' '.join([
            'Mozilla/5.0',
            random.choice(cls.os_type),
            'AppleWebKit/537.36',
            '(KHTML, like Gecko)',
            cls.chrome_version,
            'Safari/537.36',
        ])


# Request headers shared by every plain-HTTP fetch.
HEADERS = {
    'User-Agent': FakeChromeUA.get_ua(),
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Connection': 'keep-alive',
}

URL = "https://www.taobao.com/"
MAX_RETRY = 3  # maximum number of plain-HTTP attempts
XPATH = "//div[@class='cat-title']"  # XPath that must match for the HTML to be usable
REQUEST_TIMEOUT = 10  # seconds to wait for the HTTP response
RENDER_WAIT = 3  # seconds given to the browser to finish rendering


def request(url, timeout=REQUEST_TIMEOUT):
    """Download *url* over plain HTTP.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response body when the status code is 200, otherwise
        ``None``.  Network errors are printed and also yield ``None`` so the
        caller can retry.
    """
    try:
        # The context manager releases the session's pooled connections.
        with requests.Session() as session:
            resp = session.get(url, headers=HEADERS, timeout=timeout)
            if resp.status_code != requests.codes.ok:
                return None
            # Use the encoding detected from the body rather than the header.
            resp.encoding = resp.apparent_encoding
            return resp.text
    except requests.RequestException as exc:
        print(exc.args)
        return None


def getdriver(url):
    """Load *url* in headless Chrome and return the rendered page source.

    Args:
        url: Address of the page to load.

    Returns:
        The page source reported by the browser after ``RENDER_WAIT`` seconds.

    Raises:
        selenium.common.exceptions.WebDriverException: If Chrome cannot be
            started or the page cannot be loaded.
    """
    co = Options()
    # Content-setting value 2 blocks images, which speeds up page loads.
    prefs = {'profile.default_content_setting_values': {'images': 2}}
    co.add_experimental_option('prefs', prefs)
    co.add_argument('--lang=zh-CN')
    co.add_argument('--headless')
    co.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=co)
    try:
        driver.get(url)
        # Give client-side scripts time to populate the DOM.
        time.sleep(RENDER_WAIT)
        return driver.page_source
    finally:
        print("关闭chrome浏览器")
        # quit() ends the whole session and the chromedriver process;
        # close() would only close the current window.
        driver.quit()


def _fetch_static(url, attempts):
    """Try request() up to *attempts* times; return the first non-empty body or None."""
    for _ in range(attempts):
        source = request(url)
        if source:
            return source
    return None


def _fetch_rendered(url):
    """Return the browser-rendered source, restarting Chrome once on failure."""
    try:
        return getdriver(url)
    except WebDriverException:
        print("获取内容失败，再次启动谷歌浏览器")
        return getdriver(url)


def newdecorator(url, retry, check_xpath):
    """Create a decorator that supplies page source to the wrapped parser.

    Args:
        url: Page to download.
        retry: Maximum number of plain-HTTP attempts before the browser
            fallback is used.
        check_xpath: XPath that must match in the static HTML; when it does
            not, the page is re-fetched through headless Chrome.

    Returns:
        A decorator.  The decorated function is called with the page source
        as its only argument; arguments passed to the wrapper are ignored.
        The wrapper returns the function's result, or ``None`` after printing
        the error if anything fails.
    """
    def decorator(func):
        @wraps(func)
        def log(*args, **kwargs):
            try:
                source = _fetch_static(url, retry)
                if source:
                    print("开启requests模块")
                    print("=" * 50)
                    if html.fromstring(source).xpath(check_xpath):
                        return func(source)
                # Reached when the static HTML lacks the expected nodes or
                # when every plain-HTTP attempt failed.
                print("该网站为ajax生成的网页，开始启用chrome模式")
                return func(_fetch_rendered(url))
            except Exception as e:
                # Script-level boundary: report the failure instead of crashing.
                print(e.args)
                return None
        return log
    return decorator


@newdecorator(url=URL, retry=MAX_RETRY, check_xpath=XPATH)
def getitem(source):
    """Print the category names found under ``XPATH`` in *source*.

    Args:
        source: HTML text of the page.
    """
    root = html.fromstring(source)
    nodes = root.xpath(XPATH)
    print("=" * 50)
    print("开始解析网页")
    print("=" * 50)
    print("获取商品分类")
    for item in nodes:
        name = item.xpath(".//text()")
        # The second text node is printed; skip nodes that do not have one.
        if len(name) > 1:
            print(name[1])


if __name__ == '__main__':
    getitem()

转载于:https://www.cnblogs.com/c-x-a/p/9106064.html

最新回复(0)