# 直接上代码
#!/usr/bin/env python3
# coding=UTF-8
"""Scrape Taobao search results with Selenium and store them in MongoDB.

Workflow: open the Taobao login page (the user logs in by scanning the QR
code), submit a search keyword, then walk the result pages one by one,
parsing every listed item with PyQuery and inserting it into the
``taobao.table_name1`` collection.

Importing this module connects to MongoDB on localhost:27017 and launches a
Chrome window, exactly as the original script did.
"""
import random  # used for the randomized anti-crawl delay
import time

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

LOGIN_URL = 'https://login.taobao.com/member/login.jhtml'
ITEM_SELECTOR = '#mainsrp-itemlist .items .item'

client = pymongo.MongoClient(host='localhost', port=27017)  # connect to MongoDB
db = client.taobao.table_name1  # collection `table_name1` in database `taobao`
browser = webdriver.Chrome()  # shared browser instance
wait = WebDriverWait(browser, 10)  # explicit wait, 10 second timeout


def _anti_crawl_wait() -> None:
    """Sleep for a random 1-3 seconds, printing a countdown every second."""
    seconds = random.randint(1, 3)
    for elapsed in range(seconds):
        print('反爬等待', seconds - elapsed, 's')
        time.sleep(1)


def search(keyword: str) -> None:
    """Log in to Taobao and submit ``keyword`` in the search box.

    The whole sequence (load login page, wait for the search box, submit,
    wait for the pager) is retried until it completes without a Selenium
    timeout. The retry is a loop rather than recursion, so a long run of
    timeouts cannot exhaust the call stack.

    Args:
        keyword: Text typed into the search box.
    """
    while True:
        try:
            browser.get(LOGIN_URL)
            print('请扫描二维码登录淘宝!')
            # The search box only becomes present once login has succeeded.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 "#J_TSearchForm > div.search-button > button")))
            search_box.send_keys(keyword)
            submit.click()
            _anti_crawl_wait()
            # The pager's "total" element marks a loaded result page.
            wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > div.total')))
        except TimeoutException as error:
            print(error)
            continue
        print('login_success!')
        return


def next_page(pagenumber: int) -> None:
    """Jump to result page ``pagenumber`` and scrape it.

    Types the page number into the pager form, submits it, waits until the
    pager highlights that number, then calls ``product()``. Any failure is
    reported on stdout and swallowed so the crawl can move on.

    Args:
        pagenumber: 1-based number of the result page to open.
    """
    try:
        page_box = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR,
             '#mainsrp-pager > div > div > div > div.form > input')))
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR,
             "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
        page_box.clear()
        _anti_crawl_wait()
        page_box.send_keys(str(pagenumber))
        submit.click()
        # The active pager item showing the requested number confirms the jump.
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR,
             "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
            str(pagenumber)))
        print('翻页成功')
        product()
    except Exception as error:
        # `Exception` instead of a bare except: Ctrl-C and SystemExit still
        # propagate, and the real cause is printed next to the summary.
        print('触发了反爬机制,错误啦')
        print(error)


def save_to_mongo(result: dict) -> None:
    """Insert one scraped record into the module-level collection ``db``.

    Failures are reported on stdout and swallowed (best effort), so one bad
    record does not abort the crawl.

    Args:
        result: The record to store.
    """
    try:
        # `db` is already the collection; insert_one is the supported API.
        db.insert_one(result)
    except Exception as error:
        print('存储到Mongo失败', result)
        print(error)
    else:
        print('存储到Mongo成功')


def product() -> None:
    """Parse every item on the current result page and store each one."""
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, ITEM_SELECTOR)))
    doc = pq(browser.page_source)
    _anti_crawl_wait()
    for item in doc(ITEM_SELECTOR).items():
        # NOTE(review): the keys are inconsistent about the trailing colon
        # ('location' has none). They are kept as-is because they define the
        # stored document schema - confirm before normalizing.
        record = {
            'image:': item.find('.pic img').attr('data-src'),
            # Drops the last three characters of the deal-count text;
            # presumably a fixed textual suffix - TODO confirm.
            'deal:': item.find('.deal-cnt').text()[:-3],
            'location': item.find('.location').text(),
            'price:': item.find('strong').text(),
            'shop:': item.find('.shop').text(),
            'title:': item.find('.title').text(),
        }
        print('获取数据成功,准备写入MongoDB')
        save_to_mongo(record)


def main(keyword: str = '零食', last_page: int = 100) -> None:
    """Run the full crawl: search, scrape page 1, then pages 2..last_page.

    Page 1 is scraped here; every later page is scraped by ``next_page``
    after it navigates, so each page is stored exactly once.

    Args:
        keyword: Search term; defaults to the original hard-coded value.
        last_page: Last result page to visit (inclusive).
    """
    try:
        search(keyword)
        product()
        for page in range(2, last_page + 1):
            _anti_crawl_wait()
            print('准备翻页到第', page, '页')
            next_page(page)
        print('ALL DONE!')
    finally:
        # quit() ends the driver session even when the crawl raises.
        browser.quit()


if __name__ == "__main__":
    main()