今日头条的街拍下载实现 python

mac2022-06-30  62

"""Download "street snap" (街拍) photo albums from Toutiao search results.

The script renders the search page with headless Chrome, collects the
article links it finds, renders each article to extract its image URLs,
and finally downloads the images into one folder per article using a
small pool of threads.
"""
import os
import re
import threading
from time import sleep

import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Matches one search-result entry: (div id, text following "/group/" in href).
ARTICLE_PATTERN = re.compile(r'<div id="(.*?)".*?href="/group/(.*?)">')
# Matches the absolute URL of every <img> tag on an article page.
IMAGE_PATTERN = re.compile(r'<img src="(http.*?)"')

# Root folder that receives one sub-folder per article.
SAVE_DIR = 'E:/桌面/222'
# Seconds to wait after loading a page so dynamic content can render.
PAGE_LOAD_WAIT_SECONDS = 3
# Upper bound for a single image request so a stalled server cannot hang a worker.
REQUEST_TIMEOUT_SECONDS = 30
# Number of download threads; albums are dealt out to them round-robin.
WORKER_COUNT = 2


def gethtml(url):
    """Return the rendered page source of *url* using headless Chrome.

    The browser is always shut down, even when loading the page fails.
    """
    chrome_options = Options()
    # Run Chrome without a visible window.
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # Exclude Chrome's "enable-automation" switch (the original author
    # describes this as turning on Chrome's developer mode).
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        # Per the original author: a captcha may appear on first load and a
        # refresh gets past it.
        driver.refresh()
        # Give the page time to finish loading, otherwise the wanted data
        # is not in the page source yet.
        sleep(PAGE_LOAD_WAIT_SECONDS)
        return driver.page_source
    finally:
        # quit() ends the whole browser session, including every window.
        driver.quit()


def parserurl(soup):
    """Return a list of (div id, article id) tuples found in the HTML *soup*."""
    return ARTICLE_PATTERN.findall(soup)


def geturls(html):
    """Print and return the list of absolute image URLs found in *html*."""
    pict = IMAGE_PATTERN.findall(html)
    print(pict)
    return pict


def _download_albums(albums, save_dir):
    """Save the images of each (folder name, image URLs) pair under *save_dir*.

    Images are written as 1.jpg, 2.jpg, ... in URL order. An image whose
    download fails is reported and skipped so one bad URL does not abort
    the rest of the album.
    """
    for name, urls in albums:
        folder = os.path.join(save_dir, name)
        # exist_ok avoids a crash on reruns or repeated folder names; building
        # the full path avoids os.chdir(), which is process-wide and would
        # race between worker threads.
        os.makedirs(folder, exist_ok=True)
        for index, image_url in enumerate(urls, start=1):
            try:
                response = requests.get(image_url, timeout=REQUEST_TIMEOUT_SECONDS)
                # Do not store an HTTP error page as if it were an image.
                response.raise_for_status()
            except requests.RequestException as exc:
                print('skipped {}: {}'.format(image_url, exc))
                continue
            target = os.path.join(folder, '{}.jpg'.format(index))
            with open(target, 'wb') as f:
                f.write(response.content)


def main(save_dir=SAVE_DIR):
    """Scrape the search page, collect image URLs and download them.

    *save_dir* is the root output folder; it defaults to ``SAVE_DIR``.
    The function returns once every download thread has finished.
    """
    url = 'https://www.toutiao.com/search/?keyword=街拍'
    url_1 = 'https://www.toutiao.com/'

    htmlurl = parserurl(gethtml(url))
    filname = [folder for folder, _ in htmlurl]

    # One (folder name, image URLs) pair per article found on the search page.
    albums = []
    for folder, article_id in htmlurl:
        page = gethtml(url_1 + 'a' + article_id)
        albums.append((folder, geturls(page)))
    print(filname)

    # Start the download threads: worker i handles albums i, i + WORKER_COUNT, ...
    workers = [
        threading.Thread(
            target=_download_albums,
            args=(albums[offset::WORKER_COUNT], save_dir),
        )
        for offset in range(WORKER_COUNT)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    main()
最新回复(0)