【spider】多线程爬虫

mac2024-08-01 70

多线程工作原理

多线程示意图

Queue（队列对象）

queue是python中的标准库，可以直接from queue import Queue引用;队列是线程间最常用的交换数据的形式

python下多线程的思考

对于资源，加锁是个重要的环节。Queue，是线程安全的，因此在满足使用条件下，建议使用队列

创建一个“队列”对象

# Create a bounded queue that can hold at most 10 items.
pageQueue = Queue(10)

将一个值放入队列中

# Enqueue the page numbers 1 through 10.
for page in range(1, 11): pageQueue.put(page)

将一个值从队列中取出

# Remove and return the next item; with the default arguments this blocks while the queue is empty.
pageQueue.get()

队列Queue

Queue 是线程安全的。queue 是 Python 中的标准库，可以直接通过 from queue import Queue 引用；队列是线程间最常用的交换数据的形式。创建一个“队列”对象后，队列常用方法有：put()、get(block)、empty()、full()、qsize()。

队列锁与线程锁

import threading
from queue import Empty, Queue

# Shared work queue, bounded at 100 items.
dataQueue = Queue(100)
# Set to True by main() once every queued item has been processed;
# workers poll it to know when to stop.
exitFlag = False


class MyThread(threading.Thread):
    """Worker thread that prints items taken from a shared queue.

    The thread keeps consuming until the module-level ``exitFlag`` is set.
    """

    def __init__(self, q):
        """Store the queue ``q`` this worker consumes from."""
        super().__init__()
        self.queue = q

    def run(self):
        """Consume and print queue items until ``exitFlag`` becomes true."""
        while True:
            if exitFlag:
                print('++++++++++++++++++++++++++exit')
                break
            try:
                # A short timeout (instead of a non-blocking get) lets an idle
                # worker sleep rather than spin, while still re-checking
                # exitFlag regularly.
                item = self.queue.get(timeout=0.1)
            except Empty:
                continue
            try:
                print('------------------------', item)
            finally:
                # Always balance a successful get(), otherwise
                # dataQueue.join() would never return.
                self.queue.task_done()


def main():
    """Fill the queue, run five workers, and shut down once all work is done."""
    global exitFlag

    for i in range(100):
        dataQueue.put(i)

    threads = []
    for _ in range(5):
        thread = MyThread(dataQueue)
        threads.append(thread)
        thread.start()

    # Queue join: block until every queued item has been marked task_done().
    # Without this the exit flag is raised while items are still queued and
    # the workers quit before finishing the work.
    dataQueue.join()

    exitFlag = True
    print('exit ------------------------------------------------')

    # Thread join: wait for every worker to notice the flag and terminate.
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()

另一个实例：爬取读书网站

import threading
from queue import Empty, Queue
from threading import Lock

import requests
from bs4 import BeautifulSoup

# Listing-page URL template; %d is replaced by the page number.
url = 'https://www.dushu.com/book/1175_%d.html'
# Page numbers waiting to be downloaded.
task_queue = Queue(100)
# (html, page number) tuples waiting to be parsed.
parse_queue = Queue(100)
# NOTE(review): 'br' is advertised in Accept-Encoding; requests presumably
# only decodes brotli when a brotli package is installed - confirm.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'Hm_lvt_8008bbd51b8bc504162e1a61c3741a9d=1572418328; Hm_lpvt_8008bbd51b8bc504162e1a61c3741a9d=1572418390',
    'Host': 'www.dushu.com',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
}

# Exit marker for the parse threads; set once both queues are fully processed.
exit_flag = False


class CrawlThread(threading.Thread):
    """Download listing pages named in a task queue and hand the HTML to a parse queue."""

    def __init__(self, q_task: Queue, q_parse: Queue) -> None:
        """Remember the task queue to read from and the parse queue to write to."""
        super().__init__()
        self.q_task = q_task
        self.q_parse = q_parse

    def run(self) -> None:
        """Thread entry point: keep crawling until the task queue is drained."""
        self.spider()

    def spider(self):
        """Fetch pages until no task is left, pushing ``(html, taskId)`` to the parse queue."""
        name = threading.current_thread().name
        while True:
            try:
                # get_nowait() avoids the empty()-then-get() race in which
                # another thread takes the last task and this one blocks forever.
                taskId = self.q_task.get_nowait()
            except Empty:
                print('+++++++爬虫线程%s执行任务结束+++++++' % name)
                break
            try:
                # The timeout keeps a stalled connection from hanging the thread.
                response = requests.get(url % taskId, headers=headers, timeout=10)
                response.encoding = 'utf-8'
                self.q_parse.put((response.text, taskId))
                print('------爬虫线程：%s-----执行任务:%d-------' % (name, taskId))
            except requests.RequestException as exc:
                print('crawl thread %s failed on page %d: %r' % (name, taskId, exc))
            finally:
                # Mark the task finished even on failure so that
                # task_queue.join() cannot hang.
                self.q_task.task_done()


def crawl():
    """Queue page numbers 1..100 and start five crawler threads.

    Returns:
        The list of started ``CrawlThread`` objects.
    """
    for i in range(1, 101):
        task_queue.put(i)
    crawlers = []
    for _ in range(5):
        t = CrawlThread(task_queue, parse_queue)
        t.start()
        crawlers.append(t)
    return crawlers


class ParseThread(threading.Thread):
    """Parse downloaded pages and append one tab-separated line per book to a file."""

    def __init__(self, q_parse: Queue, lock: Lock, fp):
        """Store the parse queue, the lock guarding ``fp``, and the output file object."""
        super().__init__()
        self.q_parse = q_parse
        self.lock = lock
        self.fp = fp

    def run(self):
        """Thread entry point: parse pages until ``exit_flag`` is set."""
        self.parse()

    def parse(self):
        """Consume ``(html, taskId)`` items until the module-level ``exit_flag`` is true."""
        name = threading.current_thread().name
        while True:
            if exit_flag:
                print('-----------解析线程：%s完成任务退出------------' % name)
                break
            try:
                # Wait briefly instead of spinning on a non-blocking get.
                html, taskId = self.q_parse.get(timeout=0.5)
            except Empty:
                continue
            try:
                self._save_books(html)
                print('**********解析线程：%s完成了第%d页解析任务***********' % (name, taskId))
            except Exception as exc:
                # Thread boundary: report the failure instead of dying silently.
                print('parse thread %s failed on page %d: %r' % (name, taskId, exc))
            finally:
                # Always balance the get() so parse_queue.join() can return.
                self.q_parse.task_done()

    def _save_books(self, html):
        """Extract every book entry from ``html`` and write it to ``self.fp``."""
        soup = BeautifulSoup(html, 'lxml')
        books = soup.select('div[class="bookslist"] > ul > li')
        print('----------------', len(books))
        for book in books:
            try:
                book_url = book.find('img').attrs['src']
                book_title = book.select('h3 a')[0]['title']
                paragraphs = book.select('p')
                book_author = paragraphs[0].get_text()
                book_describe = paragraphs[1].get_text()
            except (AttributeError, IndexError, KeyError):
                # Entry lacks an expected element; skip it rather than
                # abandoning the rest of the page.
                continue
            # Hold the lock only around the write; ``with`` releases it even
            # if the write raises.
            with self.lock:
                self.fp.write('%s\t%s\t%s\t%s\n' % (book_url, book_title, book_author, book_describe))


def parse(fp):
    """Start five parser threads that share one lock and write to ``fp``.

    Returns:
        The list of started ``ParseThread`` objects.
    """
    lock = Lock()
    parsers = []
    for _ in range(5):
        t = ParseThread(parse_queue, lock, fp)
        t.start()
        parsers.append(t)
    return parsers


def main():
    """Run the crawl/parse pipeline and append the results to ./book.txt."""
    global exit_flag
    crawl()
    with open('./book.txt', 'a', encoding='utf-8') as fp:
        parsers = parse(fp)
        # Queue join: every task in both queues must be finished before
        # execution continues.
        task_queue.join()
        parse_queue.join()
        exit_flag = True
        # Let the parser threads observe the flag and exit before the file
        # is closed.
        for t in parsers:
            t.join()
    print('代码执行到这里！！！！！！！！！！！！！！')


if __name__ == '__main__':
    main()

多线程实现读书（http://www.qwsy.com/shuku.aspx?&page=1）：导包；定义变量；创建爬虫线程并启动爬虫线程；创建解析线程并启动解析线程。Queue.get(block = True/False)；join() 锁定线程，确保线程全部执行完毕，结束任务。

网络毒刘 认证博客专家 Python 公众号：刘旺学长 数据分析 因为同样的坑不想踩两次而写博客，也同样为了社会更好的进步…… 其实 Python 已经是一个很老的编程语言了，到现在（2019年）Python 已经高龄 28 岁，比很多程序员的年龄都大。现在之所以这么流行，和社区、人工智能 AI 的日益发展有很大的关系。千里之行始于足下，还不开始学习 Python 编程吗？个人公众号：刘旺学长，一名热爱分享技术的宝藏博主。公众号回复1024，有免费教程分享。

最新回复(0)