# Uses the threading library
import json
import threading
import time
from queue import Empty, Queue

import requests
from bs4 import BeautifulSoup
class ThreadCrawl(threading.Thread):
    """Worker thread that drains page numbers from a queue and fetches pages.

    Each worker repeatedly takes a page number from ``pageQueue``, downloads
    the corresponding qiushibaike listing page, and puts the HTTP response
    object onto ``dataQueue``.  The loop runs until the module-level
    ``CRAWL_EXIT`` flag is set to True by ``main()``.
    """

    def __init__(self, threadNmae, pageQueue, dataQueue):
        # super() is preferable to calling threading.Thread.__init__ directly:
        # it cooperates correctly with multiple inheritance.
        super(ThreadCrawl, self).__init__()
        self.threadNmae = threadNmae  # display name used in the log prints
        self.pageQueue = pageQueue    # queue of page numbers still to fetch
        self.dataQueue = dataQueue    # queue collecting fetched responses
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"}

    def run(self):
        print("启动" + self.threadNmae)
        while not CRAWL_EXIT:
            try:
                # Non-blocking get (block=False): raises queue.Empty at once
                # when the queue is drained instead of blocking forever.
                page = self.pageQueue.get(False)
            except Empty:
                # Nothing to do right now; poll again until CRAWL_EXIT is set.
                continue
            url = "https://www.qiushibaike.com/8hr/page/" + str(page) + "/"
            try:
                # timeout keeps a stalled server from hanging this thread.
                content = requests.get(url, headers=self.headers, timeout=10)
                self.dataQueue.put(content)
            except requests.RequestException as exc:
                # Best-effort crawl: report and skip failed pages rather than
                # silently swallowing every error (original bare `except: pass`).
                print("request failed:", url, exc)
        print("结束" + self.threadNmae)
# Shutdown flags polled by the worker threads.  main() flips CRAWL_EXIT to
# True once every page number has been consumed from the page queue.
CRAWL_EXIT=False
# Apparently intended as a shutdown flag for a parser thread; it is never
# read in this file — TODO confirm whether any external code uses it.
PARSE_EXIT=False
def main():
    """Start three crawler threads, feed them pages 1-10, then shut down.

    Fills a bounded queue with page numbers, launches the workers, waits
    for the queue to drain, signals shutdown via CRAWL_EXIT, and joins
    every thread before returning.
    """
    # Page-number queue, bounded at 10 entries (FIFO).
    pageQueue = Queue(10)
    # Enqueue page numbers 1 through 10.
    for i in range(1, 11):
        pageQueue.put(i)
    # Unbounded queue that collects the fetched responses.
    dataQueue = Queue()
    # Display names for the three crawler threads.
    crawList = ["采集线程1号", "采集线程2号", "采集线程3号"]
    # Keep references so we can join() each thread at the end.
    threadcrawl = []
    for threadNmae in crawList:
        thread = ThreadCrawl(threadNmae, pageQueue, dataQueue)
        thread.start()
        threadcrawl.append(thread)
    # Wait until the workers have drained the page queue.  Sleeping between
    # polls avoids the original busy-wait that pinned an entire CPU core.
    while not pageQueue.empty():
        time.sleep(0.1)
    # Tell the workers to stop looping.
    global CRAWL_EXIT
    CRAWL_EXIT = True
    print("Queue为空")
    for thread in threadcrawl:
        thread.join()
        print("joining...............")
# Run the crawler only when executed as a script, not when imported.
if __name__=="__main__":
    main()
# Reposted from: https://www.cnblogs.com/c-x-a/p/8027281.html