直接上代码
#coding=UTF-8
"""Multi-threaded image-metadata scraper for www.7160.com.

Each worker thread claims one listing page under ``/rentiyishu/``, visits
every gallery linked from it, and stores the ``src``/``alt`` of the main
image of every gallery page in the MongoDB collection
``taobao.mutitry47160``.
"""
from urllib.request import urlopen
from pyquery import PyQuery as pq
import re
import pymongo
import threading

client = pymongo.MongoClient(host='localhost', port=27017)
table = client.taobao.mutitry47160

# Guards the shared listing-page counter ``i`` below.
lock = threading.Lock()
# Last listing-page number handed out to a worker. It is incremented *before*
# use, so the first page actually fetched is number 2.
# NOTE(review): page 1 is therefore never fetched -- confirm this is intended.
i = 1


def save_to_mongo(result):
    """Insert ``result`` (a dict) into the Mongo collection, best-effort.

    Failures are reported on stdout and never propagated, so one bad record
    does not stop the crawl.
    """
    try:
        # ``Collection.insert`` was removed in PyMongo 4; ``insert_one`` is
        # the supported single-document API (available since PyMongo 3.0).
        table.insert_one(result)
    except Exception as exc:
        print('存储到Mongo失败', result, exc)
    else:
        print('存储到Mongo成功')


def _extract_product(doc):
    """Return ``{'image': src, 'title': alt}`` for the main image in ``doc``."""
    main_image = doc('.picsbox.picsboxcenter p a img')
    return {'image': main_image.attr('src'), 'title': main_image.attr('alt')}


def _parse_page_count(text):
    """Return the first number found in ``text`` as an int, or ``None``.

    ``None`` is returned when ``text`` is empty/``None``, contains no number,
    or the first number is not an integer (e.g. ``"1.5"``).
    """
    numbers = re.findall(r"\d+\.?\d*", text or '')
    if not numbers:
        return None
    try:
        return int(numbers[0])
    except ValueError:
        return None


def download_son(Son_link, l):
    """Scrape page number ``l`` of the gallery at ``Son_link`` and store it.

    ``Son_link`` is the site-relative gallery path (as found in the listing
    page); the page URL is built as ``<Son_link>index_<l>.html``.
    """
    page_url = 'https://www.7160.com' + Son_link + 'index_' + str(l) + '.html'
    product = _extract_product(pq(page_url, encoding='gbk'))
    print(product)
    save_to_mongo(product)


def father_link():
    """Claim the next listing page and crawl every gallery linked from it.

    The first page of each gallery is scraped inline; one thread per
    remaining gallery page is started to run :func:`download_son`.
    """
    global i
    with lock:
        i += 1
        # Capture the claimed page number while still holding the lock;
        # reading ``i`` after release could observe another thread's bump.
        page = i

    url = 'https://www.7160.com/rentiyishu/list_1_' + str(page) + '.html'
    print('--------------------------------------' + str(page) + '--------------------------------------')
    listing = pq(url, encoding='gbk')

    for item in listing('.news_bom-left li').items():
        Son_link = item.find('a').attr('href')
        if not Son_link:
            # List item without a link: nothing to follow.
            continue

        gallery = pq('https://www.7160.com' + Son_link, encoding='gbk')
        save_to_mongo(_extract_product(gallery))

        # Extract the total page count from the gallery's pager.
        pager_text = gallery('body > div > div.center > div.NEWS > div.picmainer > div.itempage > a:nth-child(1)').text()
        page_count = _parse_page_count(pager_text)
        if page_count is None:
            # No usable page count: treat the gallery as single-page.
            continue

        print('共%d页,开始爬取' % page_count)
        for l in range(2, page_count + 1):
            worker = threading.Thread(target=download_son, args=(Son_link, l))
            try:
                worker.start()
            except RuntimeError as exc:
                # The interpreter could not start another thread; give up on
                # the remaining pages of this gallery but keep crawling.
                print('could not start download thread:', exc)
                break


def main():
    """Start 107 listing workers, each of which claims its own page number."""
    for _ in range(1, 108):
        threading.Thread(target=father_link).start()


if __name__ == '__main__':
    main()