Scraping poems from the Gushiwen site (gushiwen.org) with three methods: re, XPath, and BeautifulSoup


Timing re, XPath, and bs4 against the same page, re turns out to be nearly 10x faster than XPath, and XPath nearly 10x faster than bs4. So if you are chasing maximum parsing speed, regular expressions clearly matter.
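A ranking like this is easy to sanity-check yourself. Below is a minimal timing sketch, assuming you have saved one listing page locally as page.html (a hypothetical filename) so network latency does not skew the numbers; absolute times will vary by machine, but the relative order should hold.

import timeit
import re
from lxml import etree
from bs4 import BeautifulSoup

# page.html is an assumed local copy of one listing page, so the timing
# measures parsing only, not the network.
with open('page.html', encoding='utf-8') as f:
    text = f.read()

def parse_re():
    return re.findall(r'<div class="contson".*?>(.*?)</div>', text, re.S)

def parse_xpath():
    return etree.HTML(text).xpath('//div[contains(@id,"contson")]')

def parse_bs4():
    return BeautifulSoup(text, 'lxml').find_all('div', class_='contson')

for fn in (parse_re, parse_xpath, parse_bs4):
    # run each parser 100 times and print the total seconds
    print(fn.__name__, timeit.timeit(fn, number=100))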

1、Parsing with re

# Parse the page elements with regular expressions.
# Key point: target the element that sits in the same position inside every
# item, and let findall pull them all into a list in one pass.
import requests
import re

DATA = []

def getHTMLtext(url, headers, timeout=10):
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()  # note the (): without the call the status check never runs
        resp.encoding = 'utf-8'
        return resp.text
    except requests.RequestException:
        return ''

def reParser(text):
    name_list = re.findall(r'<div class="yizhu".*?<b>(.*?)</b>', text, re.S)  # re.S is shorthand for re.DOTALL
    dynasty_list = re.findall(r'<p class="source">.*?target="_blank">(.*?)</a>', text, re.S)
    author_list = re.findall(r'<p class="source">.*?target="_blank">.*?</a>.*?target="_blank">(.*?)</a>', text, re.S)
    raw_content_list = re.findall(r'<div class="contson".*?>(.*?)</div>', text, re.S)
    content_list = []
    for content in raw_content_list:
        temp = re.sub(r'<.*?>', '', content)  # must stay non-greedy, or it would swallow everything between the first '<' and the last '>'
        content_list.append(temp.strip())  # strip surrounding whitespace
    likes_list = re.findall(r'<span> (\d*?)</span>', text, re.S)
    for name, dynasty, author, content, likes in zip(name_list, dynasty_list, author_list, content_list, likes_list):
        poetry_dict = {
            'title': name,
            'dynasty': dynasty,
            'author': author,
            'content': content,
            'likes': likes
        }
        DATA.append(poetry_dict)

def print_poetry(data):
    for every_poetry in data:
        print(every_poetry['title'])
        print(every_poetry['dynasty'] + ':' + every_poetry['author'])
        print(every_poetry['content'])
        print('{} people like this poem'.format(every_poetry['likes']))
        print('\n' + '*' * 50 + '\n')

if __name__ == '__main__':
    raw_url = 'https://www.gushiwen.org/default_{}.aspx'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
    num = input('How many pages to crawl (1-100): ')
    for i in range(int(num)):  # int(), not eval(): never eval raw user input
        url = raw_url.format(i + 1)
        text = getHTMLtext(url, headers)
        if text == '':
            print('url: {} could not be fetched'.format(url))
        else:
            reParser(text)
    DATA.sort(key=lambda x: int(x['likes']), reverse=True)
    TOP10 = DATA[:10]
    print_poetry(TOP10)

2、The XPath version

import requests
from lxml import etree

DATA = []

def getHTMLtext(url, headers, timeout=10):
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
        resp.encoding = 'utf-8'
        return resp.text
    except requests.RequestException:
        return ''

def xpathParser(text):
    htmlElement = etree.HTML(text)  # <class 'lxml.etree._Element'>
    name_list = htmlElement.xpath('/html/body/div[2]/div[1]/div/div[1]/p[1]/a/b/text()')
    dynasty_list = htmlElement.xpath('/html/body/div[2]/div[1]/div/div[1]/p[2]/a[1]/text()')
    author_list = htmlElement.xpath('/html/body/div[2]/div[1]/div/div[1]/p[2]/a[2]/text()')
    content_list = []
    poetries = htmlElement.xpath('//div[@class="contson" and contains(@id,"contson")]')  # a list of lxml.etree._Element objects
    # print(etree.tostring(poetries[0], encoding='utf-8').decode('utf-8'))
    for poetry in poetries:
        raw_content = ''.join(poetry.xpath('.//text()'))  # the leading '.' is essential; without it the query ignores poetry and searches the whole document
        content_list.append(raw_content.replace('\n', ''))
    raw_likes_list = htmlElement.xpath('//a[contains(@id,"agood")]/span/text()')
    likes_list = [int(like.strip()) for like in raw_likes_list]
    for name, dynasty, author, content, likes in zip(name_list, dynasty_list, author_list, content_list, likes_list):
        poetry_dict = {
            'title': name,
            'dynasty': dynasty,
            'author': author,
            'content': content,
            'likes': likes
        }
        DATA.append(poetry_dict)

def print_poetry(data):
    for every_poetry in data:
        print(every_poetry['title'])
        print(every_poetry['dynasty'] + ':' + every_poetry['author'])
        print(every_poetry['content'])
        print('{} people like this poem'.format(every_poetry['likes']))
        print('\n' + '*' * 50 + '\n')

if __name__ == '__main__':
    raw_url = 'https://www.gushiwen.org/default_{}.aspx'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
    num = input('How many pages to crawl (1-100): ')
    for i in range(int(num)):
        url = raw_url.format(i + 1)
        text = getHTMLtext(url, headers)
        if text == '':
            print('url: {} could not be fetched'.format(url))
        else:
            xpathParser(text)
    DATA.sort(key=lambda x: int(x['likes']), reverse=True)
    TOP10 = DATA[:10]
    print_poetry(TOP10)
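One caveat on the XPath version: absolute paths like /html/body/div[2]/... are tied to the exact page layout and break the moment gushiwen.org shuffles a div. A more defensive variant is sketched below, anchoring on class names instead; the class names sons and source come from the structure targeted above, but treat this as an unverified sketch and check them against the live markup.

from lxml import etree

def xpathParserByClass(text):
    # Anchor each poem block on its class, then query title/dynasty/author
    # relative to that block instead of from the document root.
    htmlElement = etree.HTML(text)
    results = []
    for son in htmlElement.xpath('//div[@class="sons"]'):
        names = son.xpath('.//b/text()')                       # poem title
        sources = son.xpath('.//p[@class="source"]/a/text()')  # [dynasty, author]
        if not names or len(sources) < 2:
            continue  # skip blocks that share the class but are not poem entries
        results.append((names[0], sources[0], sources[1]))
    return results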
3、The bs4 version

# Extract from the page with bs4, parsing with find_all first.
import requests
from bs4 import BeautifulSoup

DATA = []

def getHTMLtext(url, headers, timeout=10):
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
        resp.encoding = 'utf-8'
        return resp.text
    except requests.RequestException:
        return ''

def bs4_find_all_Parser(text):
    soup = BeautifulSoup(text, 'lxml')
    sons = soup.find_all('div', class_='sons')[:10]  # a bs4.element.ResultSet whose elements are Tag objects
    # Note: find_all also picks up some unrelated blocks further down the page,
    # so keep only the first 10 items, which are the ones we need.
    for son in sons:
        name = son.find('b').string
        print(name)
        dynasty_author = son.find('p', class_='source').get_text()
        print(dynasty_author)
        content = son.find('div', class_='contson').get_text().strip()
        print(content)
        like = son.find_all('span')[1].string.strip()
        print('Likes: ' + like)
        print('\n' + '*' * 30 + '\n')

if __name__ == '__main__':
    url = 'https://www.gushiwen.org/default_1.aspx'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
    text = getHTMLtext(url, headers)
    if text == '':
        print('url: {} could not be fetched'.format(url))
    else:
        bs4_find_all_Parser(text)
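find_all is not the only way to query with bs4: the same extraction can be written with CSS selectors through soup.select, which many people find easier to read. A minimal sketch using the same class names as above (same caveat: verify them against the live page):

from bs4 import BeautifulSoup

def bs4_select_Parser(text):
    soup = BeautifulSoup(text, 'lxml')
    # 'div.sons' is the CSS-selector equivalent of find_all('div', class_='sons')
    for son in soup.select('div.sons')[:10]:
        title = son.select_one('b')
        source = son.select_one('p.source')
        contson = son.select_one('div.contson')
        if not (title and source and contson):
            continue  # skip blocks that are not poem entries
        print(title.get_text())
        print(source.get_text(strip=True))
        print(contson.get_text().strip())
        print('\n' + '*' * 30 + '\n')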

Reposted from: https://www.cnblogs.com/valorchang/p/11582565.html
