小白一枚,作业需要,就搞了一个粗陋的版本出来。
基本功能没有问题,只是没搞明白为什么每页第一条新闻都会重复一次,就先暴力删除了…
话不多说,直接贴代码:
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

# Site root; the relative article links found on listing pages are appended to it.
SITE_ROOT = 'https://www.***.edu.cn'
# Listing pages are INDEX_PREFIX + suffix + INDEX_SUFFIX, where the suffix is
# '' for the first page and '_1', '_2', ... for the following ones.
INDEX_PREFIX = 'https://www.***.edu.cn/publish/***news/****/index'
INDEX_SUFFIX = '.html'
# Number of listing pages to walk (index.html plus index_1 .. index_11).
PAGE_COUNT = 12
# Characters of the article href that are stored in the "date" column.
# NOTE(review): presumably an 8-digit date embedded in the article path;
# confirm against a real link before changing the site prefix.
DATE_SLICE = slice(27, 35)
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10
OUTPUT_FILE = "2-2_data.xlsx"


def insertOne(value1, value2, sheet):
    """Append one two-column row to a worksheet.

    Args:
        value1: Value for the first column.
        value2: Value for the second column.
        sheet: Object with an ``append(row)`` method, e.g. an openpyxl
            worksheet.
    """
    sheet.append([value1, value2])


def _fetch_soup(url):
    """Download *url* and parse it as UTF-8 HTML.

    Returns the parsed document, or None when the request fails, times out
    or the server answers with an HTTP error status. Failures are reported
    on stdout so one bad page does not abort the whole run.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('request failed, skipped:', url, exc)
        return None
    response.encoding = 'UTF-8'
    return BeautifulSoup(response.text, 'html.parser')


def _first_link(node):
    """Return the href of the first link inside the node's first figcaption.

    Returns None when the node has no figcaption or the figcaption holds no
    anchor with an href attribute.
    """
    captions = node.select('figcaption')
    if not captions:
        return None
    anchors = captions[0].select('a[href]')
    if not anchors:
        return None
    return anchors[0]['href']


def _article_title(soup):
    """Return the text of the first <h1>, else the first <h2>, else None."""
    for tag in ('h1', 'h2'):
        headings = soup.select(tag)
        if headings:
            return headings[0].text
    return None


def main():
    """Scrape the news listing pages and save (date, title) rows to Excel.

    For every listing page, each ``.clearfix`` node that contains a
    figcaption link is followed to its article page; the article heading
    and a slice of the link are written as one row of sheet "sheet1", and
    the workbook is saved to OUTPUT_FILE at the end.
    """
    book = Workbook()
    # Inserted at index 0, ahead of the default sheet Workbook() creates.
    sheet = book.create_sheet("sheet1", 0)

    # Links already handled. Several '.clearfix' nodes can lead to the same
    # article (presumably a wrapper node also matches and its first
    # figcaption is the first article), which recorded the first item of
    # every page twice.
    seen_links = set()

    for page in range(PAGE_COUNT):
        suffix = '' if page == 0 else '_' + str(page)
        listing = _fetch_soup(INDEX_PREFIX + suffix + INDEX_SUFFIX)
        if listing is None:
            continue

        for node in listing.select('.clearfix'):
            href = _first_link(node)
            if href is None or href in seen_links:
                continue
            seen_links.add(href)

            article = _fetch_soup(SITE_ROOT + href)
            if article is None:
                continue
            title = _article_title(article)
            if title is None:
                print('no title found, skipped:', href)
                continue

            date = href[DATE_SLICE]
            insertOne(date, title, sheet)
            print(title, date)

    # Save the collected rows to the .xlsx file.
    book.save(OUTPUT_FILE)


if __name__ == "__main__":
    main()
