Target pages: https://sports.163.com/{i}/ for i in ['nba', 'cba', 'china']
"""Scrape NetEase (163.com) sports news articles and save each one as a text file."""
import os
import re

import requests

# Output directory for the scraped articles ("网易新闻" = "NetEase News").
if not os.path.exists('网易新闻'):
    os.mkdir('网易新闻')

count = 0
for section in ['nba', 'cba', 'china']:
    # Fetch the section front page and collect every linked article URL.
    # Timeout added so a stalled connection cannot hang the whole run.
    response = requests.get(f'https://sports.163.com/{section}/', timeout=10)
    data = response.text
    # set() de-duplicates links that appear more than once on the page.
    article_urls = set(re.findall(r'href="(https://sports.163.com/.*?)"', data))

    # Process each article URL individually.
    for url in article_urls:
        url_data = requests.get(url, timeout=10).text
        try:
            title = re.findall(r'<h1>(.*?)</h1>', url_data, re.S)[0]
            news_res = re.findall(
                r'<div class="post_text" id="endText" style="border-top:1px solid #ddd;">'
                r'(.*?责任编辑:.*?)</span>',
                url_data,
                re.S,
            )[0]
        except IndexError:
            # Page does not match the expected article layout — skip it.
            # (findall(...)[0] raises IndexError when the pattern finds nothing.)
            continue
        # Strip punctuation and whitespace so the title is a safe filename.
        title = re.sub(r'[!"#$%&()*+,-./:;<=>?@[\]^_‘{|}~,…]|\s', '', title)
        title_path = os.path.join('网易新闻', f'{title}.txt')
        # Context manager guarantees the file is flushed and closed even on error.
        with open(title_path, 'w', encoding='utf8') as f:
            f.write(news_res)
        count += 1
        print(f'完成{count}篇, {title} done...')
相关资源:新浪网最新爬虫