简单地分析一下这个爱拍视频网站。总结起来,就是一个极其不安全的网站:想获取登录态,简单到只要设置 cookie 就可以了,也是很让人惊讶。然后就是正常的爬取流程了。说明一下:建议网站启用 HTTPS,并把登录安全做好。
代码部分: 首先是获取下载链接 #下载爱拍 import requests import time from bs4 import BeautifulSoup AIPAI_SPICE_URL = 'http://home.aipai.com/17899407?action=card&sub=&sort=id&total=844&clicks=5617816&flowers=27350&bookTotal=0&page=' LOGIN_COOKIES = 'cookie部分 这部分需要先登录复制过来即可' cookies2 = dict(map(lambda x: x.split('='), LOGIN_COOKIES.split(";"))) def DownloadTagToTxt(url): res = requests.get(url, cookies=cookies2) bs_html = BeautifulSoup(res.text, 'lxml') h5_list = bs_html.find_all('h5') for h5 in h5_list: a_list = h5.find_all('a') #a_list = bs_html.find_all('h5') for a in a_list: a_str = a.get('href') if a_str.endswith('mp4'): # 打印出a标签 print(a_str) if __name__ == '__main__': for num in range(1,72): cur_str = str(num) AIPAI_URL = AIPAI_SPICE_URL+cur_str DownloadTagToTxt(AIPAI_URL)其次是下载的代码
# Part 2: download each link listed in VideoLink.txt.
import os
import time


def downloadFile(name, url):
    """Stream-download *url* into local file *name*, printing progress
    roughly every two seconds.

    :param name: file name to save as
    :param url: direct download link
    """
    # Imported lazily so the module can be imported (and formatFloat
    # tested) without requests installed.
    import requests

    headers = {'Proxy-Connection': 'keep-alive'}
    # stream=True avoids loading the whole video into memory; the
    # 'with' blocks guarantee both the HTTP connection and the file
    # handle are closed even if the transfer fails mid-way (the
    # original leaked the file handle on any exception).
    with requests.get(url, stream=True, headers=headers) as r:
        # Servers may omit Content-Length; fall back to 0 so the
        # download still proceeds (progress % is then reported as 0).
        length = float(r.headers.get('content-length', 0) or 0)
        count = 0
        count_tmp = 0
        time1 = time.time()
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=512):
                if chunk:
                    f.write(chunk)
                    count += len(chunk)
                    if time.time() - time1 > 2:
                        # Guard against ZeroDivisionError on missing length.
                        p = count / length * 100 if length else 0.0
                        # Bytes since last report / 2s window, in MB/s.
                        speed = (count - count_tmp) / 1024 / 1024 / 2
                        count_tmp = count
                        print(name + ': ' + formatFloat(p) + '%' +
                              ' Speed: ' + formatFloat(speed) + 'M/S')
                        time1 = time.time()


def formatFloat(num):
    """Format *num* with exactly two decimal places."""
    return '{:.2f}'.format(num)


if __name__ == '__main__':
    file_name = 'VideoLink.txt'
    for line in open(file_name):
        down_link = line.strip()
        split_list = down_link.split('/')
        # Save as "<parent-segment>_<basename>" to avoid name collisions
        # between videos that share a file name.
        down_link_name = split_list[-2] + '_' + split_list[-1]
        downloadFile(down_link_name, down_link)

# Giser_D, certified blog expert, C/C++ — https://github.com/huifeng-kooboo
# Programming enthusiast: client-side tech, web backends, and crawlers.