Python爬虫入门

mac2025-02-06  30

爬虫

requests

get

# Basic GET request: fetch a page and decode the raw response bytes as UTF-8.
url = ''  # target URL (left empty in these notes)
# verify=False turns off TLS certificate verification for this request.
resp = requests.get(url, verify=False)
# Decode the body explicitly as UTF-8 instead of relying on resp.text.
page_source = resp.content.decode('utf-8')

post

# POST a form-encoded payload and parse the response body as JSON.
url = ''  # target URL (left empty in these notes)
data = { '': '' }  # form fields to submit (placeholder key/value)
header = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'X-Requested-With': 'XMLHttpRequest'
}
# NOTE: `resp.cookies` is evaluated before `resp` is rebound, so `resp` must
# already hold an earlier response whose cookies are forwarded here.
resp = requests.post(url, headers=header, data=data, cookies=resp.cookies, verify=False)
# Requires the standard-library `json` module to be imported (not shown here).
resp_json = json.loads(resp.content.decode('utf-8'))

session

# Issue two POST requests through one shared requests.Session.
ck = 'a=b'  # cookie string in "name=value" form
session = requests.Session()
# str_to_cookie converts the cookie string into a dict for the `cookies` argument.
# `url` and `data` are expected to be defined beforehand (not shown in this snippet).
resp = session.post(url, data=data, cookies=str_to_cookie(ck), verify=False)
# The second request passes no cookies explicitly; presumably it relies on the
# session carrying over cookies set by the first response — TODO confirm.
# `url2` is not defined in this snippet.
resp2 = session.post(url2, data=data, verify=False)

cookie工具类

def cookie_to_str(cookies):
    """Serialize cookie objects into a ``name=value;`` string.

    Args:
        cookies: Iterable of objects exposing string ``name`` and ``value``
            attributes.

    Returns:
        Every cookie rendered as ``name=value;`` and concatenated in
        iteration order; an empty string when there are no cookies.
    """
    return ''.join(c.name + '=' + c.value + ';' for c in cookies)


def dict_to_str(cookies):
    """Serialize a cookie dict into a ``name=value;`` string.

    Args:
        cookies: Mapping of cookie name to cookie value (both strings).

    Returns:
        Every item rendered as ``name=value;`` and concatenated in mapping
        order; an empty string for an empty mapping.
    """
    return ''.join(name + '=' + value + ';' for name, value in cookies.items())


def str_to_cookie(cookies):
    """Parse a ``name=value;name2=value2`` cookie string into a dict.

    Args:
        cookies: Cookie string with pairs separated by ``;``.

    Returns:
        Dict mapping cookie names to values. Segments without an ``=`` are
        skipped.
    """
    parsed = {}
    for segment in cookies.split(';'):
        # Strip so that the conventional "a=b; c=d" form does not produce
        # keys with a leading space.
        # Split only on the first '=' so values that themselves contain '='
        # (e.g. base64 padding) are kept instead of being dropped.
        name, sep, value = segment.strip().partition('=')
        if sep:
            parsed[name] = value
    return parsed

xpath

pip install lxml

# Parse the fetched HTML text and print the text of every link matched by an
# absolute XPath.
# `etree` is presumably `from lxml import etree` (import not shown here);
# `page_source` is the decoded HTML text obtained from an earlier request.
html = etree.HTML(page_source)
# Absolute path from the document root down to the <a> elements.
html_data = html.xpath('/html/body/div/ul/li/a')
for i in html_data:
    # .text holds the text directly inside the matched element.
    print(i.text)

`//`:相对路径,从文档任意层级开始匹配节点;`/`:绝对路径,从根节点开始逐级匹配。

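例子:
- `.xpath('//li/a/text()')`:取所有 li 下 a 标签的文本
- `.xpath('//li/a//@href')`:取 li 下 a 标签(及其后代节点)的 href 属性
- `.xpath('//li/a[@href="link.html"]')`:取 href 等于 "link.html" 的 a 标签
- `.xpath('//li[last()]/a/text()')`:取最后一个 li 下 a 标签的文本
- `.xpath('//a[contains(@href, "link")]')`:取 href 中包含 "link" 的 a 标签
- `.xpath('//a[re:test(@id, "i\d+")]/text()')`:取 id 匹配正则 `i\d+` 的 a 标签的文本(在 lxml 中使用 `re:test` 需要额外传入 `namespaces={'re': 'http://exslt.org/regular-expressions'}`)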

最新回复(0)