中国土地市场网
1.分析请求过程
在控制台中获取相应的参数
2.分析js并修改参数
/**
 * Encode a string as the concatenated hexadecimal values of its UTF-16 code units.
 *
 * Each code unit is rendered with `toString(16)` and is NOT zero-padded, so a
 * code unit below 0x10 contributes a single hex digit.
 *
 * @param {string} str - Text to encode.
 * @returns {string} Lowercase hex digits of every code unit, in order; "" for an empty string.
 */
function stringToHex(str) {
  const hexParts = [];
  for (let index = 0; index < str.length; index += 1) {
    hexParts.push(str.charCodeAt(index).toString(16));
  }
  return hexParts.join("");
}
/**
 * Build the relative URL that carries the hex-encoded `security_verify_data`
 * query parameter.
 *
 * @param {?string} text - Value to hex-encode. When it is null or undefined,
 *   the fixed string "1920,1080" is encoded instead.
 * @returns {string} "/default.aspx?tabid=226&security_verify_data=" followed by
 *   the output of stringToHex for the chosen value.
 */
function YunSuoAutoJump(text) {
  const defaultScreen = ["1920", "1080"].join(",");
  // `== null` deliberately matches both null and undefined.
  const screenData = text == null ? defaultScreen : text;
  return "/default.aspx?tabid=226&security_verify_data=" + stringToHex(screenData);
}
// CLI entry point: everything after `node <script>` is taken off process.argv,
// and the first remaining argument (undefined when none was given) is passed
// to YunSuoAutoJump; the resulting path is printed on stdout.
const cliArgs = process.argv.splice(2);
const [firstArg] = cliArgs;
console.log(YunSuoAutoJump(firstArg));
3.构建相应的爬虫代码
# -*- coding: utf-8 -*-
# @Time : 2019/11/1 9:45
# @Author :
import os
import re
import requests
from lxml
import etree
def generate_signature(value):
    """
    Run the ``landchina.js`` helper with Node.js and return its first output line.

    The script is looked up in the directory that contains this file and is
    started without a shell, so ``value`` reaches it as one literal argument
    (characters such as ``&``, ``;`` or spaces are not interpreted).

    :param value: text handed to the script as its single command-line
        argument; an empty or whitespace-only value runs the script with no
        argument at all
    :return: first line printed by the script, including its trailing newline
    :raises IndexError: if the script printed nothing (for example because it
        failed to start correctly)
    """
    import subprocess  # local import keeps this block self-contained

    # abspath first: with a relative __file__, dirname() can be "" and an
    # empty working directory must not be used.
    script_dir = os.path.dirname(os.path.abspath(__file__))

    command = ['node', 'landchina.js']
    argument = str(value)
    if argument.strip():
        # Callers pass "" to mean "no argument"; keep that contract.
        command.append(argument)

    completed = subprocess.run(
        command,
        cwd=script_dir,
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )

    first_line, newline, _rest = completed.stdout.partition('\n')
    if not (first_line or newline):
        raise IndexError('node landchina.js produced no output')
    return first_line + newline
def landchina():
    """
    Fetch the landchina.com listing page (tabid=226) through its three-request
    cookie verification and print the title and URL of every listed entry.

    Requests made, in order:

    1. GET the listing URL and read ``security_session_verify`` from the
       ``Set-Cookie`` response header.
    2. GET the listing URL with the ``security_verify_data`` query appended,
       sending that cookie plus ``srcurl``, and read
       ``security_session_mid_verify`` from ``Set-Cookie``.
    3. GET the listing URL again with both cookies and parse the result table.

    :return: None; each entry is printed as a dict with "title" and "c_url"
    :raises KeyError: if a response has no ``Set-Cookie`` header
    """
    accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3'
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
    url = "https://www.landchina.com/default.aspx?tabid=226"

    # Build the session; the context manager closes it when the requests are done.
    with requests.Session() as s:
        s.headers = {
            'Accept': accept,
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'www.landchina.com',
            'Pragma': 'no-cache',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': user_agent,
        }

        # First request
        resp1 = s.get(url=url)
        cookie1 = resp1.headers["Set-Cookie"]
        # Raw string: "\w" is a regex class, not a Python string escape.
        cookie_re = r"(security_session_verify=\w+;)"
        security_session_verify = "".join(re.findall(cookie_re, cookie1))

        # The helper hex-encodes the listing URL; only the part after the
        # last "=" of its output is used as the srcurl cookie value.
        security_session_verify_url = generate_signature(url).strip()
        srcurl = security_session_verify_url[security_session_verify_url.rfind("=") + 1:]

        s.headers = {
            'Accept': accept,
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Cookie': security_session_verify + " srcurl=" + srcurl,
            'Host': 'www.landchina.com',
            'Pragma': 'no-cache',
            'Referer': url,
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': user_agent,
        }

        # With an empty argument the helper falls back to its built-in value;
        # ".*" stops at the newline, so the match carries no trailing "\n".
        var_re = "(security_verify_data.*)"
        var = "".join(re.findall(var_re, generate_signature("")))

        # Second request
        url2 = url + "&" + var
        resp2 = s.get(url2)
        cookie2 = resp2.headers["Set-Cookie"]
        cookie_re = r"(security_session_mid_verify=\w+;)"
        security_session_mid_verify = "".join(re.findall(cookie_re, cookie2))

        s.headers = {
            'Accept': accept,
            # NOTE(review): advertising 'br' assumes a brotli decoder is
            # available to requests — confirm.
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            # NOTE(review): both captured values already end with ";", so this
            # header contains ";; " between them — kept as-is, confirm intent.
            'Cookie': '%s; %s' % (security_session_verify, security_session_mid_verify),
            'Host': 'www.landchina.com',
            'Pragma': 'no-cache',
            'Referer': url2,
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': user_agent,
        }

        # Third request
        rep = s.get(url=url)

    mytree = etree.HTML(rep.text)
    urls = mytree.xpath('//*[@id="TAB_contentTable"]//tr/td[@class="queryCellBordy"]/a')
    for ur in urls:
        title = "".join(ur.xpath(".//text()"))
        c_url = "https://www.landchina.com/" + "".join(ur.xpath("./@href"))
        data = {
            "title": title,
            "c_url": c_url,
        }
        print(data)
# Run the crawl only when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    landchina()