# -*-coding:utf-8-*-
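'''
FileName: Lagou job-posting scraper
CreateTime: 2018-4-10
Author: ___dx___
FileDescription: Fetches job postings from lagou.com and writes the matching rows to an .xls workbook.
'''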
import requests
import xlwt
import ssl
# Replace the ssl module's default HTTPS context factory with the unverified
# one, so code relying on that default skips certificate verification.
# NOTE(review): this is a module-level patch of private ssl attributes and
# weakens TLS for the whole process; whether `requests` is affected by it is
# not visible in this file -- confirm before relying on it.
ssl._create_default_https_context = ssl._create_unverified_context
class Lagou_job(object):
    """Scrape job postings from Lagou's position-search endpoint into Excel.

    ``getJobList`` fetches one result page as a list of dicts; ``saveExcel``
    walks the requested pages, keeps the full-time postings located in
    Shenzhen and writes them to an ``.xls`` workbook.
    """

    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 10

    # (column title, key in the job dict), in spreadsheet column order.
    _COLUMNS = (
        (u'公司全名', 'companyFullName'),
        (u'公司简称', 'companyShortName'),
        (u'城市', 'city'),
        (u'区域', 'district'),
        (u'工作性质', 'jobNature'),
        (u'职位名称', 'positionName'),
        (u'薪资范围', 'salary'),
        (u'职位', 'secondType'),
        (u'工作年限', 'workYear'),
        (u'公司规模', 'companySize'),
        (u'学历要求', 'education'),
    )

    def __init__(self):
        """Set the endpoint URL and the request headers sent with every POST."""
        self.url = 'https://www.lagou.com/jobs/positionAjax.json?px=new&needAddtionalResult=false'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
            # Header values must be encodable as latin-1, so the non-ASCII
            # query values are percent-encoded (UTF-8) instead of sent raw.
            'Referer': 'https://www.lagou.com/jobs/list_%E6%B5%8B%E8%AF%95?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Origin': 'https://www.lagou.com',
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': 'None',
            'X-Requested-With': 'XMLHttpRequest',
        }

    def getJobList(self, page):
        """Fetch one page of search results.

        Args:
            page: Page number sent as the ``pn`` form field.

        Returns:
            The list found at ``content.positionResult.result`` in the
            decoded JSON response.

        Raises:
            KeyError: If the decoded response lacks that path.
            Errors raised by ``requests`` (connection problems, timeout,
            undecodable body) propagate unchanged.
        """
        # Kept on the instance, as before, so the last form payload stays inspectable.
        self.data = {
            'first': 'true',
            'pn': page,
            'kd': '测试',
        }
        # The context manager releases the session's connections when done.
        with requests.Session() as session:
            res = session.post(
                self.url,
                data=self.data,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            result = res.json()
        # Debug output: raw payload and HTTP status.
        print(result)
        print(res.status_code)
        return result['content']['positionResult']['result']

    @staticmethod
    def _is_wanted(job):
        """Return True for a full-time posting located in Shenzhen."""
        # A None field is treated as empty text rather than raising TypeError.
        job_nature = job['jobNature'] or u''
        city = job['city'] or u''
        return u'全职' in job_nature and u'深圳' in city

    def saveExcel(self, pages=(1,), filename="深圳测试_By_dx.xls"):
        """Write the matching postings of the given pages to an ``.xls`` file.

        Args:
            pages: Iterable of page numbers to fetch. Defaults to page 1
                only; pass e.g. ``range(1, 100)`` for the first 99 pages.
            filename: Path of the workbook to write.
        """
        workbook = xlwt.Workbook()
        # Writing the same cell twice normally raises
        # "Attempt to overwrite cell"; cell_overwrite_ok=True allows it.
        sheet = workbook.add_sheet('daixiang', cell_overwrite_ok=True)

        # Header row.
        for col, (title, _key) in enumerate(self._COLUMNS):
            sheet.write(0, col, title)

        row = 1
        for page in pages:
            for job in self.getJobList(page=page):
                if not self._is_wanted(job):
                    continue
                for col, (_title, key) in enumerate(self._COLUMNS):
                    sheet.write(row, col, job[key])
                row += 1
                # Progress output for every saved posting.
                print(job['companyShortName'], job['salary'])

        workbook.save(filename)
if __name__ == '__main__':
    # Script entry point: build the scraper and export the filtered postings.
    # (For a quick look at a single page, call Lagou_job().getJobList(1).)
    Lagou_job().saveExcel()
# Reposted from: https://www.cnblogs.com/jsondai/p/11393056.html
# Related resources: Python crawler for Lagou