小言_互联网的博客

网络爬虫爬取拉勾招聘网

470人阅读  评论(0)

网络爬虫爬取拉勾招聘网

  • 搭建好 Python 环境(需要先安装 requests 和 xlwt 两个库)
  • 复制以下代码
# -*- coding: utf-8 -*-
"""
Created on Mon Sep  7 21:44:39 2020

@author: ASUS
"""


import requests
import time
import json
import xlwt

# Workbook and sheet that collect the crawl results; `main` writes the data
# rows into `mysheet` and saves `workbook`.
workbook = xlwt.Workbook(encoding='utf-8')
mysheet = workbook.add_sheet('mysheet')

# Titles of the header row (row 0), listed in column order.
# 'positionId' and 'positionName' appear twice (columns 0/23 and 1/25); the
# repeats are kept so the layout matches the data rows written by `main`.
_HEADER_TITLES = (
    'positionId',
    'positionName',
    'companyId',
    'companyFullName',
    'city',
    'companyLabelList',
    'companyLogo',
    'companyShortName',
    'companySize',
    'createTime',
    'district',
    'education',
    'financeStage',
    'firstType',
    'formatCreateTime',
    'industryField',
    'jobNature',
    'lastLogin',
    'latitude',
    'linestaion',
    'longitude',
    'matchScore',
    'positionAdvantage',
    'positionId',
    'positionLables',
    'positionName',
    'secondType',
    'skillLables',
    'stationname',
    'subwayline',
    'thirdType',
    'workYear',
)

for _column, _title in enumerate(_HEADER_TITLES):
    mysheet.write(0, _column, _title)

# Keys copied from every job record, in worksheet column order.
# 'positionId' and 'positionName' are repeated at columns 23 and 25 so the data
# rows line up with the header row of the sheet.
_RESULT_FIELDS = (
    'positionId',
    'positionName',
    'companyId',
    'companyFullName',
    'city',
    'companyLabelList',
    'companyLogo',
    'companyShortName',
    'companySize',
    'createTime',
    'district',
    'education',
    'financeStage',
    'firstType',
    'formatCreateTime',
    'industryField',
    'jobNature',
    'lastLogin',
    'latitude',
    'linestaion',
    'longitude',
    'matchScore',
    'positionAdvantage',
    'positionId',
    'positionLables',
    'positionName',
    'secondType',
    'skillLables',
    'stationname',
    'subwayline',
    'thirdType',
    'workYear',
)


def _cell_value(value):
    """Return *value* in a form that fits into a single worksheet cell."""
    # Label fields arrive as lists; join them so every label stays readable
    # and separated inside one cell.
    if isinstance(value, (list, tuple)):
        return ','.join(str(item) for item in value)
    return value


def main(kd, pages, row):
    """Fetch job listings for a keyword and store them in the worksheet.

    For every page from 1 to ``pages`` (inclusive) the landing page is
    requested first, then the search form is posted to the AJAX endpoint and
    each record of the JSON answer is written as one row of ``mysheet``.
    The workbook is saved to ``py3.xls`` when the function ends, including
    when a request or the parsing of a response raises, so rows collected
    before the failure are kept.

    Args:
        kd: Search keyword, sent as the ``kd`` form field.
        pages: Number of result pages to fetch, starting at page 1.
        row: Worksheet row index at which the first record is written.

    Raises:
        Any exception raised by the HTTP requests, by JSON decoding or by a
        missing key in the response propagates to the caller after the
        workbook has been saved.
    """
    # Landing page, visited first to obtain cookies for the session.
    url1 = 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput='
    # AJAX endpoint that returns the listings as JSON.
    url = "https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false"
    # Request headers sent with both requests.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer': 'https://www.lagou.com/jobs/list_python?px=default&city=%E5%85%A8%E5%9B%BD',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Host': 'www.lagou.com'
    }

    try:
        # `pages` is a count, so the upper bound must be inclusive.
        for page in range(1, pages + 1):
            data = {
                'first': 'false',
                'pn': page,
                'kd': kd,
            }
            # A fresh session per page; the session sends the cookies it
            # received from the landing page with the POST automatically,
            # and the `with` block releases its connections afterwards.
            with requests.Session() as s:
                s.get(url=url1, headers=headers, timeout=1)
                respon = s.post(url=url, headers=headers, data=data, timeout=3)
            # Pause between pages to keep the request rate low.
            time.sleep(1)
            result = json.loads(respon.text)
            info = result["content"]["positionResult"]["result"]
            print(len(info))
            for j in info:
                for column, field in enumerate(_RESULT_FIELDS):
                    mysheet.write(row, column, _cell_value(j[field]))
                row += 1
    finally:
        # Save even after a failure so already collected rows are not lost.
        workbook.save('py3.xls')
# Ask for the search keyword and the number of pages, then run the crawl.
# Data rows start at row 1 of the sheet.
kd = input('输入关键字:')
pages = int(input('输入要爬取多少页:'))
main(kd=kd, pages=pages, row=1)
# Sample of the JSON response (abridged):
# {"resubmitToken":null,"requestId":null,"msg":null,"success":true,"content":{"hrInfoMap":{"6187967":{"userId":11765418,"phone":null,"positionName":"招聘经理",........."pageSize":15},"code":0}

  • 第一步,输入搜索关键字,比如 java、python 等。
  • 第二步,输入需要爬取的页数。
  • 结果:爬取到的职位信息会保存到当前目录下的 py3.xls 文件中。

转载:https://blog.csdn.net/weixin_45019934/article/details/108457709
查看评论
* 以上用户言论只代表其个人观点,不代表本网站的观点或立场