小言_互联网的博客

selenium无登录状态爬取Boss直聘

275人阅读  评论(0)

BOSS是我很早就实现数据爬取的网站,那会直接用request即可,最近再次尝试以前的代码发现,它做了一些反爬处理,当你直接访问例如https://www.zhipin.com/c101210100/b_西湖区/?query=数据分析杭州这样的网址,会进行一个二次跳转,就算获取跳转后的网址再访问也是不行的,因为它的cookies里有一个_zp_stoken_,是js加密生成的,尽力一番之后宣告难以破解,直接祭出大杀器selenium+随机user-agent,轻松解决

思路就是用selenium进入要爬取的页面,通过随机的user-agent高效爬取数据,我这边随机的user-agent是保存在本地文件headers.csv中的,有需要可以找我要,大概有2万个,那么就上代码了

# -*- coding: utf-8 -*-
import json
import os
import re
from urllib.parse import urlencode
import fake_useragent
from scrapy.selector import Selector
import requests
import time
from lxml import etree
from selenium import webdriver
import pandas as pd



# 方法二,从本地文件夹获取
location = os.getcwd() + 'headers.csv'
ka = fake_useragent.UserAgent(path=location, verify_ssl=False, use_cache_server=False)

# 构造请求头User-Agent
headers = {
		'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
		'accept-encoding':'gzip, deflate, br',
		'accept-language':'zh-CN,zh;q=0.9',
		'cache - control': 'max - age = 0',
		'referer':'https://www.zhipin.com/',
		'sec-fetch-mode':'navigate',
		'sec-fetch-site':'same-origin',
		'sec-fetch-user':'?1',
		'upgrade-insecure-requests':'1',
		'user-agent': ka.random,
		'X-Requested-With': 'XMLHttpRequest'
		}
#用于存放所有数据
data_my = []
#这是请求岗位职责的地址相同部分
get_url = 'https://www.zhipin.com/wapi/zpgeek/view/job/card.json?'


def main():


	area_list ={'西湖区','余杭区','滨江区','江干区','萧山区','拱墅区','下城区','上城区'}

	chromedriver_path = 'C:/Users/machenike/Anaconda3/Scripts/chromedriver.exe'
	# 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
	options = webdriver.ChromeOptions()
	options.add_experimental_option('excludeSwitches', ['enable-automation'])
	driver = webdriver.Chrome(executable_path=chromedriver_path, options=options)
	driver.maximize_window()

	for area in area_list:

		loginurl = 'https://www.zhipin.com/c101210100/b_' +area+'/?query=数据分析杭州'
		driver.get(loginurl)
		time.sleep(3.5)
		# Selenium为我们提供了get_cookies来获取登录cookies
		cookies = driver.get_cookies()
		jsonCookies = json.dumps(cookies)
		# 把cookies保存在本地
		with open('bossCookies.json', 'w') as f:
			f.write(jsonCookies)

		#获取信息
		get_detail(driver,area)
	# 写入本地CSV文件
	df = pd.DataFrame(data_my)
	df.to_csv('./shuju.csv', index=None, encoding='utf-8-sig', mode='a')
	time.sleep(0.5)
	print('已保存该数据到本地shuju.csv文件夹')
	driver.close()

def get_detail(driver,area):
	source = etree.HTML(driver.page_source)
	node_list =source.xpath("//div[@class='job-primary']")
	# 用来存储所有的item字段\

	for node in node_list:
		item = {}
		item['链接'] = node.xpath("./div[@class='info-primary']//a/@href")[0]
		item['职位'] = node.xpath("./div[@class='info-primary']//a/div[@class='job-title']")[0].text
		item['薪资'] = node.xpath("./div[@class='info-primary']//a/span")[0].text
		item['工作地点'] = area
		item['工作经验'] = node.xpath("./div[@class='info-primary']//p/text()[2]")[0]
		item['公司名称'] = node.xpath("./div[@class='info-company']//a")[0].text
		item['所处行业'] = node.xpath("./div[@class='info-company']/div[@class='company-text']/p")[0].text
		item['融资轮'] = node.xpath("./div[@class='info-company']/div[@class='company-text']/p//text()[2]")[0]

		item['jid'] = node.xpath(".//div[@class='info-primary']/h3/a/@data-jid")[0]
		item['lid'] = node.xpath(".//div[@class='info-primary']/h3/a/@data-lid")[0]
		ajson = get_info(item['jid'], item['lid'])
		item['岗位职责'] = get_json(ajson)
		print(item)
		data_my.append(item)

	#翻页
	if  source.xpath('//*[@id="main"]/div/div[3]/div[3]//a[@class="next"]'):
		next_page=driver.find_element_by_xpath('//*[@id="main"]/div/div[3]/div[3]//a[@class="next"]')
		driver.execute_script("arguments[0].click();", next_page)
		time.sleep(3.5)
		# Selenium为我们提供了get_cookies来获取登录cookies
		cookies = driver.get_cookies()
		jsonCookies = json.dumps(cookies)
		# 把cookies保存在本地
		with open('bossCookies.json', 'w') as f:
			f.write(jsonCookies)
		get_detail(driver,area)

def get_info(jid, lid):
	params = {
		'jid': jid,
		'lid': lid
	}

	# 获取cookies
	with open('bossCookies.json', 'r', encoding='utf-8') as f:
		listcookies = json.loads(f.read())

	# 把获取的cookies处理成dict类型
	cookies_dict = dict()
	for cookie in listcookies:
		# 在保存成dict时,只要cookies中的name和value
		cookies_dict[cookie['name']] = cookie['value']

	requests.adapters.DEFAULT_RETRIES = 5
	s = requests.session()
	# 关闭多余进程
	s.keep_alive = False
	#请求ajax获取岗位职责
	re = requests.get(get_url + urlencode(params), headers=headers, cookies=cookies_dict)
	time.sleep(0.2)
	if re.status_code == 200:
		vjson = re.json()
		return vjson
	else:
		print("获取失败")


def get_json(js):
	#处理字符串,由于返回的岗位职责是一个包含html的json数据,需要处理一下
	if js:
		json_content = js.get('zpData').get('html')
		content = Selector(text=json_content)
		content_text = content.css(".detail-bottom-text::text").re("[\u4e00-\u9fa5_a-zA-Z0-9]+")
		return content_text
	else:
		print("未获取数据")

if __name__ == '__main__':
	main()

	print("结束---------------------------------")

最后是这样的


转载:https://blog.csdn.net/weixin_42195144/article/details/101377386
查看评论
* 以上用户言论只代表其个人观点,不代表本网站的观点或立场