import requests,re,json,time,random
from lxml import etree
from fake_useragent import UserAgent
ua = UserAgent()  # NOTE(review): never used below — request_html hard-codes its own User-Agent string; confirm before removing
def request_html(url):
    """GET *url* with browser-like headers and return the decoded HTML as str.

    The Cookie header is a captured guazi.com session value; guazi.com serves
    an anti-bot page to cookieless clients, so the cookie must be refreshed
    manually once it expires.

    :param url: absolute URL on www.guazi.com
    :return: response body decoded as UTF-8
    :raises requests.HTTPError: on non-2xx responses
    :raises requests.Timeout: if the server does not answer within 10s
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'uuid=f50f4be3-0e11-4e0d-d7e2-ad282bb42715; clueSourceCode=10103000312%2300; ganji_uuid=9459142464668822758037; sessionid=4a3ab0f9-0dba-4152-a135-1216a0b42892; lg=1; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A58880429608%7D; user_city_id=73; cityDomain=yancheng; antipas=UL3U7i501f7530734474D9817; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22pz_baidu%22%2C%22ca_n%22%3A%22tbmkbturl%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%2210103000312%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22f50f4be3-0e11-4e0d-d7e2-ad282bb42715%22%2C%22ca_city%22%3A%22zz%22%2C%22sessionid%22%3A%224a3ab0f9-0dba-4152-a135-1216a0b42892%22%7D; preTime=%7B%22last%22%3A1568901802%2C%22this%22%3A1568881044%2C%22pre%22%3A1568881044%7D',
        'Host': 'www.guazi.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
    # timeout so a stalled connection cannot hang the crawl forever;
    # raise_for_status surfaces HTTP errors instead of silently parsing
    # an error page downstream.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    return resp.content.decode('utf-8')
# ---------------------------------------------------------------------------
# Crawl guazi.com used-car listings: the landing page embeds the full city
# list as JSON ("domain":"<city-slug>"); for every city fetch its brand
# filter links, and for every brand scrape listing page 1 into guazi1.json.
# ---------------------------------------------------------------------------

# First request: fetch the city list from the landing page source.
url = 'https://www.guazi.com/yancheng/buy'
response = request_html(url)
lis1 = []
tree = etree.HTML(response)
city = re.compile(r'"domain":"(.*?)"', re.S)
city_list = city.findall(response)
for i in city_list:
    # Listing page of one specific city.
    city_url = 'https://www.guazi.com/{}/buy'.format(i)
    response2 = request_html(city_url)
    tree2 = etree.HTML(response2)
    # All brand links in the city's brand filter bar.
    brand_list = tree2.xpath('.//div[@class="dd-all clearfix js-brand js-option-hid-info"]//a/@href')
    for j in brand_list:
        brand_url1 = 'https://www.guazi.com' + j
        # Only page 1 per brand; widen the range to scrape more pages.
        for k in range(1, 2):
            # e.g. https://www.guazi.com/yancheng/benz/o1/
            # BUG FIX: the original used rstrip('#bread'), which strips any
            # trailing characters in the set {#,b,r,e,a,d} rather than the
            # literal '#bread' suffix — remove the exact fragment instead.
            if brand_url1.endswith('#bread'):
                base_url = brand_url1[:-len('#bread')]
            else:
                base_url = brand_url1
            brand_url = base_url + 'o{}/'.format(k)
            response3 = request_html(brand_url)
            tree3 = etree.HTML(response3)
            # One <li> per car listing on the page.
            car_list = tree3.xpath('.//ul[@class="carlist clearfix js-top"]/li')
            lis2 = []
            for car in car_list:
                dic = {}
                # 1. thumbnail image URL
                car_img_list = car.xpath('.//img/@src')
                dic['car_img'] = car_img_list[0]
                # 2. listing title
                car_title_list = car.xpath('.//h2[@class="t"]/text()')
                dic['car_title'] = car_title_list[0]
                # 3. registration year, mileage, service tag
                #    (assumes the t-i div always yields 3 text nodes — an
                #    IndexError here means the page layout changed)
                car_year_list = car.xpath('.//div[@class="t-i"]/text()')
                dic['car_year'] = car_year_list[0]
                dic['car_km'] = car_year_list[1]
                dic['car_sever'] = car_year_list[2]
                # 4. current price (unit: 万 = 10,000 CNY)
                car_price_list = car.xpath('.//div[@class="t-price"]/p/text()')
                dic['car_price'] = car_price_list[0] + '万'
                # 5. original price (absent on some listings)
                car_oprice_list = car.xpath('.//div[@class="t-price"]/em/text()')
                if car_oprice_list:
                    dic['car_oprice'] = car_oprice_list[0]
                else:
                    dic['car_oprice'] = 'None'
                # 6. subsidy amount (absent on most listings)
                car_bprice_list = car.xpath('.//em[@class="icon-sale"]/span/text()')
                if car_bprice_list:
                    dic['car_bprice'] = car_bprice_list[0] + '元'
                else:
                    dic['car_bprice'] = 'None'
                # 7. promotional tags, comma-joined
                car_tag_list = car.xpath('.//div[@class="t-price"]/i/text()')
                dic['car_tag'] = ','.join(car_tag_list)
                # 8. detail-page link
                # BUG FIX: original xpath was './a/@hre吗f' — a stray Chinese
                # character inside the attribute name made it never match;
                # the correct attribute is @href.
                car_detail_url_list = car.xpath('./a/@href')
                dic['car_detail_url'] = car_detail_url_list[0]
                lis2.append(dic)
            lis1 = lis1 + lis2

# NOTE(review): mode 'a' appends one more JSON array on every run, which makes
# the file invalid JSON after the first run — confirm whether 'w' was intended.
with open('guazi1.json', 'a', encoding='utf-8') as f:
    json.dump(lis1, f, ensure_ascii=False)
# Reposted from: https://blog.csdn.net/weixin_42766128/article/details/101305476