# 小言_互联网的博客
#
# 爬虫-瓜子二手车 (crawler: Guazi.com used cars)
#
# 363人阅读  评论(0)
import requests,re,json,time,random
from lxml import etree
from fake_useragent import UserAgent
ua = UserAgent()

def request_html(url):
    """Fetch *url* with browser-like headers and return the page as a str.

    Parameters
    ----------
    url : str
        Full guazi.com URL to request.

    Returns
    -------
    str
        Response body decoded as UTF-8.

    Raises
    ------
    requests.RequestException
        On network failure or timeout.
    """
    # NOTE(review): the session Cookie below is a captured login/anti-bot
    # token and will expire; refresh it when requests start failing.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'uuid=f50f4be3-0e11-4e0d-d7e2-ad282bb42715; clueSourceCode=10103000312%2300; ganji_uuid=9459142464668822758037; sessionid=4a3ab0f9-0dba-4152-a135-1216a0b42892; lg=1; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A58880429608%7D; user_city_id=73; cityDomain=yancheng; antipas=UL3U7i501f7530734474D9817; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22pz_baidu%22%2C%22ca_n%22%3A%22tbmkbturl%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%2210103000312%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22f50f4be3-0e11-4e0d-d7e2-ad282bb42715%22%2C%22ca_city%22%3A%22zz%22%2C%22sessionid%22%3A%224a3ab0f9-0dba-4152-a135-1216a0b42892%22%7D; preTime=%7B%22last%22%3A1568901802%2C%22this%22%3A1568881044%2C%22pre%22%3A1568881044%7D',
        'Host': 'www.guazi.com',
        'Upgrade-Insecure-Requests': '1',
        # NOTE(review): the file imports fake_useragent (ua) but this fixed
        # UA string is used instead — presumably intentional; confirm.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
    # A timeout keeps the crawler from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)
    return response.content.decode('utf-8')
# First request: the yancheng listing page embeds the full city list as
# "domain":"<slug>" pairs inside an inline JSON blob, which we pull out
# with a regex (no need to parse the HTML for this step).
url = 'https://www.guazi.com/yancheng/buy'
response = request_html(url)

lis1 = []  # all car records accumulated across every city/brand/page
city_pattern = re.compile(r'"domain":"(.*?)"', re.S)
city_list = city_pattern.findall(response)

for city in city_list:
    # Listing page of one specific city.
    city_url = 'https://www.guazi.com/{}/buy'.format(city)
    response2 = request_html(city_url)
    tree2 = etree.HTML(response2)
    # Brand links look like '/yancheng/benz/#bread'.
    brand_list = tree2.xpath('.//div[@class="dd-all clearfix js-brand js-option-hid-info"]//a/@href')
    for brand_href in brand_list:
        # Drop the '#bread' fragment.  The original used rstrip('#bread'),
        # but str.rstrip strips any trailing characters from that SET
        # ({#,b,r,e,a,d}), not the literal suffix — split on '#' instead.
        brand_url_base = 'https://www.guazi.com' + brand_href.split('#')[0]
        # Up to 50 pages exist per brand; range(1, 2) crawls only page 1.
        for page in range(1, 2):
            # e.g. https://www.guazi.com/yancheng/benz/o1/
            brand_url = brand_url_base + 'o{}/'.format(page)
            response3 = request_html(brand_url)
            tree3 = etree.HTML(response3)
            # One <li> per car on the listing page.
            car_list = tree3.xpath('.//ul[@class="carlist clearfix js-top"]/li')
            lis2 = []  # records for this page only
            for car in car_list:
                dic = {}
                # 1. image link
                car_img_list = car.xpath('.//img/@src')
                dic['car_img'] = car_img_list[0]
                # 2. title
                car_title_list = car.xpath('.//h2[@class="t"]/text()')
                dic['car_title'] = car_title_list[0]
                # 3. year, mileage, service tag
                car_year_list = car.xpath('.//div[@class="t-i"]/text()')
                dic['car_year'] = car_year_list[0]
                dic['car_km'] = car_year_list[1]
                dic['car_sever'] = car_year_list[2]
                # 4. current price
                car_price_list = car.xpath('.//div[@class="t-price"]/p/text()')
                dic['car_price'] = car_price_list[0] + '万'
                # 5. original price (not always present)
                car_oprice_list = car.xpath('.//div[@class="t-price"]/em/text()')
                if car_oprice_list:
                    dic['car_oprice'] = car_oprice_list[0]
                else:
                    dic['car_oprice'] = 'None'
                # 6. subsidy amount (not always present)
                car_bprice_list = car.xpath('.//em[@class="icon-sale"]/span/text()')
                if car_bprice_list:
                    dic['car_bprice'] = car_bprice_list[0] + '元'
                else:
                    dic['car_bprice'] = 'None'
                # 7. tags, joined into one comma-separated string
                car_tag_list = car.xpath('.//div[@class="t-price"]/i/text()')
                dic['car_tag'] = ','.join(car_tag_list)
                # 8. detail-page link.  The original XPath was
                # './a/@hre吗f' (stray CJK char) and never matched.
                car_detail_url_list = car.xpath('./a/@href')
                dic['car_detail_url'] = car_detail_url_list[0]
                lis2.append(dic)
            # Merge the page ONCE, after the per-car loop.  The original
            # did `lis1 = lis1 + lis2` inside the loop, re-adding the
            # growing page list for every car (quadratic duplication).
            lis1.extend(lis2)
            # Rewrite the file each page ('w', not 'a') so it always
            # contains exactly one valid JSON array; appending would
            # concatenate multiple documents into unparseable JSON.
            with open('guazi1.json', 'w', encoding='utf-8') as f:
                json.dump(lis1, f, ensure_ascii=False)


# 转载 (source): https://blog.csdn.net/weixin_42766128/article/details/101305476
# 查看评论
# * 以上用户言论只代表其个人观点,不代表本网站的观点或立场