飞道的博客

Python爬虫系列之美团网页美食版块商家数据爬取

468人阅读  评论(0)

Python爬虫系列之美团网页美食版块商家数据爬取

小程序爬虫接单、app爬虫接单、网页爬虫接单、接口定制、网站开发、小程序开发> 点击这里联系我们 <

微信请扫描下方二维码

代码仅供学习交流,请勿用于非法用途

直接上代码

import requests
import base64
import zlib
import time
import re
import json
import urllib.parse as urlparse
from zlip import addip
from tokenparser import getToken



# Proxy-provider API endpoint; must be configured before running.
proxyUrl = ""

# Shared network settings for every outgoing request.
timeout = 20  # seconds allowed per request
retry = 3     # attempts per URL before giving up

# Browser-like headers so Meituan serves the regular HTML page
# instead of an anti-bot response.
headers = {
    "Referer": "https://www.meituan.com/zhoubianyou/1535535/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0",
}


def getIp(s):
    """Extract the IPv4 address immediately preceding '登录' in *s*.

    The proxy provider's whitelist error message embeds the caller's
    IP right before the character '登录'; pull it out so it can be
    registered via addip().

    Returns the address string, or None when no match is found.
    """
    # Raw string: '\d' inside a plain literal is a deprecated escape
    # in modern Python.
    pat = re.compile(r"(\d+\.\d+\.\d+\.\d+)登录")
    match = pat.search(s)
    return match.group(1) if match else None


def getProxy():
    """Fetch one HTTP proxy from the provider API, retrying until success.

    Handles the provider's "add to whitelist" error by extracting this
    machine's IP from the message and registering it via addip(); any
    other error is printed and retried after a short pause.

    Returns a dict suitable for requests' `proxies=` argument.
    """
    while True:
        proxyResp_ = ""
        try:
            proxyResp = requests.get(proxyUrl)
            proxyResp_ = json.loads(proxyResp.content.decode("utf-8"))
            ip = proxyResp_['data'][0]['IP']
            return {
   "http": ip}
        except Exception as e:
            print(e)
            # BUG FIX: when requests.get itself fails, proxyResp_ is
            # still the initial "" and indexing it with ['msg'] raised
            # TypeError, killing the retry loop. Guard before reading.
            msg = proxyResp_.get('msg', '') if isinstance(proxyResp_, dict) else ''
            if "请在用户中心添加该白名单" in msg:
                ip = getIp(msg)
                addip(ip)
            else:
                print("代理提示:" + str(proxyResp_))
                time.sleep(2)
                continue

# Module-level proxy fetched at import time; note this blocks (and may
# retry forever) until the proxy API returns a usable address.
proxies = getProxy()

def updateProxy():
    """Discard the current module-level proxy and fetch a fresh one."""
    global proxies
    # Rebinds the module-level name read by getHtml()/getPoiDetail().
    proxies = getProxy()


def getHtml(url, header=None, parse=False):
    """GET *url* through the current proxy, retrying up to `retry` times.

    header: optional headers dict; falls back to the module default.
    parse: when True, decode the UTF-8 body as JSON before returning.

    On any failure (network, decode, JSON) the proxy is rotated via
    updateProxy() and the request retried. Returns the body (str or
    parsed object), or None when every attempt fails.
    """
    for _ in range(retry):
        try:
            # Single call site: only the headers argument differed
            # between the two original branches.
            resp = requests.get(
                url,
                headers=headers if header is None else header,
                timeout=timeout,
                proxies=proxies,
            )
            res = resp.content.decode("utf-8")
            return json.loads(res) if parse else res
        except Exception:
            # Blame the proxy for any failure: rotate and retry.
            updateProxy()
    return None  # explicit: all retries exhausted


def getUrlParams(url):
    """Parse *url*'s query string into a flat {key: first_value} dict.

    parse_qs maps each key to a list of values; only the first value
    per key is kept (parse_qs never produces empty lists, so the
    original per-key try/except was dead code).
    """
    query = urlparse.urlparse(url).query
    params = urlparse.parse_qs(query)
    return {key: values[0] for key, values in params.items()}


def getBeforeUri(url):
    """Return the previous-page URL for a '/pnN/' paginated Meituan URL.

    'pn3/' becomes 'pn2/'; for page 1 the 'pn1' marker is simply
    stripped (matching the site's unpaginated first-page URL form).
    Returns None when the URL does not contain exactly one 'pnN/'
    segment.

    Fixes vs. original: raw regex string (deprecated '\\d' escape),
    and removal of the always-true `res is not None` check — findall
    always returns a list.
    """
    pat = re.compile(r"pn(\d+)/")
    res = re.findall(pat, url)
    if len(res) != 1:
        return None
    page = res[0]
    if page == '1':
        return url.replace("pn1", "")
    return url.replace("pn" + page, "pn" + str(int(page) - 1))


def getTotalPage():
    """Query page 1 of the poi-list API and derive the total page count.

    Meituan returns 15 shops per page, so the count is
    ceil(totalCounts / 15). Returns None when the request fails or the
    response does not have the expected layout.
    """
    uu = "https://bj.meituan.com/meishi/api/poi/getPoiList?cityName=%E5%8C%97%E4%BA%AC&cateId=0&areaId=0&sort=&dinnerCountAttrId=&page=1&userId=&uuid=3d72900ed9a144acab76.1600087319.1.0.0&platform=1&partner=126&originUrl=https%3A%2F%2Fbj.meituan.com%2Fmeishi%2Fpn2%2F&riskLevel=1&optimusCode=10"
    # The API requires a _token signature computed over the bare URL.
    uu += "&_token=" + getToken(uu)
    payload = getHtml(uu, None, True)
    try:
        totalCounts = int(payload['data']['totalCounts'])
    except Exception:
        return None
    pages, remainder = divmod(totalCounts, 15)
    return pages if remainder == 0 else pages + 1


def getPoiList(page):
    """Fetch one page of shop summaries from the poi-list API.

    Returns the list under data.poiInfos, or None when the request
    fails or the response layout is unexpected.
    """
    base = "https://bj.meituan.com/meishi/api/poi/getPoiList?cityName=%E5%8C%97%E4%BA%AC&cateId=0&areaId=0&sort=&dinnerCountAttrId=&page=" + str(page) + "&userId=&uuid=3d72900ed9a144acab76.1600087319.1.0.0&platform=1&partner=126&originUrl=https%3A%2F%2Fbj.meituan.com%2Fmeishi%2Fpn2%2F&riskLevel=1&optimusCode=10"
    # _token is signed over the un-tokenised URL, then appended.
    signed = base + "&_token=" + getToken(base)
    payload = getHtml(signed, None, True)
    try:
        return payload['data']['poiInfos']
    except Exception:
        return None


def getPoiDetail(poiid):
    """Fetch one shop's detail page and print its key fields.

    The detail page embeds its data as a JSON blob assigned to
    window._appState; extract and parse it. On any failure the proxy
    is rotated and the fetch retried; gives up silently after `retry`
    attempts.
    """
    detail_url = "https://www.meituan.com/meishi/" + str(poiid) + "/"
    # Hoisted out of the retry loop: both are attempt-invariant.
    app_state_pat = re.compile("window._appState = ({.*?});</script>")
    for _ in range(retry):
        try:
            page_html = getHtml(detail_url)
            data = json.loads(re.findall(app_state_pat, page_html)[0])
            info = data['detailInfo']
            print("店名: %s , 联系电话:%s, 人均:%s, 评分:%s , poiId: %s , 地址:%s " % (
                info['name'],
                info['phone'],
                info['avgPrice'],
                info['avgScore'],
                info['poiId'],
                info['address'],
            ))
            return
        except Exception:
            updateProxy()


def main():
    """Walk every result page and print the detail of each shop."""
    totalPage = getTotalPage()
    # Guard clause: nothing to do when the page count is unknown or zero.
    if not totalPage:
        return
    for page in range(1, totalPage + 1):
        for poi in (getPoiList(page) or []):
            try:
                getPoiDetail(str(poi['poiId']))
            except Exception:
                # Best-effort: one bad shop must not stop the crawl.
                pass


# Standard script entry point: crawl only when run directly, not on import.
if __name__ == '__main__':
    main()


转载:https://blog.csdn.net/qq_41287993/article/details/108611323
查看评论
* 以上用户言论只代表其个人观点,不代表本网站的观点或立场