Python爬虫系列之美团网页美食版块商家数据爬取
小程序爬虫接单、app爬虫接单、网页爬虫接单、接口定制、网站开发、小程序开发> 点击这里联系我们 <
微信请扫描下方二维码
代码仅供学习交流,请勿用于非法用途
直接上代码
import requests
import base64
import zlib
import time
import re
import json
import urllib.parse as urlparse
from zlip import addip
from tokenparser import getToken
# Proxy-pool API endpoint. Empty by default — fill in before running;
# getProxy() polls this URL for fresh proxy IPs.
proxyUrl = ""
# Per-request timeout, in seconds.
timeout = 20
# Number of attempts per request before giving up.
retry = 3
# Default request headers: a Meituan referer plus a desktop Chrome UA
# so the site serves the regular HTML pages.
headers = {
    "Referer": "https://www.meituan.com/zhoubianyou/1535535/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0",
}
def getIp(s):
    """Extract the client IP from a proxy-API error message.

    The proxy provider's whitelist error embeds our IP directly before
    the word "登录"; this pulls out that dotted-quad.

    :param s: message string returned by the proxy API
    :return: the IP string, or None when no match is found
    """
    # Raw string avoids the invalid-escape-sequence warning on \d.
    pat = re.compile(r'(\d+\.\d+\.\d+\.\d+)登录')
    match = pat.search(s)
    # Explicit None instead of indexing findall()[0] inside a bare except.
    return match.group(1) if match else None
def getProxy():
    """Block until a usable proxy is obtained from the proxy API.

    Loops forever. If the API complains that our IP is not whitelisted,
    extracts the IP from the error message and whitelists it via
    addip() before retrying; other failures are logged and retried
    after a short pause.

    :return: a requests-style proxies dict, e.g. {"http": "1.2.3.4:80"}
    """
    while True:
        proxyResp_ = ""
        try:
            proxyResp = requests.get(proxyUrl)
            proxyResp_ = json.loads(proxyResp.content.decode("utf-8"))
            ip = proxyResp_['data'][0]['IP']
            return {"http": ip}
        except Exception as e:
            print(e)
            # Bug fix: when the request/JSON parse itself failed,
            # proxyResp_ is still "" and subscripting it with ['msg']
            # raised TypeError, crashing out of the retry loop.
            msg = proxyResp_.get('msg', '') if isinstance(proxyResp_, dict) else ''
            if "请在用户中心添加该白名单" in msg:
                ip = getIp(msg)
                addip(ip)
            else:
                print("代理提示:" + str(proxyResp_))
                time.sleep(2)
# Module-level proxy dict shared by all requests.
# NOTE(review): this performs a blocking network call at import time —
# confirm that is acceptable for every consumer of this module.
proxies = getProxy()
def updateProxy():
    """Replace the module-level ``proxies`` dict with a freshly fetched proxy."""
    global proxies
    proxies = getProxy()
def getHtml(url, header=None, parse=False):
    """GET *url* through the current proxy, retrying with fresh proxies.

    :param url: target URL
    :param header: optional headers dict; the module default is used
        when None
    :param parse: when True, JSON-decode the response body
    :return: the decoded body (str), or the parsed JSON object when
        *parse* is True, or None when all ``retry`` attempts fail
    """
    for _ in range(retry):
        try:
            # Single call site: only the headers choice varies.
            resp = requests.get(
                url,
                headers=headers if header is None else header,
                timeout=timeout,
                proxies=proxies,
            )
            res = resp.content.decode("utf-8")
            return json.loads(res) if parse else res
        except Exception:
            # Any failure (network, decode, JSON) is blamed on the
            # proxy: rotate it and try again.
            updateProxy()
    return None
def getUrlParams(url):
    """Parse *url*'s query string into a flat dict.

    ``parse_qs`` yields a list per key; only the first value of each
    key is kept.

    :param url: any URL, with or without a query string
    :return: dict mapping each query key to its first value
    """
    multi = urlparse.parse_qs(urlparse.urlparse(url).query)
    return {key: values[0] for key, values in multi.items() if values}
def getBeforeUri(url):
    """Return the listing URL for the previous page, or None.

    Meituan listing pages carry the page number as ``.../pnN/``. The
    predecessor of page N is page N-1; page 1's predecessor simply
    drops the ``pnN`` segment.

    :param url: a listing URL expected to contain exactly one "pnN/"
        page marker
    :return: the previous-page URL, or None when zero or multiple
        markers are present
    """
    # Raw string avoids the invalid-escape-sequence warning on \d.
    pat = re.compile(r"pn(\d+)/")
    res = re.findall(pat, url)
    # findall returns a list (never None); require exactly one marker.
    if len(res) != 1:
        return None
    page = res[0]
    if page == '1':
        # Page 1 has no predecessor marker; strip the segment entirely.
        return url.replace("pn1", "")
    return url.replace("pn" + page, "pn" + str(int(page) - 1))
def getTotalPage():
    """Return the number of listing pages (15 POIs per page), or None.

    Fetches page 1 of the Beijing food listing API and reads
    ``data.totalCounts``. Any failure (bad response, missing key,
    non-numeric count) yields None, which main() treats as
    "nothing to crawl".
    """
    uu = "https://bj.meituan.com/meishi/api/poi/getPoiList?cityName=%E5%8C%97%E4%BA%AC&cateId=0&areaId=0&sort=&dinnerCountAttrId=&page=1&userId=&uuid=3d72900ed9a144acab76.1600087319.1.0.0&platform=1&partner=126&originUrl=https%3A%2F%2Fbj.meituan.com%2Fmeishi%2Fpn2%2F&riskLevel=1&optimusCode=10"
    # The API requires a signed _token derived from the full URL.
    token = getToken(uu)
    uu += "&_token=" + token
    res = getHtml(uu, None, True)
    try:
        totalCounts = int(res['data']['totalCounts'])
    except Exception:
        return None
    # Ceiling division: 15 shops per page.
    return (totalCounts + 14) // 15
def getPoiList(page):
    """Fetch one page of POI summaries from the listing API.

    :param page: 1-based page number
    :return: the list under ``data.poiInfos``, or None on any failure
    """
    uu = "https://bj.meituan.com/meishi/api/poi/getPoiList?cityName=%E5%8C%97%E4%BA%AC&cateId=0&areaId=0&sort=&dinnerCountAttrId=&page=" + str(page) + "&userId=&uuid=3d72900ed9a144acab76.1600087319.1.0.0&platform=1&partner=126&originUrl=https%3A%2F%2Fbj.meituan.com%2Fmeishi%2Fpn2%2F&riskLevel=1&optimusCode=10"
    # Sign the request with the token derived from the unsigned URL.
    uu += "&_token=" + getToken(uu)
    payload = getHtml(uu, None, True)
    try:
        return payload['data']['poiInfos']
    except (TypeError, KeyError):
        # payload was None, or the expected keys are absent.
        return None
def getPoiDetail(poiid):
    """Fetch one shop's detail page, print a summary, and return its info.

    Scrapes the ``window._appState`` JSON embedded in the shop's HTML
    detail page. Rotates the proxy and retries on any failure.

    :param poiid: Meituan POI id (str or int)
    :return: the ``detailInfo`` dict on success (backward compatible —
        callers previously ignored the implicit None), or None when all
        ``retry`` attempts fail
    """
    # Raw string + compile hoisted out of the retry loop.
    pat1 = re.compile(r"window._appState = ({.*?});</script>")
    for _ in range(retry):
        try:
            url = "https://www.meituan.com/meishi/" + str(poiid) + "/"
            html = getHtml(url)
            data = json.loads(re.findall(pat1, html)[0])
            info = data['detailInfo']
            print("店名: %s , 联系电话:%s, 人均:%s, 评分:%s , poiId: %s , 地址:%s " % (
                info['name'],
                info['phone'],
                info['avgPrice'],
                info['avgScore'],
                info['poiId'],
                info['address'],
            ))
            return info
        except Exception:
            # Treat any failure (blocked page, missing JSON) as a bad
            # proxy and rotate before the next attempt.
            updateProxy()
    return None
def main():
    """Crawl every listing page and print each shop's detail summary."""
    totalPage = getTotalPage()
    if not totalPage or totalPage <= 0:
        return
    for pageNo in range(1, totalPage + 1):
        batch = getPoiList(pageNo)
        if not batch:
            continue
        for shop in batch:
            try:
                getPoiDetail(str(shop['poiId']))
            except Exception:
                # Best-effort: skip shops whose record is malformed.
                pass
# Script entry point: crawl all listing pages and print shop details.
if __name__ == '__main__':
    main()
转载:https://blog.csdn.net/qq_41287993/article/details/108611323
查看评论