Approach:
1. Get an attraction's POI id, e.g. from
https://www.meituan.com/zhoubianyou/40760851/
the last number in the URL is the id.
2. Call the comment API directly:
https://www.meituan.com/ptapi/poi/getcomment
passing in the id and offset parameters; any pageSize above 50 works, and no login is required. A minimal single request is sketched below.
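Before wiring in proxies and the helper modules, one bare request is enough to sanity-check the endpoint. This is a minimal sketch using only requests; the query parameters mirror the URL used in the full script below, and the 'total' field is an assumption about the response shape:

import requests

poi_id = '40760851'  # last path segment of the attraction URL
url = ('https://www.meituan.com/ptapi/poi/getcomment'
       '?id={}&offset=0&pageSize=100&mode=0&sortType=1').format(poi_id)
resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
print(resp.json().get('total'))  # total comment count if the call succeeded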
import os
import random

import requests

# the author's own helper modules: proxy/header pools, CSV/JSON I/O, timestamp formatting
from spider.get_ip import getIpList, getHeader
from utils.read_write import readCsv, writeOneJson, readJson, writeCsv
from utils.time_change import timestamp_datetime

save_dir = 'F:\\data\\other\\meituan\\json\\'
proxys = getIpList()    # rotating proxy pool
headers = getHeader()   # pool of request header dicts
os.chdir(r'F:\data\other\meituan')
def search(poi_id, offset):
    url = ('https://www.meituan.com/ptapi/poi/getcomment'
           '?id={}&offset={}&pageSize=10000&mode=0&sortType=1').format(poi_id, offset)
    try:
        # random proxy and header per request; timeout keeps a dead proxy from hanging
        r = requests.get(url, headers=random.choice(headers),
                         proxies=random.choice(proxys), timeout=10)
        dic = r.json()
        if 'total' in dic:
            if dic['total'] > 0:
                writeOneJson(dic, save_dir + str(poi_id) + '_' + str(offset) + '.json')
            else:
                print('total <= 0, nothing to save')
        else:
            print('no total field in response')
            print(url)
    except Exception as e:
        print(e)
        print(url)
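The original also set requests.DEFAULT_RETRIES = 15, which has no effect in current versions of requests. If retries are wanted, the supported route is a Session mounted with urllib3's Retry; a minimal sketch (retry count and backoff values are illustrative, not from the original):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retry = Retry(total=15, backoff_factor=0.5,
              status_forcelist=[429, 500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retry))
# session.get(...) can then stand in for requests.get(...) inside search()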
def dealData():
    files = os.listdir(save_dir)
    data = []
    for file in files:
        poi_id = file.split('_')[0]   # filenames are '<id>_<offset>.json'
        data1 = readJson(save_dir + file)
        # the records are assumed to sit in a 'comments' list in the response;
        # iterating the dict itself (as the original did) would only yield its keys
        for one in data1.get('comments', []):
            comment = one['comment']
            commentTime = timestamp_datetime(one['commentTime'])
            data.append([poi_id, comment, commentTime])
    # output file: "comment data for all attractions 1"
    writeCsv(data, '所有景点的评论数据1.csv')
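timestamp_datetime comes from the author's utils package. If it is unavailable, commentTime appears to be a millisecond Unix timestamp (an assumption about the API), so a drop-in replacement might look like:

from datetime import datetime

def timestamp_datetime(ts_ms):
    # assumes ts_ms is milliseconds since the epoch; adjust if the API returns seconds
    return datetime.fromtimestamp(ts_ms / 1000).strftime('%Y-%m-%d %H:%M:%S')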
if __name__ == '__main__':
    # dealData()
    json_dir = 'F:\\data\\other\\meituan\\json_old\\'
    filename = 'url.csv'
    data = readCsv(filename)
    # the first one has not finished crawling yet
    for one in data:
        poi_id = one[0].split('/')[4]   # id is the last path segment of the URL
        file = str(poi_id) + '_' + str(0) + '.json'
        data1 = readJson(json_dir + file)
        length = data1['total']
        # integer division (length // 50) avoids the Python 3 TypeError from float
        # ranges; offset is passed as a page index, matching the original, though
        # with pageSize=10000 a single call usually returns everything at once
        for i in range(0, length // 50 + 1):
            search(poi_id, i)
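The positional split('/')[4] breaks if a URL lacks the trailing slash or carries a different prefix. A more robust alternative (the regex pattern is an assumption about the URL scheme):

import re

def extract_poi_id(url):
    # matches the numeric id in URLs like https://www.meituan.com/zhoubianyou/40760851/
    m = re.search(r'/zhoubianyou/(\d+)', url)
    return m.group(1) if m else None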
Reposted from: https://blog.csdn.net/qq_30803353/article/details/108027500