Python Crawler Series: Scraping Mingtong (明通) Market Data
The code is for learning and exchange only; please do not use it for illegal purposes. The encryption algorithm is not provided, and everything here is for reference and study only.
Straight to the code:
import requests
import json
import time
import threading
import configparser
from queue import Queue
import os
import xlrd
import urllib.parse
import xlwt
from xlutils.copy import copy
import pandas as pds
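# Dependency note (not listed in the original post): besides the standard library,
# the script relies on requests, xlrd, xlwt, xlutils and pandas, e.g.
#   pip install requests xlrd xlwt xlutils pandas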
'''
The code is for learning purposes only; please do not misuse it.
'''
headers = {
    "Content-Type": "application/x-www-form-urlencoded",
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/7.0.15(0x17000f31) NetType/WIFI Language/zh_CN",
}
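# The User-Agent above mimics the WeChat in-app browser (MicroMessenger),
# presumably so the API treats requests as coming from the WeChat client.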
# Column headers for the output spreadsheets: date, quoted price, stall
excelTitle = ["日期", "产品报价", "铺位"]
gexcelPwd = os.getcwd() + "/excels/"
if not os.path.exists(gexcelPwd):
    os.mkdir(gexcelPwd)
cf = configparser.ConfigParser()
try:
    cf.read(os.getcwd() + "/conf.ini", encoding="utf-8-sig")
except Exception as e:
    print("No conf.ini configuration file found in the program directory~")
    exit(0)
def getConf(sec, key):
    try:
        return cf.get(sec, key)
    except Exception as e:
        print(e)
        print("Missing configuration item: " + sec + " - " + key)
        exit(0)
threadNums = 1
try:
    threadNums = int(getConf("app-sys", "threadNums"))
    if threadNums <= 0:
        threadNums = 1
except Exception as e:
    threadNums = 1
# The original post never shows where OpenID comes from; fill in your own WeChat OpenID here
OpenID = ""
def postHtml(url, data):
    # POST with up to 3 retries; returns the parsed JSON body, or None on repeated failure
    for i in range(3):
        try:
            resp = requests.post(url, data=data, headers=headers)
            return json.loads(resp.content.decode("utf-8"))
        except Exception as e:
            pass
def getSign(page):
    # The request sign is produced by an external encrypt.js run through Node.js;
    # the signing algorithm itself is not provided in this post.
    while True:
        try:
            resp = os.popen('node encrypt.js ' + str(page))
            return resp.read().strip()
        except Exception as e:
            pass
def getCurrentTime():
    return str(time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime(time.time())))
class mtSpider(threading.Thread):
    def __init__(self, keywordQueue, index, *args, **kwargs):
        super(mtSpider, self).__init__(*args, **kwargs)
        self.keywordQueue = keywordQueue
        self.excelPath = gexcelPwd + "data_" + getCurrentTime() + "_" + str(index) + ".xls"

    def initExcel(self):
        # Create the output workbook for this thread and write the header row
        workbook = xlwt.Workbook(encoding="utf-8")
        worksheet = workbook.add_sheet("sheet1")
        for i in range(len(excelTitle)):
            worksheet.write(0, i, excelTitle[i])
        workbook.save(self.excelPath)

    def writeExcel(self, data):
        print("-" * 10)
        print(data)
        print("-" * 10)
        try:
            # Append one row: reopen the workbook, copy it with xlutils, write, save
            workbook = xlrd.open_workbook(self.excelPath)
            sheets = workbook.sheet_names()
            worksheet = workbook.sheet_by_name(sheets[0])
            rows_old = worksheet.nrows
            new_workbook = copy(workbook)
            new_worksheet = new_workbook.get_sheet(0)
            for j in range(0, len(data)):
                try:
                    new_worksheet.write(rows_old, j, str(data[j]))
                except Exception as e:
                    continue
            new_workbook.save(self.excelPath)
        except Exception as e:
            pass
    def getGoodsList(self, keyword, page):
        sign = getSign(page)
        # URL-encode the keyword so non-ASCII search terms are safe in the query string
        url = "https://www.mtzh.ltd/api/all/AllProduct/PostSearchNew?sign=" + sign \
              + "&word=" + urllib.parse.quote(str(keyword)) + "&OpenID=" + OpenID
        data = {
            "sign": sign,
            "word": keyword,
            "OpenID": OpenID,
        }
        res = postHtml(url, data)
        try:
            return res['Data']
        except Exception as e:
            pass
    def run(self):
        self.initExcel()
        while True:
            if self.keywordQueue.empty():
                break
            keyword = self.keywordQueue.get()
            crawlerNum = keyword['crawlerNum']
            currNums = 0
            page = 1
            stop = False
            while True:
                goodsList = self.getGoodsList(keyword['keyword'], page)
                if goodsList and len(goodsList) > 0:
                    for goods in goodsList:
                        try:
                            # NOTE: the post only shows the date and shop fields being filled;
                            # how the "产品报价" (price) column is populated is not shown.
                            data = []
                            ModifyDate = ""
                            ShopNumber = ""
                            ShopName = ""
                            try:
                                ModifyDate = goods['ModifyDate']
                            except Exception as e:
                                pass
                            try:
                                ShopNumber = goods['ShopNumber']
                            except Exception as e:
                                pass
                            try:
                                ShopName = goods['ShopName']
                            except Exception as e:
                                pass
                            data.append(ModifyDate)
                            data.append(ShopNumber + "\n" + ShopName)
                            self.writeExcel(data)
                            currNums += 1
                            if currNums >= crawlerNum:
                                stop = True
                                break
                        except Exception as e:
                            pass
                    if stop:
                        break
                    page += 1
                    time.sleep(5)
                else:
                    break
def getKeywordsQueue():
    # Read keywords from the Excel files in the excels/ directory into a queue of
    # {"keyword": ..., "crawlerNum": ...} items
    keywordQueue = Queue(0)
    try:
        fs = os.listdir(gexcelPwd)
        try:
            for f in fs:
                try:
                    tpath = gexcelPwd + f
                    df = pds.read_excel(tpath)
                    rows = df.iterrows()
                    for row in rows:
                        try:
                            rowData = row[1]
                            keywords = rowData['关键词']
                            crawlerNum = 1000
                            try:
                                crawlerNum = int(rowData['采集数量']) if int(rowData['采集数量']) > 0 else 1000
                            except Exception as e:
                                crawlerNum = 1000
                            if not pds.isnull(keywords):
                                keywordQueue.put({"keyword": keywords, "crawlerNum": crawlerNum})
                        except Exception as e:
                            pass
                except Exception as e:
                    pass
        except Exception as e:
            pass
    except Exception as e:
        pass
    return keywordQueue
def main():
    global threadNums
    keywordQueue = getKeywordsQueue()
    keywordLen = keywordQueue.qsize()
    if keywordLen > 0:
        for i in range(threadNums):
            m = mtSpider(keywordQueue, i)
            m.start()
    else:
        print("No keywords were read from any Excel file; please check that the Excel files are formatted correctly!")

if __name__ == '__main__':
    main()
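To run the script as posted, a few companion pieces are assumed (a minimal sketch inferred from the code above; the original post does not include them). A conf.ini in the program directory supplies the thread count, for example:

[app-sys]
threadNums = 2

The keyword Excel files go into the excels/ directory and need a 关键词 column (the search keyword) plus an optional 采集数量 column (how many items to collect per keyword, defaulting to 1000). encrypt.js must also sit in the program directory; it is invoked as node encrypt.js <page> and is expected to print the request sign to stdout, but the signing algorithm itself is not provided here. Finally, the OpenID variable near the top of the script has to be filled in with a valid WeChat OpenID.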
Reprinted from: https://blog.csdn.net/qq_41287993/article/details/111223639