小言_互联网的博客

Python爬虫系列之明通市场数据爬取

197人阅读  评论(0)

Python爬虫系列之明通市场数据爬取

小程序爬虫接单、app爬虫接单、网页爬虫接单、接口定制、网站开发、小程序开发> 点击这里联系我们 <

微信请扫描下方二维码

代码仅供学习交流,请勿用于非法用途。加密算法(即代码中调用的 encrypt.js)不予提供,以下代码仅供参考学习。

直接上代码

import configparser
import json
import os
import subprocess
import threading
import time
import urllib.parse
from queue import Empty, Queue

import pandas as pds
import requests
import xlrd
import xlwt

'''
	代码仅供学习,请勿异常使用
'''

# HTTP headers sent with every API request: a form-encoded body and the
# user-agent string of the WeChat in-app browser on iOS.
headers = {
    "Content-Type": "application/x-www-form-urlencoded",
    "User-Agent": (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Mobile/15E148 MicroMessenger/7.0.15(0x17000f31) "
        "NetType/WIFI Language/zh_CN"
    ),
}
# Header row of the result workbooks (date, product quote, stall).
excelTitle = ["日期", "产品报价", "铺位"]
# Directory that getKeywordsQueue() scans for keyword workbooks.
excelPwd = os.getcwd() + "/excels/"
# Directory that mtSpider writes its result workbooks into.
# NOTE(review): the original script used `gexcelPwd` without ever defining it
# (NameError at import time). The "/gexcels/" location is an assumption --
# confirm the intended output directory.
gexcelPwd = os.getcwd() + "/gexcels/"
os.makedirs(gexcelPwd, exist_ok=True)

cf = configparser.ConfigParser()
try:
    # ConfigParser.read() does NOT raise for a missing file; it returns the
    # list of files it managed to parse, so the result must be inspected.
    loadedConfFiles = cf.read(os.getcwd() + "/conf.ini", encoding="utf-8-sig")
except (configparser.Error, UnicodeDecodeError) as e:
    # The file exists but could not be parsed; show why before giving up.
    print(e)
    loadedConfFiles = []
if not loadedConfFiles:
    print("程序目录下不存在conf.ini配置文件~")
    exit(0)


def getConf(sec, key):
    """Return option *key* of section *sec* from the loaded configuration.

    When the lookup fails, the error and the missing section/key are printed
    and the process is terminated via ``exit(0)``.
    """
    try:
        value = cf.get(sec, key)
    except Exception as err:
        print(err)
        print("未得到以下配置:" + sec + " - " + key)
        exit(0)
    else:
        return value


# Number of worker threads, taken from [app-sys] threadNums. A value that is
# not an integer, or is zero/negative, falls back to a single thread.
threadNums = 1
try:
    threadNums = max(1, int(getConf("app-sys", "threadNums")))
except Exception:
    # Assignment above did not happen, so the default of 1 is still in place.
    pass


def postHtml(url, data, timeout=10):
    """POST form *data* to *url* and return the decoded JSON response.

    The request is attempted up to three times.

    Args:
        url: Target URL.
        data: Form fields sent as the request body.
        timeout: Seconds to wait for the server before an attempt is
            abandoned. Without a timeout a stalled connection would block
            the calling worker thread forever.

    Returns:
        The parsed JSON payload, or ``None`` when all three attempts failed.
    """
    for attempt in range(3):
        try:
            resp = requests.post(url, data=data, headers=headers, timeout=timeout)
            return json.loads(resp.content.decode("utf-8"))
        except (requests.RequestException, ValueError) as e:
            # ValueError covers both invalid UTF-8 and invalid JSON.
            print("请求失败(第" + str(attempt + 1) + "次): " + str(e))
    return None


def getSign(page):
    """Return the request signature for *page* produced by ``encrypt.js``.

    Runs ``node encrypt.js <page>`` in the current working directory and
    returns what the script prints on stdout, decoded to ``str`` and stripped
    of surrounding whitespace (callers splice the value into a URL, so bytes
    or a trailing newline would break the request).

    Args:
        page: Page number handed to the script as its only argument.

    Returns:
        The signature text, or ``""`` when ``node`` could not be started
        after three attempts.
    """
    for _ in range(3):
        try:
            # Argument list instead of a shell string: no shell is involved
            # and the pipe is closed as soon as the process finishes.
            completed = subprocess.run(
                ["node", "encrypt.js", str(page)],
                stdout=subprocess.PIPE,
            )
        except OSError as e:
            print("调用 node encrypt.js 失败: " + str(e))
            continue
        return completed.stdout.decode("utf-8", errors="replace").strip()
    return ""


def getCurrentTime():
    """Return the current local time formatted as ``YYYY-mm-dd_HH-MM-SS``."""
    # With no explicit time tuple, strftime formats the current local time.
    timestampFormat = '%Y-%m-%d_%H-%M-%S'
    return time.strftime(timestampFormat)


class mtSpider(threading.Thread):
    """Worker thread that crawls search results for queued keywords.

    Each worker takes ``{"keyword": ..., "crawlerNum": ...}`` tasks from the
    shared queue, pages through the search API and appends one row per
    returned product to its own ``.xls`` workbook.

    The workbook is kept in memory and re-saved after every row, so the file
    on disk always reflects the rows collected so far.
    """

    def __init__(self, keywordQueue, index, *args, **kwargs):
        """Bind the worker to *keywordQueue*; *index* makes the file name unique."""
        super(mtSpider, self).__init__(*args, **kwargs)
        self.keywordQueue = keywordQueue
        self.excelPath = gexcelPwd + "data_" + getCurrentTime() + "_" + str(index) + ".xls"
        # Workbook state, populated by initExcel().
        self._workbook = None
        self._worksheet = None
        self._nextRow = 0

    def initExcel(self):
        """Create this worker's workbook, write the header row and save it."""
        self._workbook = xlwt.Workbook(encoding="utf-8")
        self._worksheet = self._workbook.add_sheet("Sheet1")
        for col, title in enumerate(excelTitle):
            self._worksheet.write(0, col, title)
        self._nextRow = 1
        self._saveExcel()

    def _saveExcel(self):
        """Write the in-memory workbook to ``self.excelPath``."""
        try:
            self._workbook.save(self.excelPath)
        except OSError as e:
            print("保存excel失败: " + self.excelPath + " (" + str(e) + ")")

    def writeExcel(self, data):
        """Append *data* (a sequence of cell values) as one row and save.

        Every value is stored as text. A cell that cannot be written is
        reported and skipped; the remaining cells of the row are still
        written.
        """
        print("-" * 10)
        print(data)
        print("-" * 10)
        if self._worksheet is None:
            # Allow writeExcel() to be used without an explicit initExcel().
            self.initExcel()
        for col, value in enumerate(data):
            try:
                self._worksheet.write(self._nextRow, col, str(value))
            except Exception as e:
                # xlwt signals format limits (row count, string length) with
                # plain Exception/ValueError, hence the broad catch.
                print("写入单元格失败: " + str(e))
        self._nextRow += 1
        self._saveExcel()

    def getGoodsList(self, keyword, page):
        """Fetch one page of search results for *keyword*.

        Returns:
            The ``Data`` member of the JSON response, or ``None`` when the
            request failed or the response has no such member.
        """
        sign = getSign(page)
        if isinstance(sign, bytes):
            # Tolerate a getSign() that hands back raw process output.
            sign = sign.decode("utf-8", errors="replace").strip()
        # NOTE(review): `OpenID` is not defined anywhere in the visible file;
        # it has to exist as a module-level global (presumably the WeChat
        # OpenID of the requesting account) -- confirm where it comes from.
        url = (
            "https://www.mtzh.ltd/api/all/AllProduct/PostSearchNew?sign=" + sign
            + "&word=" + urllib.parse.quote(str(keyword))  # keep '&', '#', spaces out of the query
            + "&OpenID=" + OpenID
        )
        data = {
            "sign": sign,
            "word": keyword,
            "OpenID": OpenID,
        }
        res = postHtml(url, data)
        if isinstance(res, dict):
            return res.get('Data')
        return None

    @staticmethod
    def _rowFromGoods(goods):
        """Build the worksheet row for one product record.

        Missing fields become empty strings.
        """
        def field(name):
            try:
                return goods[name]
            except (KeyError, IndexError, TypeError):
                return ""

        ModifyDate = field('ModifyDate')
        # NOTE(review): the original never reads a shop number from the
        # record, so this part is always empty; the row also has two cells
        # while excelTitle has three headers -- confirm the intended fields.
        ShopNumber = ""
        ShopName = field('ShopName')
        return [ModifyDate, ShopNumber + "\n" + ShopName]

    def _crawlKeyword(self, keyword, crawlerNum):
        """Page through the results for *keyword* until *crawlerNum* rows are stored."""
        currNums = 0
        page = 1
        while True:
            goodsList = self.getGoodsList(keyword, page)
            if not goodsList:
                return
            for goods in goodsList:
                try:
                    row = self._rowFromGoods(goods)
                except TypeError as e:
                    # A non-text field value cannot be joined into the cell.
                    print("跳过无法解析的记录: " + str(e))
                    continue
                self.writeExcel(row)
                currNums += 1
                if currNums >= crawlerNum:
                    return
            page += 1
            # Pause between pages to avoid hammering the API.
            time.sleep(5)

    def run(self):
        """Process keyword tasks until the queue is exhausted."""
        self.initExcel()
        while True:
            try:
                # get_nowait() instead of empty()+get(): with several workers
                # another thread may take the last task between the two
                # calls, leaving get() blocked forever.
                task = self.keywordQueue.get_nowait()
            except Empty:
                break
            self._crawlKeyword(task['keyword'], task['crawlerNum'])


def getKeywordsQueue(excelDir=None):
    """Build the task queue from the keyword workbooks in a directory.

    Every file in the directory is opened as a spreadsheet. Each row with a
    non-empty ``关键词`` cell becomes one task
    ``{"keyword": <cell value>, "crawlerNum": <int>}``; ``crawlerNum`` comes
    from the ``采集数量`` column and defaults to 1000 when that cell is
    missing, not a number, or not positive.

    Args:
        excelDir: Directory to scan. Defaults to the module-level
            ``excelPwd``.

    Returns:
        A ``Queue`` holding the tasks; empty when the directory cannot be
        listed or contains no usable rows.
    """
    defaultCrawlerNum = 1000
    keywordQueue = Queue(0)
    if excelDir is None:
        excelDir = excelPwd
    try:
        fileNames = os.listdir(excelDir)
    except OSError as e:
        print("无法读取关键词目录: " + str(e))
        return keywordQueue

    for fileName in fileNames:
        tpath = os.path.join(excelDir, fileName)
        if not os.path.isfile(tpath):
            continue
        try:
            # No `encoding=` argument: current pandas versions reject it with
            # TypeError, which previously was swallowed and left the queue
            # empty.
            df = pds.read_excel(tpath)
        except Exception as e:
            # Any unreadable file (wrong format, missing engine, ...) is
            # reported and skipped so that the other files still load.
            print("跳过无法读取的文件: " + tpath + " (" + str(e) + ")")
            continue
        if '关键词' not in df.columns:
            continue
        for _, rowData in df.iterrows():
            keywords = rowData['关键词']
            if pds.isnull(keywords):
                continue
            crawlerNum = defaultCrawlerNum
            try:
                requested = int(rowData['采集数量'])
                if requested > 0:
                    crawlerNum = requested
            except (KeyError, TypeError, ValueError, OverflowError):
                # Column absent or cell empty/non-numeric: keep the default.
                pass
            keywordQueue.put({"keyword": keywords, "crawlerNum": crawlerNum})
    return keywordQueue


def main():
    """Load the keyword tasks and start ``threadNums`` crawler threads.

    Prints a hint and starts nothing when no keyword could be read.
    """
    # The queue has to be built here; it is not created anywhere else.
    keywordQueue = getKeywordsQueue()
    if keywordQueue.qsize() > 0:
        for i in range(threadNums):
            m = mtSpider(keywordQueue, i)
            m.start()
    else:
        print("未读取到任何excel关键词,请检查excel是否规范!")


if __name__ == '__main__':
    main()

转载:https://blog.csdn.net/qq_41287993/article/details/111223639
查看评论
* 以上用户言论只代表其个人观点,不代表本网站的观点或立场