飞道的博客

Python爬取阿里巴巴商城数据

1039人阅读  评论(0)

目录

1.前言

2、解决方案

3、现在开始上代码实现

4、最后总结:

                            我是政胤 期待你的关注



1.前言

大家好 我是每天走在刑的第一线的政胤

今天教大家获取阿里巴巴列表页的商品信息,包含商品title和商品主图片,并将结果存入xls文件保存。我是政胤,制作不易,点个免费的关注吧

2、解决方案

     首先给出的方案是:

     2.1、通过wxPython框架写出一个可视化界面,

     2.2、因为阿里巴巴反爬比较严格,所以我直接通过selenium模拟用户操作来绕过反爬机制

     2.3、编写浏览器池方便实现多线程爬取数据

     2.4、编写爬数据业务逻辑

3、现在开始上代码实现

     3.1 首先初始化一个浏览器池


  
  1. from multiprocessing  import Manager
  2. from time  import sleep
  3. from tool.open_browser  import open_browser
  4. class  DriverPool:
  5.      def  __init__( self, max_nums,driver_path,ui,open_headless=0):
  6.         self.ui = ui
  7.         self.drivers = {}
  8.         self.manager = Manager()
  9.         self.queue = self.manager.Queue()
  10.         self.max_nums = max_nums
  11.         self.open_headless = open_headless
  12.         self.CreateDriver(driver_path)
  13.      def  CreateDriver( self,driver_path):
  14.          '''
  15.         初始化浏览器池
  16.         :return
  17.         '''
  18.          for name  in  range( 1, self.max_nums +  1):
  19.             name =  f'driver_{name}'
  20.             d = open_browser(excute_path=driver_path,open_headless=self.open_headless)
  21.             d.ui = self.ui
  22.             self.drivers[name] = d
  23.             self.queue.put(name)
  24.      def  getDriver( self):
  25.          '''
  26.         获取一个浏览器
  27.         :return driver
  28.         '''
  29.          if self.queue.empty():
  30.             sleep( 1)
  31.              return self.getDriver()
  32.         name = self.queue.get()
  33.         driver = self.drivers[name]
  34.         driver.pool_name_driver = name
  35.          return driver
  36.      def  putDriver( self, name):
  37.          '''
  38.         归还一个浏览器
  39.         :param name: 
  40.         :return: 
  41.         '''
  42.         self.queue.put(name)
  43.      def  quit( self):
  44.          '''
  45.         关闭浏览器,执行结束操作
  46.         :return: 
  47.         '''
  48.          if self.drivers:
  49.              for driver  in self.drivers.values():
  50.                  try:
  51.                     driver.quit()
  52.                  except:
  53.                      pass

     3.2 编写UI操作界面


  
  1.      def  intUIRun( self):
  2.          '''
  3.         初始化UI主界面
  4.         :return:
  5.         '''
  6.         pannel = wx.Panel(self.panel_run)
  7.         pannel.Sizer = wx.BoxSizer(wx.VERTICAL)
  8.         self.text = wx.StaticText(pannel, - 1'状态栏目:', size=( 10040), pos=( 010))
  9.         self.text_input = wx.StaticText(pannel, - 1'', size=( 90040), pos=( 1000))
  10.         wx.StaticText(pannel, - 1'当前执行ID:', size=( 10030), pos=( 065)).SetFont(self.font)
  11.         self.text_time = wx.TextCtrl(pannel,  id=self.choices_id_ref, value=self.time_str, size=( 30030), pos=( 15060),
  12.                                      style=wx.TE_AUTO_URL | wx.TE_MULTILINE)
  13.         self.reflush_text_time = wx.Button(pannel, - 1'刷新ID', size=( 10050), pos=( 48050))
  14.         self.text_time.SetFont(self.font)
  15.         self.reflush_text_time.SetForegroundColour(wx.RED)
  16.         self.reflush_text_time.SetFont(self.font)
  17.          # self.text_time.SetForegroundColour(wx.RED)
  18.         self.text_input.SetBackgroundColour(wx.WHITE)
  19.         self.text_input.SetLabel(self.in_text)
  20.         self.text_input.SetFont(self.font)
  21.         self.text.SetFont(self.font)
  22.         wx.Button(pannel, self.get_product,  '获取商品保存本地', size=( 200100), pos=( 0100)).SetFont(self.font)
  23.         wx.Button(pannel, self.save_mysql,  '保存数据库和OSS', size=( 200100), pos=( 200100)).SetFont(self.font)
  24.         wx.Button(pannel, self.end_process,  '结束执行', size=( 200100), pos=( 400100)).SetFont(self.font)
  25.         self.log_text = wx.TextCtrl(pannel, size=( 1000500), pos=( 0210), style=wx.TE_MULTILINE | wx.TE_READONLY)
  26.         wx.LogTextCtrl(self.log_text)
  27.         self.Bind(wx.EVT_BUTTON, self.get_product_p,  id=self.get_product)
  28.         self.Bind(wx.EVT_BUTTON, self.save_mysql_p,  id=self.save_mysql)
  29.         self.Bind(wx.EVT_BUTTON, self.end_process_p,  id=self.end_process)
  30.         self.text_time.Bind(wx.EVT_COMMAND_LEFT_CLICK, self.choices_id,  id=self.choices_id_ref)
  31.         self.reflush_text_time.Bind(wx.EVT_BUTTON, self.reflush_time_evt)
  32.         self.panel_run.Sizer.Add(pannel, flag=wx.ALL | wx.EXPAND, proportion= 1)

     效果图

     3.3编写业务逻辑

     获取商品列表页数据


  
  1. global _getMainProduct, goods_info
  2. def  _getMainProduct( data_url):
  3.      '''
  4.     多线程获取每一页链接
  5.     :param data_url:
  6.     :return:
  7.     '''
  8.     self, url, driver_pool = data_url
  9.     c = Common(driver_pool.getDriver())
  10.     goods_urls = []
  11.      try:
  12.         self.ui. print( f'当前获取第{url}页数据')
  13.         c.d.get(url)
  14.         c.wait_page_loaded(url)
  15.          if self.is_load_cache_cookies:
  16.             self.load_cookies(c.d)
  17.             c.d.get(url)
  18.         c.wait_page_loaded(url)
  19.         ele = c.find_element(By.CSS_SELECTOR,  '[class="component-product-list"]')
  20.         goods_urls = ele.find_elements(By.CSS_SELECTOR,  'a[class="product-image"]')
  21.         goods_urls = [goods_url.get_attribute( 'href'for goods_url  in goods_urls]
  22.      except SystemExit:
  23.         sys.exit( 1)
  24.      except:
  25.         self. print( f'请求页面超出范围: {url} ERROR: {traceback.format_exc()}')
  26.          if c.find_element_true(By.CSS_SELECTOR,  '[class="no-data common"]'):
  27.              return goods_urls
  28.      finally:
  29.         name = c.d.pool_name_driver
  30.         driver_pool.putDriver(name)
  31.         self.queue_print.put( f'请求完成:{url}')
  32.      return goods_urls
  33. def  getMainProduct_( self):
  34.     g_dict =  globals()
  35.     urls = []
  36.     sum_l = self.pageNums[ 1] +  1
  37.     complate =  0
  38.     products = []
  39.      for i  in  range(self.pageNums[ 0], sum_l):
  40.          if self.ui.is_exit_process:
  41.             exit()
  42.         url = self.url. format(i)
  43.         urls.append([self, url, self.drive_pool])
  44.      if urls:
  45.         p = self.pool.map_async(_getMainProduct, urls)
  46.          while  not p.ready():
  47.              if  not self.queue_print.empty():
  48.                 complate +=  1
  49.                 self. print(self.queue_print.get(),  f'完成:{complate}/{sum_l - 1}')
  50.         products = p.get()
  51.     goods_info =  set()
  52.      for xx  in products:
  53.          for x  in xx:
  54.              if x:
  55.                 goods_info.add(x)
  56.     self.goods_info = goods_info
  57.      return goods_info
  58. goods_info = getMainProduct_(self)

     获取详情页数据


  
  1. global goods,Common,driver_pool,goods_url,sleep,re,By
  2. def  get_info_( self, data_info):
  3.      '''
  4.     多线程获取详情页数据
  5.     :param self: 
  6.     :param data_info: 
  7.     :return: 
  8.     '''
  9.      if self.ui.is_exit_process:
  10.         exit()
  11.     goods_url, driver_pool = data_info
  12.     c = Common(driver_pool.getDriver())
  13.      try:
  14.         c.d.get(goods_url)
  15.         sleep( 3)
  16.          if self.is_load_cache_cookies:
  17.             self.load_cookies(c.d)
  18.             c.d.get(goods_url)
  19.         c.wait_page_loaded(goods_url)
  20.          for x  in  range( 40018000200):
  21.             sleep( 0.1)
  22.             c.d.execute_script( f'document.documentElement.scrollTop={x};')
  23.         is_all = c.find_element_true(By.CSS_SELECTOR,  '[id="J-rich-text-description"]')   # 'J-rich-text-description'
  24.          if  not is_all:
  25.             self. print( f'没有发现: {is_all}')
  26.         is_video = c.find_elements_true(By.CSS_SELECTOR,  '[class="bc-video-player"]>video')
  27.         is_title = c.find_element_true(By.CSS_SELECTOR,  '[class="module-pdp-title"]')
  28.         is_description = c.find_element_true(By.CSS_SELECTOR,  '[name="description"]')
  29.         is_keywords = c.find_element_true(By.CSS_SELECTOR,  '[name="keywords"]')
  30.         is_overview = c.find_element_true(By.CSS_SELECTOR,  '[class="do-overview"]')
  31.         is_wz_goods_cat_id = c.find_element_true(By.CSS_SELECTOR,  '[class="detail-subscribe"]')
  32.         wz_goods_cat_id = self.wz_goods_cat_id
  33.          # if is_wz_goods_cat_id:
  34.          #     wz_goods_cat_id = is_wz_goods_cat_id.find_elements(By.CSS_SELECTOR, '[class="breadcrumb-item"]>a')[
  35.          #         -1].get_attribute('href')
  36.          #     wz_goods_cat_id = re.search(r'(\d+)', wz_goods_cat_id).group(1)
  37.          # goods_id = re.search(r'(\d+)\.html$', goods_url)
  38.         goods_id = re.search( r'(ssssss\d+)\.html$', goods_url)
  39.         goods = {
  40.              "商品分类ID"int(wz_goods_cat_id)  if wz_goods_cat_id  else  0,
  41.              "商品ID": goods_id.group( 1if goods_id  else self.getMd5( f'{time.time()}')+ '其他',
  42.              "商品链接": goods_url,
  43.              "描述": c.find_element(By.CSS_SELECTOR,  '[name="description"]').get_attribute(
  44.                  'content'if is_description  else  '',
  45.              "标题": is_title.get_attribute( 'title'if is_title  else  '',
  46.              "关键字": c.find_element(By.CSS_SELECTOR,  '[name="keywords"]').get_attribute(
  47.                  'content'if is_keywords  else is_keywords,
  48.              "视频连接": c.find_element(By.CSS_SELECTOR,  '[class="bc-video-player"]>video').get_attribute(
  49.                  'src'if is_video  else  '',
  50.              "主图片": [],
  51.              "商品详情": c.d.execute_script(
  52.                  '''return document.querySelectorAll('[class="do-overview"]')[0].outerHTML;'''if is_overview  else is_overview,
  53.              "商品描述"'',
  54.              "商品描述图片": []
  55.         }
  56.          # 获取商品描述图片
  57.         goods_desc = getDescriptionFactory1(self, c, goods_url)
  58.         goods.update(goods_desc)
  59.          # 获取主图片
  60.         m_imgs = c.find_elements(By.CSS_SELECTOR,  '[class="main-image-thumb-ul"]>li')
  61.          for m_img  in m_imgs:
  62.              try:
  63.                 img = m_img.find_element(By.CSS_SELECTOR,  '[class="J-slider-cover-item"]').get_attribute( 'src')
  64.                 s = re.search( '(\d+x\d+)', img)
  65.                 img2 =  None
  66.                  if s:
  67.                     img2 =  str(img).replace(s.group( 1),  '')
  68.                 goods[ '主图片'].append(img)
  69.                  if img2:
  70.                     goods[ '主图片'].append(img2)
  71.              except:
  72.                  pass
  73.         self.ui.status[ '请求成功商品数量'] +=  1
  74.          return goods
  75.      except:
  76.         traceback.print_exc()
  77.         self. print( f'=========================\n链接请求错误: {goods_url} \n {traceback.format_exc()}\n=========================')
  78.         self.error_page.append([goods_url, traceback.format_exc()])
  79.         self.ui.status[ '请求失败商品数量'] +=  1
  80.      finally:
  81.         name = c.d.pool_name_driver
  82.         driver_pool.putDriver(name)
  83.         self.queue_print.put( f'请求完成:{goods_url}')
  84. goods = get_info_(self,data_info)

     写入excel


  
  1.          def  export_excel( self, results):
  2.          '''
  3.         写入excel方法
  4.         :param results: 
  5.         :return: 
  6.         '''
  7.         now_dir_str = self.now
  8.         now_file_str = time.strftime( '%Y_%m_%d__%H_%M_%S', time.localtime())
  9.         img_path = os.path.join( 'data''xls', now_dir_str)
  10.          if  not os.path.exists(img_path):
  11.             os.mkdir(img_path)
  12.         img_path = os.path.join( 'data''xls', now_dir_str, self.url_id)
  13.          if  not os.path.exists(img_path):
  14.             os.mkdir(img_path)
  15.          if  not os.path.exists(img_path):
  16.             os.mkdir(img_path)
  17.         img_path = os.path.join(img_path,  f"{now_file_str}.xlsx")
  18.         workbook = xlsxwriter.Workbook(img_path)
  19.         sheet = workbook.add_worksheet(name= '阿里巴巴信息')
  20.         titles =  list(results[ 0].keys())
  21.          for i, title  in  enumerate(titles):
  22.             sheet.write_string( 0, i, title)
  23.          for row, result  in  enumerate(results):
  24.             row = row +  1
  25.             col =  0
  26.              for value  in result.values():
  27.                 sheet.write_string(row, col,  str(value))
  28.                 col +=  1
  29.         workbook.close()

4、最后总结:

    由于通过selenium执行浏览器操作不如接口请求效率高,所以最后使用了多线程,在执行效率上做了一些提升。

                            我是政胤 期待你的关注


转载:https://blog.csdn.net/m0_69043821/article/details/125491644
查看评论
* 以上用户言论只代表其个人观点,不代表本网站的观点或立场