微博爬虫，python微博用户主页小姐姐图片内容采集爬虫_小言_互联网的博客

微博爬虫，python微博用户主页小姐姐图片内容采集爬虫

2020-11-28 08:33 624人阅读评论(0)

python爬虫，微博爬虫：只需要知道微博用户的 id 号，就可以通过抓取该用户的主页内容，获取其发表的微博正文、发布时间、点赞数、转发数等数据。当然，以上代码都是本渣渣结合网上代码抄抄改改得来的！

要抓取的微博地址：https://weibo.com/u/5118612601

BUT，我们实际应用的抓取地址：https://m.weibo.cn/u/5118612601（移动端的微博地址）

LSP的最爱，各种小姐姐，随你任意爬取，快收藏起来啊！

通过浏览器抓包，我们可以获悉几个比较重要的参数：


   
    
     
      
     
     
      
       type: uid
       value: 5118612601
       containerid: 1005055118612601

其实还有一个比较重要的参数，那就是翻页：'page':page！

还有一个SSL错误问题，大家可以自行处理！


   
    
     
      
     
     
      
       import logging

       # Route warnings issued through the `warnings` module into the logging system.
       logging.captureWarnings(True)

       # Suppress urllib3 warning messages (the request below uses verify=False).
       requests.packages.urllib3.disable_warnings()

       response = requests.get(
           self.url,
           headers=self.headers,
           params=params,
           timeout=5,
           verify=False,
       )
       html = response.content.decode('utf-8')

几个关键点

获取 containerid 参数


   
    
     
      
     
     
      
           def get_containerid(self):
      
     
    
     
      
     
     
      
               url = f
       'https://m.weibo.cn/api/container/getIndex?type=uid&value={self.uid}'
      
     
    
     
      
     
     
      
               data = requests.get(url,headers=self.headers,timeout=
       5,verify=False).content.decode(
       'utf-8')
      
     
    
     
      
     
     
      
               content = json.loads(data).get(
       'data')
      
     
    
     
      
     
     
      
               
       for data in content.get(
       'tabsInfo').get(
       'tabs'):
      
     
    
     
      
     
     
      
                   
       if (data.get(
       'tab_type') == 
       'weibo'):
      
     
    
     
      
     
     
      
                       containerid = data.get(
       'containerid')
      
     
    
     
      
     
     
       
      
     
    
     
      
     
     
      
               self.containerid=containerid

获取微博用户发表数据


   
    
     
      
     
     
      
           def get_content(self,i):
      
     
    
     
      
     
     
      
               params={
      
     
    
     
      
     
     
      
                   
       'type': 
       'uid',
      
     
    
     
      
     
     
      
                   
       'value': self.uid,
      
     
    
     
      
     
     
      
                   
       'containerid': self.containerid,
      
     
    
     
      
     
     
      
                   
       'page':i,
      
     
    
     
      
     
     
      
               }
      
     
    
     
      
     
     
      
               html=requests.get(self.url,headers=self.headers,params=params,timeout=
       5,verify=False).content.decode(
       'utf-8')
      
     
    
     
      
     
     
      
               data=json.loads(html)[
       'data']
      
     
    
     
      
     
     
      
               cards=data[
       'cards']
      
     
    
     
      
     
     
      
               #
       print(cards)
      
     
    
     
      
     
     
      
               j = 
       1
      
     
    
     
      
     
     
      
               
       for card in cards:
      
     
    
     
      
     
     
      
                   
       if 
       "mblog" in str(card):
      
     
    
     
      
     
     
      
                       mblog = card[
       'mblog']
      
     
    
     
      
     
     
      
                       raw_text = mblog[
       'raw_text']  # 文本内容
      
     
    
     
      
     
     
      
                       
       print(raw_text)
      
     
    
     
      
     
     
      
                       scheme=card[
       'scheme'] #微博链接
      
     
    
     
      
     
     
      
                       attitudes_count = mblog.get(
       'attitudes_count') #点赞数
      
     
    
     
      
     
     
      
                       comments_count = mblog.get(
       'comments_count') #评论数
      
     
    
     
      
     
     
      
                       created_at = mblog.get(
       'created_at') #发布时间
      
     
    
     
      
     
     
      
                       reposts_count = mblog.get(
       'reposts_count') #转发数
      
     
    
     
      
     
     
      
                       
       print(scheme)
      
     
    
     
      
     
     
      
                       img_path=f
       '{self.path}{i}/{j}'
      
     
    
     
      
     
     
      
                       os.makedirs(f
       '{img_path}/',exist_ok=True)
      
     
    
     
      
     
     
      
                       with open(f
       '{img_path}/{j}.txt', 
       'a', encoding=
       'utf-8') as f:
      
     
    
     
      
     
     
      
                           f.write(f
       '{raw_text}')
      
     
    
     
      
     
     
       
      
     
    
     
      
     
     
      
                       img_urls=[]
      
     
    
     
      
     
     
      
                       
       if mblog.get(
       'pics') != None:
      
     
    
     
      
     
     
      
                           img_datas=mblog[
       'pics']
      
     
    
     
      
     
     
      
                           
       for img_data in img_datas:
      
     
    
     
      
     
     
      
                               img_url=img_data[
       'large'][
       'url']
      
     
    
     
      
     
     
      
                               img_urls.
       append(img_url)
      
     
    
     
      
     
     
      
                           
       print(img_urls)
      
     
    
     
      
     
     
       
      
     
    
     
      
     
     
      
                           #多线程下载图片
      
     
    
     
      
     
     
      
                           self.get_imgs(img_urls,img_path)
      
     
    
     
      
     
     
       
      
     
    
     
      
     
     
      
                           #多进程下载图片
      
     
    
     
      
     
     
      
                           #self.get_pimgs(img_urls)
      
     
    
     
      
     
     
       
      
     
    
     
      
     
     
      
                       with open(f
       '{self.uid}/{self.uid}.txt', 
       'a', encoding=
       'utf-8') as fh:
      
     
    
     
      
     
     
      
                           fh.write(
       "----第" + str(i) + 
       "页，第" + str(j) + 
       "条微博----" + 
       "\n")
      
     
    
     
      
     
     
      
                           fh.write(f
       "微博地址： {str(scheme)}\n微博内容：{raw_text}\n"
      
     
    
     
      
     
     
      
                                    f
       "发布时间：{str(created_at)}\n转发数：{str(reposts_count)}\n"
      
     
    
     
      
     
     
      
                                    f
       "点赞数：{str(attitudes_count)}\n评论数：{str(comments_count)}\n\n")
      
     
    
     
      
     
     
      
                       j=j+
       1
      
     
    
     
      
     
     
       
      
     
    
     
      
     
     
      
                       time.sleep(
       2)

多线程下载图片


   
    
     
      
     
     
      
           #多线程下载图片
      
     
    
     
      
     
     
      
           def get_imgs(self,img_urls,img_path):
      
     
    
     
      
     
     
      
               threadings = []
      
     
    
     
      
     
     
      
               
       for img_url in img_urls:
      
     
    
     
      
     
     
      
                   t = threading.Thread(target=self.get_img, args=(img_url,img_path))
      
     
    
     
      
     
     
      
                   threadings.
       append(t)
      
     
    
     
      
     
     
      
                   t.start()
      
     
    
     
      
     
     
       
      
     
    
     
      
     
     
      
               
       for x in threadings:
      
     
    
     
      
     
     
      
                   x.join()
      
     
    
     
      
     
     
       
      
     
    
     
      
     
     
      
               
       print(
       "多线程下载图片完成")
      
     
    
     
      
     
     
       
      
     
    
     
      
     
     
       
      
     
    
     
      
     
     
      
           def get_img(self, img_url,img_path):
      
     
    
     
      
     
     
      
               img_name = img_url.split(
       '/')[
       -1]
      
     
    
     
      
     
     
      
               
       print(f
       '>> 正在下载图片：{img_name} ..')
      
     
    
     
      
     
     
      
               r = requests.get(img_url, timeout=
       8, headers=self.headers,verify=False)
      
     
    
     
      
     
     
      
               with open(f
       '{img_path}/{img_name}', 
       'wb') as f:
      
     
    
     
      
     
     
      
                   f.write(r.content)
      
     
    
     
      
     
     
      
               
       print(f
       '>> 图片：{img_name} 下载完成！')