python爬取爱豆（李易峰）微博评论（附源码）_小言_互联网的博客

python爬取爱豆（李易峰）微博评论（附源码）

2020-11-28 08:56 754人阅读评论(0)

今日目标：微博

以李易峰的微博为例：

https://weibo.com/liyifeng2007?is_all=1

然后进入评论页面，进入XHR查找真实地址：

https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4353796790279702&from=singleWeiBo

很明显，是动态的，抓取也是按我以前写的那些方法来，就不一一说了，他这里最重要的还是那串数字，所以我们只要在第一个网址那里把那串数字找出来就算成功一半了，这次需要用到re正则，嗯，这个我不擅长，不过没事，应该还是可以搞到的：


   
    
     
      
     
     
      
       # Profile page whose posts are enumerated.
       target = 'https://weibo.com/liyifeng2007?is_all=1'

       # Browser-like request headers.
       # NOTE(review): the cookie is hard-coded; presumably it was copied from a
       # browser session and has to be replaced with your own -- confirm.
       headers = {
           'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
           'cookie': 'SUB=_2AkMowDDgf8NxqwJRmPoSyWnqao53ywzEieKenME7JRMxHRl-yT9kqnEjtRB6A0AeDzsLF_aeZGlWOMf4mEl-MBZZXqc_; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWcpq860VQlJcIRRNP9pzqS; SINAGLOBAL=1033839351635.7524.1604108279474; login_sid_t=c071efc77911ceace152df2be5986e09; cross_origin_proto=SSL; WBStorage=8daec78e6a891122|undefined; _s_tentry=-; Apache=8275565331127.246.1604195643561; ULV=1604195643568:3:1:1:8275565331127.246.1604195643561:1604122447982; wb_view_log=1920*10801; UOR=,,editor.csdn.net',
       }

       # A timeout keeps a stalled connection from hanging the script forever.
       html = requests.get(target, headers=headers, timeout=10).text

       # Every match is the text found between '<a name=' and 'date=' in the page.
       for each in re.findall('<a name=(.*?)date=', html):
           # The first space-separated token is the post ID.
           real_id = each.split(" ")[0]
           # The second-to-last backslash-separated piece is the post time; quotes
           # are dropped and ':' becomes '.' so it can be used as a file name.
           filename = each.split("\\")[-2].replace('"', "").replace(":", ".")
           print(real_id, filename)

输出如下：

第一个就是我们需要的ID，后面则是发微博的时间，我们用它来做存储评论数据的文件名称。

然后我们把ID传入第二个网址：


   
    
     
      
     
     
      
       # Comment endpoint sorted by popularity; real_id is the post ID extracted above.
       comment_url = f'https://weibo.com/aj/v6/comment/big?ajwvr=6&id={real_id}&from=singleWeiBo'

当然这个是抓取热度的，如你要抓取最新回复的，需要下面这个：


   
    
     
      
     
     
      
       # Comment endpoint for the latest replies (page 1); real_id is the post ID extracted above.
       comment_url = f'https://weibo.com/aj/v6/comment/big?ajwvr=6&id={real_id}&page=1'

拿到这个就简单了，JSON 数据，直接进json网站解析就行，然后找到我们需要的数据，这里就直接上代码了：


   
    
     
      
     
     
      
       comment_url = f'https://weibo.com/aj/v6/comment/big?ajwvr=6&id={real_id}&page=1'
       # The endpoint answers with JSON; the rendered comment markup sits under data -> html.
       # A timeout keeps a stalled connection from hanging the script forever.
       res = requests.get(comment_url, headers=headers, timeout=10).json()["data"]["html"]

       # Extract the commenter and the comment content.
       comments = re.findall('ucardconf="type=1">(.*?)</div>', res)

       for each in comments:
           # Remove the HTML tags (e.g. emoticon markup) left inside the content.
           each = re.sub('<.*?>', '', each)
           print(each)

re 不太会用，大家将就着看，主要是能把数据搞到手，这个最重要，哈哈…

对比一下：

把那些表情给去除了，有些只发表情没发字的就会只显示名字，这个是正常的，其余就是一毛一样了。
数据拿到了，咱们就存储到本地吧，全部代码：


   
    
     
      
     
     
      
       # -*- coding: utf-8 -*-
      
     
    
     
      
     
     
      
       """
      
     
    
     
      
     
     
      
       Created on 2020-11-18
      
     
    
     
      
     
     
      
       
      
     
    
     
      
     
     
      
       
      
     
    
     
      
     
     
      
       @author: 李运辰
      
     
    
     
      
     
     
      
       """
      
     
    
     
      
     
     
       
      
     
    
     
      
     
     
       
      
     
    
     
      
     
     
      
       #https://weibo.com/liyifeng2007?is_all=1
      
     
    
     
      
     
     
       
      
     
    
     
      
     
     
       
      
     
    
     
      
     
     
      
       import requests
      
     
    
     
      
     
     
      
       import re,os
      
     
    
     
      
     
     
       
      
     
    
     
      
     
     
       
      
     
    
     
      
     
     
      
       url = 'https://s.weibo.com/?topnav=1&wvr=6'
       # Profile page whose posts are enumerated.
       target = 'https://weibo.com/liyifeng2007?is_all=1'

       # Browser-like request headers.
       # NOTE(review): the cookie is hard-coded; presumably it was copied from a
       # browser session and has to be replaced with your own -- confirm.
       headers = {
           'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
           'cookie': 'SUB=_2AkMowDDgf8NxqwJRmPoSyWnqao53ywzEieKenME7JRMxHRl-yT9kqnEjtRB6A0AeDzsLF_aeZGlWOMf4mEl-MBZZXqc_; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWcpq860VQlJcIRRNP9pzqS; SINAGLOBAL=1033839351635.7524.1604108279474; login_sid_t=c071efc77911ceace152df2be5986e09; cross_origin_proto=SSL; WBStorage=8daec78e6a891122|undefined; _s_tentry=-; Apache=8275565331127.246.1604195643561; ULV=1604195643568:3:1:1:8275565331127.246.1604195643561:1604122447982; wb_view_log=1920*10801; UOR=,,editor.csdn.net',
       }

       # Directory the per-post comment files are written into.
       OUTPUT_DIR = "./images/"
       # Seconds to wait for each HTTP request before giving up.
       REQUEST_TIMEOUT = 10


       def parse_post_anchor(anchor):
           """Split one '<a name=...date=' match into (post ID, file name).

           The post ID is the first space-separated token.  The file name is the
           second-to-last backslash-separated piece with double quotes removed
           and ':' replaced by '.'.

           Raises:
               IndexError: if *anchor* has fewer than two backslash-separated pieces.
           """
           real_id = anchor.split(" ")[0]
           filename = anchor.split("\\")[-2].replace('"', "").replace(":", ".")
           return real_id, filename


       def clean_comment(fragment):
           """Return *fragment* with every '<...>' tag removed (e.g. emoticon markup)."""
           return re.sub('<.*?>', '', fragment)


       def extract_comments(comment_html):
           """Return the tag-free 'commenter + content' strings found in *comment_html*."""
           raw_comments = re.findall('ucardconf="type=1">(.*?)</div>', comment_html)
           # Alternative pattern kept from the original script:
           #   re.findall('</i></a>(.*?) </div>', comment_html)
           return [clean_comment(raw) for raw in raw_comments]


       def main():
           """Download comment pages 1-10 of every post on the profile page.

           Each comment is printed and appended to a text file named after the
           post inside OUTPUT_DIR.
           """
           # open() does not create missing directories, so make sure it exists.
           os.makedirs(OUTPUT_DIR, exist_ok=True)

           html = requests.get(target, headers=headers, timeout=REQUEST_TIMEOUT).text

           for anchor in re.findall('<a name=(.*?)date=', html):
               real_id, filename = parse_post_anchor(anchor)
               out_path = OUTPUT_DIR + filename + "_李运辰.txt"

               for page in range(1, 11):
                   comment_url = f'https://weibo.com/aj/v6/comment/big?ajwvr=6&id={real_id}&page={page}'
                   # The rendered comment markup sits under data -> html in the JSON reply.
                   res = requests.get(
                       comment_url, headers=headers, timeout=REQUEST_TIMEOUT
                   ).json()["data"]["html"]

                   comments = extract_comments(res)
                   if not comments:
                       # Nothing to store for this page; do not create an empty file.
                       continue

                   for comment in comments:
                       print(comment)

                   # Append once per page instead of reopening the file per comment.
                   with open(out_path, "a", encoding="utf-8") as f:
                       f.writelines(comment + "\n" for comment in comments)


       if __name__ == "__main__":
           main()