
Scraping Comments from Xiao Zhan's Weibo


Many of the young women at my company are die-hard Xiao Zhan fans, so I wanted to see what it is about him that attracts them, and whether there is anything in there that might cure the pain of being single.

Without further ado, let's take a look at Xiao Zhan's Weibo.

Take the first post as an example. Click into it, press F12 to open the developer tools, and follow the steps shown in the figure below. You will find a JS request whose response body contains the comment data. To scrape the comments, the first step is to construct this request URL.


By analyzing the URL we can see that the previous page of comments contains the next page's root_comment_max_id, which is all we need to build the next URL.

https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4492118670464947&from=singleWeiBo&__rnd=1592459762324


The main parameters are as follows:

ajwvr: 6, a constant
id: 4492118670464947, the ID of the Weibo post
root_comment_max_id: the parent comment ID (taken from the previous page)
from: singleWeiBo
__rnd: the current time minus 1970-01-01, expressed in milliseconds (a Unix millisecond timestamp; see the sketch below)
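In Python, that timestamp can be produced in one line. A minimal sketch (presumably any fresh millisecond timestamp works, since it is just a cache-busting value):

import time

# milliseconds elapsed since 1970-01-01 (the Unix epoch)
rnd = int(time.time() * 1000)
print(rnd)  # e.g. 1592459762324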

The code is as follows:

import json
import requests
from lxml import etree
from urllib.parse import parse_qs

headers = {'Cookie': 'paste your own cookies here'}
params = {
    'ajwvr': 6,
    'id': '4492118670464947',
    'from': 'singleWeiBo',
    'root_comment_max_id': ''
}
URL = 'https://weibo.com/aj/v6/comment/big'
response = requests.get(URL, params=params, headers=headers)
response = json.loads(response.text)
html = response['data']['html']
html = etree.HTML(html)
print(etree.tostring(html, encoding=str, pretty_print=True))
max_id_json = html.xpath('//div[@node-type="comment_loading"]/@action-data')[0]
node_params = parse_qs(max_id_json)
# the parent comment ID, used to request the next page
max_id = node_params['root_comment_max_id'][0]
params['root_comment_max_id'] = max_id
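For reference, action-data is just a query string, which is why parse_qs can split it. A quick illustration with a made-up value (the IDs below are invented, not real Weibo IDs):

from urllib.parse import parse_qs

# hypothetical action-data string, for illustration only
action_data = 'act=click&id=4492118670464947&root_comment_max_id=4492120000000001'
qs = parse_qs(action_data)
print(qs['root_comment_max_id'][0])  # -> '4492120000000001'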

Scraping the comments and user IDs

Scraped results

Full code

import json
import requests
import time
import random
from lxml import etree
import os
from urllib.parse import parse_qs

# your Weibo cookies
headers = {
    'Cookie': 'paste your own cookies here'}

# "picture" folder under the current working directory
pic_file_path = os.path.join(os.path.abspath(''), 'picture')

def ipget():
    # build a proxy pool from the xicidaili free-proxy list
    proxies = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/72.0.3626.121 Safari/537.36'}
    # send the request
    response = requests.get('https://www.xicidaili.com/nn/', headers=headers)
    response.encoding = 'utf-8'  # set the encoding
    if response.status_code == 200:  # request succeeded
        html = etree.HTML(response.text)  # parse the HTML
        table = html.xpath('//table[@id="ip_list"]')[0]  # the proxy-list table
        trs = table.xpath('.//tr')[1:]  # all rows inside the table, skipping the header row
        # iterate over the rows
        for t in trs:
            clas = t.xpath('td/text()')[5]  # protocol type column
            if clas == 'HTTPS':
                ip = t.xpath('td/text()')[0]  # proxy IP
                port = t.xpath('td/text()')[1]  # proxy port
                proxies.append(ip + ':' + port)
        return proxies

def write_comment(comment):
    comment += '\n'
    with open('comment.txt', 'a', encoding='utf-8') as f:
        # strip Weibo UI artifacts: '回复' (reply), '等人' (and others), '图片评论' (picture comment)
        f.write(comment.replace('回复', '').replace('等人', '').replace('图片评论', ''))

def download_pic(url, nick_name):
    # save the comment image (if any) as picture/<nick_name>.jpg
    if not url:
        return
    if not os.path.exists(pic_file_path):
        os.mkdir(pic_file_path)
    resp = requests.get(url)
    if resp.status_code == 200:
        with open(os.path.join(pic_file_path, f'{nick_name}.jpg'), 'wb') as f:
            f.write(resp.content)


if __name__ == '__main__':
    proxies_use_urls = set()
    proxies_nouse_urls = set()
    # proxy pool
    proxies = ipget()
    params = {
        'ajwvr': 6,
        'id': '4492118670464947',
        'from': 'singleWeiBo',
        'root_comment_max_id': ''
    }
    URL = 'https://weibo.com/aj/v6/comment/big'
    # To crawl many pages (the original target was 100) you need proxies, or a sleep
    # between requests, otherwise the requests will time out; here we fetch 3 pages.
    for num in range(3):
        print(f'=========   Reading page {num} ====================')
        while True:
            try:
                proxy = proxies.pop()
                print(proxy)
                if proxy in proxies_use_urls or proxy in proxies_nouse_urls:
                    continue  # skip proxies that have already been tried
                resp = requests.get(URL, params=params, headers=headers,
                                    proxies={"https": proxy})
                resp = json.loads(resp.text)
            except Exception:
                proxies_nouse_urls.add(proxy)  # remember dead proxies
                continue
            break
        if resp['code'] == '100000':  # Weibo returns code '100000' on success
            print(proxies_nouse_urls)
            proxies_use_urls.add(proxy)
            html = resp['data']['html']
            html = etree.HTML(html)
            print(etree.tostring(html, encoding=str, pretty_print=True))
            max_id_json = html.xpath('//div[@node-type="comment_loading"]/@action-data')[0]
            node_params = parse_qs(max_id_json)
            # parent comment ID for the next page
            max_id = node_params['root_comment_max_id'][0]
            params['root_comment_max_id'] = max_id
            data = html.xpath('//div[@node-type="root_comment"]')

            for i in data:
                # commenter's nickname
                nick_name = i.xpath('.//div[@class="WB_text"]/a/text()')[0]
                # comment text
                wb_text = i.xpath('.//div[@class="WB_text"][1]/text()')
                string = ''.join(wb_text).strip().replace('\n', '')
                write_comment(string)
                # comment ID, used to fetch the comment content
                comment_id = i.xpath('./@comment_id')[0]
                # URL of the image attached to the comment, if any
                pic_url = i.xpath('.//li[@class="WB_pic S_bg2 bigcursor"]/img/@src')
                pic_url = 'https:' + pic_url[0] if pic_url else ''
                download_pic(pic_url, nick_name)

Reposted from: https://blog.csdn.net/weixin_43837855/article/details/106802055