斗鱼直播
# coding=utf-8
'''
爬取斗鱼直播房间名和人气值
'''
# 导入selenium工具
import time
from selenium import webdriver
from lxml import etree
class Douyu(object):
    """Scrape room titles and popularity figures from Douyu's live directory.

    Pages are loaded through a PhantomJS-driven browser and parsed with lxml.
    Running totals are kept in ``room_count`` and ``hot_count``.
    """

    def __init__(self):
        # Browser instance used to load and paginate the directory.
        self.driver = webdriver.PhantomJS()
        # Running totals accumulated across every page parsed so far.
        self.room_count = 0  # number of rooms seen
        self.hot_count = 0  # summed popularity of those rooms

    @staticmethod
    def _parse_hot(text):
        """Convert a popularity label such as ``'1.5万'`` or ``'832'`` to an int.

        A trailing ``万`` multiplies the number by 10,000. Surrounding
        whitespace is ignored.

        Raises:
            ValueError: if the label is empty or not numeric.
        """
        text = text.strip()
        if text.endswith('万'):
            # round() rather than plain int(): the float product can land just
            # below the intended integer and would otherwise be truncated.
            return int(round(float(text[:-1]) * 10000))
        return int(text)

    def run(self):
        """Parse the page currently loaded in the driver and update the totals.

        Prints one line per room, followed by the cumulative room count and
        popularity total. Room cards that lack a title or popularity text
        node are skipped instead of raising ``IndexError``.
        """
        content = etree.HTML(self.driver.page_source)
        rooms = content.xpath('//li[@class="layout-Cover-item"]/div[@class="DyListCover HeaderCell is-href"]/a/div[@class="DyListCover-content"]')
        for room in rooms:
            names = room.xpath('./div[@class="DyListCover-info"]/h3[@class="DyListCover-intro"]/text()')
            hots = room.xpath('./div[@class="DyListCover-info"]/span[@class="DyListCover-hot"]/text()')
            if not names or not hots:
                continue
            roomname = names[0]
            hot = hots[0]
            print('人气:'+hot+';房间:'+roomname)
            self.room_count += 1
            self.hot_count += self._parse_hot(hot)
        # Report the cumulative totals after each page.
        print('当前直播房间总量:',self.room_count)
        print('当前人气总数:',self.hot_count)

    def test(self):
        """Open the directory and scrape every page, including the last one.

        The "next" pagination button's ``aria-disabled`` attribute decides
        whether another page follows. A missing attribute is treated as
        "no further page" so the loop always terminates.
        """
        self.driver.get('https://www.douyu.com/directory/all')
        page = 0
        while True:
            # Pause before reading the page (presumably to let it finish
            # rendering after the load or click).
            time.sleep(5)
            page += 1
            disabled = self.driver.find_element_by_class_name('dy-Pagination-next').get_attribute("aria-disabled")
            has_next = disabled is not None and disabled.lower() == 'false'
            if has_next:
                print('-'*30+'第' + str(page) + '页'+'-'*30)
            else:
                print('-'*30+'最后一页'+'-'*30)
            # Scrape before deciding to stop, so the final page is counted too.
            self.run()
            if not has_next:
                break
            self.driver.find_element_by_class_name('dy-Pagination-next').click()
if __name__ == '__main__':
    dy = Douyu()
    try:
        dy.test()
    finally:
        # Always shut the browser down so no PhantomJS process is left
        # running, even when scraping fails part-way through.
        dy.driver.quit()
虎牙直播
# coding=utf-8
'''
爬取虎牙直播房间名和人气值
'''
# 导入selenium工具
import time
from selenium import webdriver
from lxml import etree
class Huya(object):
    """Scrape room titles and popularity figures from Huya's live directory.

    Pages are loaded through a PhantomJS-driven browser and parsed with lxml.
    Running totals are kept in ``room_count`` and ``hot_count``.
    """

    def __init__(self):
        # Browser instance used to load and paginate the directory.
        self.driver = webdriver.PhantomJS()
        # Running totals accumulated across every page parsed so far.
        self.room_count = 0  # number of rooms seen
        self.hot_count = 0  # summed popularity of those rooms

    @staticmethod
    def _parse_hot(text):
        """Convert a popularity label such as ``'1.5万'`` or ``'832'`` to an int.

        A trailing ``万`` multiplies the number by 10,000. Surrounding
        whitespace is ignored.

        Raises:
            ValueError: if the label is empty or not numeric.
        """
        text = text.strip()
        if text.endswith('万'):
            # round() rather than plain int(): the float product can land just
            # below the intended integer and would otherwise be truncated.
            return int(round(float(text[:-1]) * 10000))
        return int(text)

    def run(self):
        """Parse the page currently loaded in the driver and update the totals.

        Prints one line per room, followed by the cumulative room count and
        popularity total. Room items that lack a title or popularity text
        node are skipped instead of raising ``IndexError``.
        """
        content = etree.HTML(self.driver.page_source)
        rooms = content.xpath('//li[@class="game-live-item"]')
        for room in rooms:
            names = room.xpath('./a[@class="title new-clickstat"]/text()')
            hots = room.xpath('./span[@class="txt"]/span[@class="num"]/i[@class="js-num"]/text()')
            if not names or not hots:
                continue
            roomname = names[0]
            hot = hots[0]
            print('房间:'+roomname+'; 人气:'+str(hot))
            self.room_count += 1
            self.hot_count += self._parse_hot(hot)
        # Report the cumulative totals after each page.
        print('当前直播房间总量:',self.room_count)
        print('当前人气总数:',self.hot_count)

    def test(self):
        """Open the directory and scrape every page, including the last one.

        Another page is assumed to follow while the page source still
        contains the ``laypage_next`` class name.
        """
        self.driver.get('https://www.huya.com/l')
        page = 0
        while True:
            # Pause before reading the page (presumably to let it finish
            # rendering after the load or click).
            time.sleep(5)
            page += 1
            has_next = 'laypage_next' in self.driver.page_source
            if has_next:
                print('-'*30+'第' + str(page) + '页'+'-'*30)
            else:
                # The original multiplied '-' by a string here, which raised
                # TypeError; the banner is meant to be 30 dashes on each side.
                print('-'*30+'最后一页'+'-'*30)
            # Scrape before deciding to stop, so the final page is counted too.
            self.run()
            if not has_next:
                break
            self.driver.find_element_by_class_name('laypage_next').click()
if __name__ == '__main__':
    huya = Huya()
    try:
        huya.test()
    finally:
        # Always shut the browser down so no PhantomJS process is left
        # running, even when scraping fails part-way through.
        huya.driver.quit()
总结
- xpath 要填写正确
- 每个网站的翻页方式不同
- 灵活使用 find_element_by_class_name 方法以及 get_attribute 方法
转载:https://blog.csdn.net/weixin_43595176/article/details/102411605
查看评论