-
import requests
-
from bs4
import BeautifulSoup
-
from urllib.parse
import urlencode
-
import time
-
import pymongo
-
-
# Base URL of Weibo's unauthenticated "load more" AJAX endpoint; the query
# string is appended by get_page().
base_url = 'https://weibo.com/a/aj/transform/loadingmoreunlogin?'

# Example request URLs observed in the browser — `page` increments and
# `__rnd` is a millisecond timestamp used as a cache-buster:
''' https://weibo.com/a/aj/transform/loadingmoreunlogin?
ajwvr=6&category=0&page=2&lefnav=0&cursor=&__rnd=1583127847996
ajwvr=6&category=0&page=3&lefnav=0&cursor=&__rnd=1583128195155
ajwvr=6&category=0&page=4&lefnav=0&cursor=&__rnd=1583128249888
ajwvr=6&category=0&page=5&lefnav=0&cursor=&__rnd=1583128278698
ajwvr=6&category=0&page=6&lefnav=0&cursor=&__rnd=1583128283196
'''

# Request headers mimicking a real browser AJAX call.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    # Fixed typo: the original key was 'Refer'; the real HTTP header is
    # 'Referer', so the server never saw the intended referrer.
    'Referer': 'https://weibo.com/',
    'Host': 'weibo.com',
    'X-Requested-With': 'XMLHttpRequest',
}


def get_now():
    """Return the current time as an integer millisecond Unix timestamp."""
    # PEP 8 (E731): use a def instead of assigning a lambda to a name.
    return int(time.time() * 1000)
-
def get_page(page):
    """Fetch one page of the Weibo "load more" feed.

    Args:
        page: 1-based page number to request.

    Returns:
        The response body text on HTTP 200, otherwise None (also on
        connection errors, which are printed and swallowed).
    """
    params = {
        'ajwvr': '6',
        'category': '0',
        'page': page,
        'lefnav': '0',
        'cursor': '',
        '__rnd': get_now(),  # millisecond timestamp, mirrors the browser's cache-buster
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
    except requests.ConnectionError as e:
        # Best-effort: report and continue; the caller must handle None.
        print('Error', e.args)
    return None  # explicit: non-200 responses and connection errors yield None
-
-
-
# MongoDB connection to a local server (fixed typo: "cilent" -> "client";
# the name was only used in this statement group).
client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client.test
# NOTE(review): `collections` shadows the stdlib `collections` module; the
# name is kept because parse_page() references it at module level.
collections = db.weibos
-
-
-
def parse_page(html_text):
    """Parse one AJAX feed fragment and store each post in MongoDB.

    Args:
        html_text: decoded HTML fragment as returned by get_page()
            (after unicode-escape decoding in main()).
    """
    # Renamed parameter from `str` (shadowed the builtin) to `html_text`;
    # callers pass it positionally, so the interface is unchanged.
    html = BeautifulSoup(html_text, 'lxml')
    items = html.find_all('div', class_='list_des')
    for item in items:
        title = item.find('h3', class_='list_title_s').text
        # Hoisted: the original called find_all('span', ...) twice per item.
        subinfo = item.find_all('span', class_='subinfo S_txt2')
        author = subinfo[0].text
        # Renamed from `time` to avoid shadowing the `time` module.
        post_time = subinfo[1].text
        # Counters live in <em> pairs (icon, value): indexes 1/3/5 are
        # reposts, comments and likes respectively — presumably; verify
        # against the live page markup.
        nums = item.find_all('em')
        reposts = nums[1].text
        comments = nums[3].text
        likes = nums[5].text
        # Keys are intentionally Chinese (题目/作者/时间/转发/评论/点赞 =
        # title/author/time/reposts/comments/likes) — kept byte-identical.
        weibo = {
            '题目': title,
            '作者': author,
            '时间': post_time,
            '转发': reposts,
            '评论': comments,
            '点赞': likes,
        }
        collections.insert_one(weibo)
-
-
def main():
    """Crawl feed pages 1-49, decode each response and persist posts."""
    for page in range(1, 50):
        # Renamed local from `str` (shadowed the builtin) to `page_text`.
        page_text = get_page(page)
        if page_text is None:
            # get_page returns None on non-200 or connection error; the
            # original crashed here with AttributeError on .encode.
            continue
        # Re-encode then unicode_escape-decode so literal \uXXXX sequences
        # in the AJAX payload become real characters.
        page_text = page_text.encode('utf8').decode('unicode_escape')
        # Un-escape JSON-style forward slashes (\/ -> /).
        page_text = page_text.replace('\\/', '/')
        parse_page(page_text)
        time.sleep(0.5)  # throttle between requests to avoid being blocked
-
-
main()
# Adapted from: https://blog.csdn.net/qq_3302860184/article/details/104614869