
Web Crawler: Scraping Weibo Trending Posts


  
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import time
import pymongo

base_url = 'https://weibo.com/a/aj/transform/loadingmoreunlogin?'
''' Ajax requests captured in the browser's network panel; only the
page number and the __rnd timestamp change between calls:
https://weibo.com/a/aj/transform/loadingmoreunlogin?
ajwvr=6&category=0&page=2&lefnav=0&cursor=&__rnd=1583127847996
ajwvr=6&category=0&page=3&lefnav=0&cursor=&__rnd=1583128195155
ajwvr=6&category=0&page=4&lefnav=0&cursor=&__rnd=1583128249888
ajwvr=6&category=0&page=5&lefnav=0&cursor=&__rnd=1583128278698
ajwvr=6&category=0&page=6&lefnav=0&cursor=&__rnd=1583128283196
'''

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    'Referer': 'https://weibo.com/',
    'Host': 'weibo.com',
    'X-Requested-With': 'XMLHttpRequest'
}

# Current time in milliseconds, used as the __rnd cache-busting parameter
get_now = lambda: int(time.time() * 1000)

def get_page(page):
    # Build the query string for one page of the feed and fetch it
    params = {
        'ajwvr': '6',
        'category': '0',
        'page': page,
        'lefnav': '0',
        'cursor': '',
        '__rnd': get_now()
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
    except requests.ConnectionError as e:
        print('Error', e.args)

client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client.test
collections = db.weibos

def parse_page(html_text):
    # Parse the HTML fragment returned by the Ajax endpoint
    html = BeautifulSoup(html_text, 'lxml')
    items = html.find_all('div', class_='list_des')
    for item in items:
        title = item.find('h3', class_='list_title_s').text
        author = item.find_all('span', class_='subinfo S_txt2')[0].text
        post_time = item.find_all('span', class_='subinfo S_txt2')[1].text
        # The <em> tags hold the repost / comment / like counts in turn
        nums = item.find_all('em')
        reposts = nums[1].text
        comments = nums[3].text
        likes = nums[5].text
        weibo = {
            '题目': title,      # title
            '作者': author,     # author
            '时间': post_time,  # post time
            '转发': reposts,    # reposts
            '评论': comments,   # comments
            '点赞': likes       # likes
        }
        collections.insert_one(weibo)

def main():
    for i in range(1, 50):
        text = get_page(i)
        # The response embeds the HTML as \uXXXX escapes; re-encode and
        # decode to recover the raw markup, then un-escape the slashes
        text = text.encode('utf8').decode('unicode_escape')
        text = text.replace('\\/', '/')
        parse_page(text)
        time.sleep(0.5)

main()
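
For reference, the params dict in get_page() reproduces the query strings captured above. A quick standalone check, using nothing beyond the standard library:

from urllib.parse import urlencode
import time

params = {
    'ajwvr': '6',
    'category': '0',
    'page': 2,
    'lefnav': '0',
    'cursor': '',
    '__rnd': int(time.time() * 1000),
}
# Prints a URL with the same shape as the captured requests, e.g.
# ...loadingmoreunlogin?ajwvr=6&category=0&page=2&lefnav=0&cursor=&__rnd=1583127847996
print('https://weibo.com/a/aj/transform/loadingmoreunlogin?' + urlencode(params))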
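
The re-encode/decode step in main() is the subtlest part: the endpoint wraps its HTML in JSON, so the markup arrives as \uXXXX escapes with escaped slashes. A minimal sketch with a made-up sample string (not a live response) shows what the round trip recovers:

# Hypothetical escaped payload imitating what the endpoint returns
raw = '\\u5fae\\u535a <a href=\\"https:\\/\\/weibo.com\\">\\u94fe\\u63a5<\\/a>'

# unicode_escape turns \uXXXX and \" back into characters; it treats the
# bytes as Latin-1, so this round trip is only safe while the escaped
# payload is pure ASCII, as it is here
decoded = raw.encode('utf8').decode('unicode_escape')
# The JSON encoder escapes forward slashes; undo that separately
decoded = decoded.replace('\\/', '/')

print(decoded)  # 微博 <a href="https://weibo.com">链接</a>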
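
After a run, the documents can be read back with the same pymongo client to sanity-check the scrape. This readback snippet is not part of the original script, just a quick verification against the test.weibos collection the scraper writes to:

import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017/')
collection = client.test.weibos

# Print the first few stored posts to confirm the fields came through
for doc in collection.find().limit(5):
    print(doc['题目'], doc['作者'], doc['转发'], doc['评论'], doc['点赞'])

# Total number of posts scraped so far
print(collection.count_documents({}))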

 


Reposted from: https://blog.csdn.net/qq_3302860184/article/details/104614869