小言_互联网的博客

python爬虫(五):csdn博客访问量如何增加(增加ip代理)

650人阅读  评论(0)

首先声明:这只是个玩具爬虫,先得到自己的所有博客地址,然后随机访问,最后增加TamperMonkey插件

python爬虫

思想很简单,包含了2个类:IPSpyder类和CSDN类。前者保证一周内get一次ip代理到本地;后者包含3个方法,负责随机读取博客:getBlogList()方法的输入是个人博客的主页地址,输出是个人博客所有的链接;getBlogTitleAndCount()的输入是单个博客的url地址,拿到当前博客的访问量和标题并输出;

后续优化:

  •  增加tqdm的进度条显示;
  • 考虑多线程方式

IP代理的爬虫参考:爬取IP代理


  
  1. import requests
  2. import lxml
  3. from bs4 import BeautifulSoup
  4. import os
  5. import string
  6. import random
  7. import time
  8. import aiohttp
  9. import asyncio
  10. from tqdm import tqdm
  11. import os
  12. import datetime
  class IPSpyder(object):
      """Collect candidate proxy addresses ("ip:port") from public listing sites.

      Scraped addresses accumulate in ``ip_list_all``; addresses that answer a
      test request with HTTP 200 accumulate in ``ip_ok_list_all``.
      """

      def __init__(self):
          self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
          # Exclusive upper bounds for the page indexes walked by each scraper.
          self.sixsix_url_range = 35
          self.kaixin_url_range = 2
          self.kuai_url_range = 2
          self.ip_list_all = []
          self.ip_ok_list_all = []
          self.url = 'https://blog.csdn.net/yezonggang/article/details/112991188'
          self.ip_avaliable_file = 'F:/2020-11-04/csdn_sypder_20210124/ip_avaliable.txt'

      def get_html(self, url, flag, timeout=15):
          """Fetch ``url`` and return its text, or '请求异常' when the request fails.

          Args:
              url: Page address to download.
              flag: Truthy to decode the body as utf-8, falsy to decode as gb2312.
              timeout: Seconds to wait before giving up, so a dead site cannot
                  hang the scraper forever.
          """
          try:
              response = requests.get(url, headers=self.headers, timeout=timeout)
              response.raise_for_status()
          except requests.RequestException:
              # Best effort: callers parse the marker string and find no rows.
              return '请求异常'
          response.encoding = 'utf-8' if flag else 'gb2312'
          return response.text

      def _scrape_proxy_table(self, url, skip_rows, flag=False):
          """Append every "ip:port" found in the table rows of ``url``.

          The first ``skip_rows`` ``<tr>`` elements are skipped. Rows that do
          not carry a non-empty IP and port cell are ignored instead of raising.

          Returns:
              A list with the stripped cell texts of every accepted row.
          """
          soup = BeautifulSoup(self.get_html(url, flag), 'lxml')
          accepted = []
          for row in soup.find_all(name='tr')[skip_rows:]:
              cells = [cell.get_text(strip=True) for cell in row.find_all(name='td')]
              if len(cells) < 2 or not cells[0] or not cells[1]:
                  continue
              self.ip_list_all.append(cells[0] + ':' + cells[1])
              accepted.append(cells)
          return accepted

      def get_66ip(self):
          """Scrape the www.66ip.cn area pages 1 .. sixsix_url_range - 1."""
          for index in range(1, self.sixsix_url_range):
              url = 'http://www.66ip.cn/areaindex_{}/1.html'.format(index)
              rows = self._scrape_proxy_table(url, skip_rows=2)
              # The third cell of the last accepted row is reported as the province.
              province = rows[-1][2] if rows and len(rows[-1]) > 2 else ''
              print('Saved {0} {1} ip.'.format(province, len(rows)))
              # Throttle: requesting too fast yields pages without content.
              time.sleep(3)
          print('66 daili Finished!!!')

      def get_kaixinip(self):
          """Scrape the www.kxdaili.com pages 1 .. kaixin_url_range - 1."""
          for index in range(1, self.kaixin_url_range):
              url = 'http://www.kxdaili.com/dailiip/1/{}.html'.format(index)
              rows = self._scrape_proxy_table(url, skip_rows=2)
              print('Saved {0} page {1} ip.'.format(index, len(rows)))
              # Throttle: requesting too fast yields pages without content.
              time.sleep(3)
          print('kaixindaili Finished!!!')

      def get_goubanjiaip(self):
          """Scrape www.goubanjia.com, rebuilding each address from its child nodes.

          Each element with class ``ip`` is walked child by child. A bare ':'
          text node is kept; a tag is kept when it has no attributes, when its
          first attribute is ``class``, or when its ``style`` is
          ``display:inline-block`` (either spelling). Everything else is skipped.
          """
          shown_styles = ('display:inline-block;', 'display: inline-block;')
          soup = BeautifulSoup(self.get_html('http://www.goubanjia.com/', False), 'lxml')
          for td_ in soup.find_all(class_='ip'):
              pieces = []
              for child in td_.children:
                  attrs = getattr(child, 'attrs', None)
                  if attrs is None:
                      # Text node: it has no ``attrs``, so only the ':' separator
                      # is kept; probing ``.attrs`` on it would raise.
                      if child == ':':
                          pieces.append(str(child))
                  elif not attrs or next(iter(attrs)) == 'class':
                      pieces.append(child.get_text())
                  elif attrs.get('style') in shown_styles:
                      pieces.append(child.get_text())
              ip_ = ''.join(pieces)
              if ip_:
                  self.ip_list_all.append(ip_)
          print('quanwang daili Finished!!!')

      def get_kuaidaili(self):
          """Scrape the www.kuaidaili.com free pages 1 .. kuai_url_range - 1."""
          for index in range(1, self.kuai_url_range):
              url = 'https://www.kuaidaili.com/free/inha/{}/'.format(index)
              rows = self._scrape_proxy_table(url, skip_rows=1)
              print('Saved {0} page {1} ip.'.format(index, len(rows)))
              # Throttle: requesting too fast yields pages without content.
              time.sleep(3)
          print('kuaidaili Finished!!!')

      async def test_ip(self, ip_, url):
          """Request ``url`` through proxy ``ip_``; record it on an HTTP 200 answer.

          Args:
              ip_: Proxy address as "ip:port".
              url: Address requested through the proxy.
          """
          # ``ssl=False`` replaces the deprecated ``verify_ssl=False``.
          conn = aiohttp.TCPConnector(ssl=False)
          async with aiohttp.ClientSession(connector=conn) as session:
              try:
                  proxy_ip = 'http://' + ip_
                  print('正在测试: ' + proxy_ip)
                  async with session.get(url=url, headers=self.headers, proxy=proxy_ip,
                                         timeout=aiohttp.ClientTimeout(total=15)) as response:
                      if response.status == 200:
                          print('代理可用: ' + ip_)
                          self.ip_ok_list_all.append(ip_)
                      else:
                          print('请求响应码不合法 ' + ip_)
              except Exception:
                  # Any failure just means this proxy is unusable. ``Exception``
                  # (not a bare except) lets task cancellation and Ctrl-C through.
                  print('代理请求失败', ip_)

      def run_test_ip_write_to_file(self):
          """Placeholder: the published listing omits the implementation.

          As published, this method only prints a notice; it neither tests the
          collected proxies nor writes ``ip_avaliable_file``.
          """
          print('csdn 点赞关注私聊发')
  133. # 我的博客列表,后面要跟翻页list/1
  134. # 我的博客列表有几页?
  135. # header
  136. # 定义一个类 CSDN
  137. # csdn_url='https://blog.csdn.net/yezonggang/article/details/106344148'
  class CSDN(object):
      """Visit a randomly chosen article from a CSDN blog's list pages via a proxy."""

      def __init__(self):
          # Base address of the paginated article list; the page number is appended.
          self.my_csdn = 'https://blog.csdn.net/yezonggang/article/list/'
          # Number of list pages to walk (pages 1 .. my_list inclusive).
          self.my_list = 5
          self.csdn_url = ''
          self.proxies = [{'http': 'socks5://183.195.106.118:8118'}]
          self.blogList = []
          self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
                          }

      @staticmethod
      def _proxy_mapping(proxy):
          """Return the ``proxies`` mapping for the plain HTTP proxy "ip:port".

          Both schemes use an ``http://`` proxy URL: an ``https://`` proxy URL
          would make the client open a TLS connection to the proxy itself.
          Surrounding whitespace (e.g. a trailing newline read from a file) is
          dropped.
          """
          address = 'http://' + proxy.strip()
          return {'http': address, 'https': address}

      def getBlogList(self, timeout=15):
          """Append every article link from list pages 1 .. my_list to ``blogList``.

          A link is kept when its href contains 'details' and not 'comments'.

          Args:
              timeout: Seconds to wait for each list page.
          """
          print('-------------------------------begin----------------------------')
          for page in range(1, self.my_list + 1):
              response = requests.get(self.my_csdn + str(page), headers=self.headers,
                                      timeout=timeout)
              content = response.content.decode('utf-8', errors='replace')
              soup = BeautifulSoup(content, 'lxml')
              for a_tag in soup.find_all('a'):
                  href = str(a_tag.get('href'))
                  if 'details' in href and 'comments' not in href:
                      self.blogList.append(href)
              print('Success, already append ' +
                    str(len(self.blogList)) + ' to the blogList!')

      def getBlogTitleAndCount(self, proxy, timeout=15):
          """Fetch ``self.csdn_url`` through ``proxy`` and print title and read count.

          Args:
              proxy: Proxy address as "ip:port".
              timeout: Seconds to wait for the article page.
          """
          response = requests.get(self.csdn_url, headers=self.headers,
                                  proxies=self._proxy_mapping(proxy), timeout=timeout)
          content = response.content.decode('utf-8', errors='replace')
          soup = BeautifulSoup(content, 'lxml')
          # The page may have no <title>, or one without a single string child.
          title_tag = soup.title
          blog_title = str(title_tag.string) if title_tag and title_tag.string else ''
          # Read counter: the last <span> carrying the class 'read-count'. A
          # fallback keeps the report from failing when the page has none.
          counters = soup.find_all('span', class_='read-count')
          blog_count_now = counters[-1].get_text(strip=True) if counters else '未知'
          print('当前读取的博客地址是:【' + self.csdn_url + '】\n' +
                '当前读取的博客标题是:【' + blog_title + '】\n' +
                '当前使用的代理IP是:【' + proxy + '】\n' +
                '当前博客的阅读统计是:【_' + blog_count_now + '_次】')

      def beginTO(self, proxy):
          """Refresh the article list, visit one random article, then clear the list.

          Args:
              proxy: Proxy address as "ip:port" used for the article request.
          """
          self.getBlogList()
          try:
              if not self.blogList:
                  print('No blog links found; nothing to visit.')
                  return
              self.csdn_url = random.choice(self.blogList)
              self.getBlogTitleAndCount(proxy)
          finally:
              # Always reset so the next call does not accumulate duplicates,
              # even when the visit raised.
              self.blogList = []
  def ip_file_needs_refresh(path, max_age_days=7):
      """Return True when the proxy file at ``path`` must be regenerated.

      That is the case when the file is missing, is empty, or was last written
      more than ``max_age_days`` days ago. The modification time is used
      because the creation/change time does not track rewrites of the file.
      """
      try:
          info = os.stat(path)
      except OSError:
          # Missing or unreadable file: nothing usable is cached.
          return True
      if info.st_size == 0:
          return True
      age_days = (time.time() - info.st_mtime) / (3600 * 24)
      return age_days > max_age_days


  if __name__ == '__main__':
      # Entry point: refresh the local proxy file when it is missing, empty or
      # older than a week, then start visiting.
      ip_avaliable = "F:/2020-11-04/csdn_sypder_20210124/ip_avaliable.txt"
      if ip_file_needs_refresh(ip_avaliable):
          # Collect proxies first, then visit the blog.
          ipSpyder = IPSpyder()
          ipSpyder.get_66ip()
          #ipSpyder.get_kaixinip()
          #ipSpyder.get_goubanjiaip()
          #ipSpyder.get_kuaidaili()
          ipSpyder.run_test_ip_write_to_file()
      if not os.path.exists(ip_avaliable):
          raise SystemExit('Proxy file not found: ' + ip_avaliable)
      with open(ip_avaliable, 'r') as file_ip:
          # The file holds comma-separated "ip:port" entries; drop blanks so an
          # empty string can never be chosen as the proxy.
          ip_avaliable_list = [entry.strip() for entry in file_ip.read().split(",")
                               if entry.strip()]
      if not ip_avaliable_list:
          raise SystemExit('Proxy file has no entries: ' + ip_avaliable)
      proxy_now = random.choice(ip_avaliable_list)
      csdn = CSDN()
      while True:
          print('csdn 点赞关注私聊发')
          csdn.beginTO(proxy_now)
          time.sleep(10)
  222. #csdn 点赞关注私聊发^-^
  223. #ipSpyder =IPSpyder()
  224. # ipSpyder.get_66ip()
  225. # ipSpyder.get_kaixinip()
  226. # ipSpyder.get_goubanjiaip()
  227. # ipSpyder.get_kuaidaili()
  228. # ipSpyder.run_test_ip()
  229. #
  230. # time.localtime(statinfo)
  231. #print ('得到了一系列的IP代理,总共有 '+str(len(ipSpyder.ip_list_all))+' 个;')
  232. #print ('经过测试总共有 '+str(len(ipSpyder.ip_ok_list_all))+' 个IP代理可用;')
  233. #file = open("ip_avaliable.txt", 'w')
  234. # file.write(ip_ok_list_all)
  235. # file.close()

输出的范例如下:


  
  1. Success, already append 48 to the blogList!
  2. Success, already append 96 to the blogList!
  3. Success, already append 144 to the blogList!
  4. Success, already append 192 to the blogList!
  5. Success, already append 211 to the blogList!
  6. 当前读取的博客地址是:【https://blog.csdn.net/yezonggang/article/details/105723456】
  7. 当前读取的博客地址是:【数据挖掘算法和实践(一):线性回归和逻辑回归(house_price数据集)_叶子叶来-CSDN博客】
  8. 当前使用的代理IP是:【211.144.213.145:80】
  9. 当前博客的阅读统计是:【_351_次】

 


转载:https://blog.csdn.net/yezonggang/article/details/112991188
查看评论
* 以上用户言论只代表其个人观点,不代表本网站的观点或立场