The previous post got taken down, so this is a rewrite. A disclaimer up front: this is just a toy crawler of mine. It collects the URLs of all my own blog posts and then visits them at random.
The idea is simple: there are two classes, IPSpyder and CSDN. The former makes sure a fresh batch of IP proxies is fetched to a local file at most once a week. The latter has three methods for reading posts at random: getBlogList() takes my blog homepage URL as input and outputs the links to all of my posts; getBlogTitleAndCount() takes a single post URL, fetches that post's title and view count, and prints them; beginTO() ties the two together.
For the proxy-scraping part, see the earlier post: scraping IP proxies (爬取IP代理).
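The once-a-week guarantee comes down to a freshness test on the local proxy file. Here is a minimal sketch of that test (the helper name is_proxy_file_stale is mine, not from the code below, which inlines the same condition):

import os
import time

def is_proxy_file_stale(path, max_age_days=7):
    # Stale when the file is missing, empty, or older than max_age_days.
    # The script below uses st_ctime (creation time on Windows);
    # st_mtime would track the last refresh instead.
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        return True
    return (time.time() - os.stat(path).st_ctime) / (3600 * 24) > max_age_days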
import requests
import lxml
from bs4 import BeautifulSoup
import os
import string
import random
import time
import aiohttp
import asyncio
from tqdm import tqdm
import datetime

class IPSpyder(object):

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
        self.sixsix_url_range = 35  # pages to crawl on 66ip.cn
        self.kaixin_url_range = 2   # pages to crawl on kxdaili.com
        self.kuai_url_range = 2     # pages to crawl on kuaidaili.com
        self.ip_list_all = []       # every proxy scraped
        self.ip_ok_list_all = []    # proxies that pass the availability test
        self.url = 'https://blog.csdn.net/yezonggang/article/details/112991188'
        self.ip_avaliable_file = 'F:/2020-11-04/csdn_sypder_20210124/ip_avaliable.txt'

    def get_html(self, url, flag):
        try:
            headers = self.headers
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            if flag:  # flag=True for utf-8 pages, False for gb2312 pages
                response.encoding = 'utf-8'
            else:
                response.encoding = 'gb2312'
            return response.text
        except Exception as err:
            return 'request failed'

    def get_66ip(self):
        for index in range(1, self.sixsix_url_range):
            count = 0
            province = ''
            url = 'http://www.66ip.cn/areaindex_{}/1.html'.format(index)
            html = self.get_html(url, flag=False)
            soup = BeautifulSoup(html, 'lxml')
            tr_list = soup.find_all(name='tr')
            for tr_ in tr_list[2:]:  # skip the header rows
                td_list = tr_.find_all(name='td')
                ip = td_list[0].string
                port = td_list[1].string
                province = td_list[2].string
                ip_port = ip + ':' + port
                self.ip_list_all.append(ip_port)
                count += 1
            print('Saved {0} {1} ip.'.format(province, count))
            # Don't crawl too fast, or the site stops returning page content
            time.sleep(3)
        print('66 daili Finished!!!')

    def get_kaixinip(self):
        for index in range(1, self.kaixin_url_range):
            count = 0
            url = 'http://www.kxdaili.com/dailiip/1/{}.html'.format(index)
            html = self.get_html(url, False)
            soup = BeautifulSoup(html, 'lxml')
            tr_list = soup.find_all(name='tr')
            for tr_ in tr_list[2:]:
                td_list = tr_.find_all(name='td')
                ip = td_list[0].string
                port = td_list[1].string
                ip_port = ip + ':' + port
                self.ip_list_all.append(ip_port)
                count += 1
            print('Saved {0} page {1} ip.'.format(index, count))
            # Don't crawl too fast, or the site stops returning page content
            time.sleep(3)
        print('kaixindaili Finished!!!')

    def get_goubanjiaip(self):
        url = 'http://www.goubanjia.com/'
        html = self.get_html(url, False)
        soup = BeautifulSoup(html, 'lxml')
        td_list = soup.find_all(class_='ip')
        for td_ in td_list:
            ip_ = ''
            # walk the child nodes and keep only the visible pieces of
            # the address (the page mixes in hidden decoy nodes)
            for child in td_.children:
                if child == ':':
                    ip_ += child
                elif not child.attrs:
                    ip_ += child.get_text()
                elif list(child.attrs.keys())[0] == 'class':
                    ip_ = ip_ + child.get_text()
                elif (child.attrs['style'] == 'display:inline-block;'
                      or child.attrs['style'] == 'display: inline-block;'):
                    ip_ += child.get_text()
            self.ip_list_all.append(ip_)
        print('quanwang daili Finished!!!')

    # kuaidaili
    def get_kuaidaili(self):
        for index in range(1, self.kuai_url_range):
            count = 0
            url = 'https://www.kuaidaili.com/free/inha/{}/'.format(index)
            html = self.get_html(url, False)
            soup = BeautifulSoup(html, 'lxml')
            tr_list = soup.find_all(name='tr')
            for tr_ in tr_list[1:]:  # skip the header row
                td_list = tr_.find_all(name='td')
                ip = td_list[0].string
                port = td_list[1].string
                ip_port = ip + ':' + port
                self.ip_list_all.append(ip_port)
                count += 1
            print('Saved {0} page {1} ip.'.format(index, count))
            # Don't crawl too fast, or the site stops returning page content
            time.sleep(3)
        print('kuaidaili Finished!!!')

    async def test_ip(self, ip_, url):
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                proxy_ip = 'http://' + ip_
                print('Testing: ' + proxy_ip)
                async with session.get(url=url, headers=self.headers,
                                       proxy=proxy_ip, timeout=15) as response:
                    if response.status == 200:
                        print('Proxy OK: ' + ip_)
                        self.ip_ok_list_all.append(ip_)
                    else:
                        print('Bad response status ' + ip_)
            except:
                print('Proxy request failed', ip_)

    def run_test_ip_write_to_file(self):
        # Body withheld by the author: 'like, follow, and DM me on CSDN' ^-^
        print('like, follow, and DM me on CSDN')

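    # The method above is withheld, so here is a minimal sketch of what it
    # plausibly does, assuming the async test_ip() above and the
    # comma-separated file format that the main logic below reads with
    # split(','). This is a reconstruction, not the author's code:
    def run_test_ip_write_to_file_sketch(self):
        loop = asyncio.get_event_loop()
        tasks = [self.test_ip(ip_, self.url) for ip_ in self.ip_list_all]
        loop.run_until_complete(asyncio.gather(*tasks))
        with open(self.ip_avaliable_file, 'w') as f:
            f.write(','.join(self.ip_ok_list_all))
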
# My blog list URL; the page index (list/1, list/2, ...) is appended below
# How many pages does my blog list have? -> self.my_list
# Define the CSDN class
# Example post: csdn_url='https://blog.csdn.net/yezonggang/article/details/106344148'

class CSDN(object):

    def __init__(self):
        self.my_csdn = 'https://blog.csdn.net/yezonggang/article/list/'
        self.my_list = 5  # number of list pages
        self.csdn_url = ''
        self.proxies = [{'http': 'socks5://183.195.106.118:8118'}]
        self.blogList = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        }

    # Walk the blog list pages and push every post URL into self.blogList
    def getBlogList(self):
        i = 1
        print('-------------------------------begin----------------------------')
        while i <= self.my_list:
            response = requests.get(self.my_csdn + str(i), headers=self.headers)
            response.encoding = 'utf-8'
            content = response.content.decode('utf-8')
            soup = BeautifulSoup(content, 'lxml')
            a_tag_content = soup.findAll('a')
            for a_tag in a_tag_content:
                href = str(a_tag.get('href'))
                # keep post links ('details') but skip comment links
                if 'details' in href and 'comments' not in href:
                    self.blogList.append(href)
            print('Success, already append ' +
                  str(len(self.blogList)) + ' to the blogList!')
            i = i + 1

    # Fetch the randomly chosen post, extract its title and view count, print
    def getBlogTitleAndCount(self, proxy):
        proxy_support = {
            'http': 'http://' + proxy,
            'https': 'https://' + proxy,
        }
        response = requests.get(
            self.csdn_url, headers=self.headers, proxies=proxy_support)
        response.encoding = 'utf-8'
        content = response.content.decode('utf-8')
        soup = BeautifulSoup(content, 'lxml')

        # Post title, e.g. '数据挖掘算法和实践(二十一):kaggle经典-职场离职率分析案例解读'
        blog_title = soup.title.string
        # The view count lives in a <span class='read-count'> element
        blog_counts = soup.find_all('span')
        for blog_count in blog_counts:
            blog_count_single_class = blog_count.get('class')
            if (blog_count_single_class is not None
                    and blog_count_single_class[0] == 'read-count'):
                blog_count_now = blog_count.string
                print('Current post URL: 【' + self.csdn_url + '】\n' +
                      'Current post title: 【' + blog_title + '】\n' +
                      'Current proxy IP: 【' + proxy + '】\n' +
                      'Current view count: 【_' + blog_count_now + '_】')

    def beginTO(self, proxy):
        self.getBlogList()
        self.csdn_url = random.choice(self.blogList)
        self.getBlogTitleAndCount(proxy)
        # random_time = random.uniform(sleepTimeMin, sleepTimeMax)
        # print('Begin to sleep now, sleep time: ' + str(random_time))
        # time.sleep(random_time)
        self.blogList = []

# Main logic: if the local proxy file is missing, empty, or older than a week,
# refresh the proxies first; otherwise start hitting posts right away.
ip_avaliable = "F:/2020-11-04/csdn_sypder_20210124/ip_avaliable.txt"
# os.stat() would crash on a missing file, so test existence first and let
# the 'or' short-circuit.
if (not os.path.exists(ip_avaliable)
        or (time.time() - os.stat(ip_avaliable).st_ctime) / (3600 * 24) > 7
        or not os.path.getsize(ip_avaliable)):
    # Refresh the proxies before refreshing the blog counts
    ipSpyder = IPSpyder()
    ipSpyder.get_66ip()
    # ipSpyder.get_kaixinip()
    # ipSpyder.get_goubanjiaip()
    # ipSpyder.get_kuaidaili()
    ipSpyder.run_test_ip_write_to_file()

# Load the tested proxies and start the refresh loop
file_ip = open(ip_avaliable, 'r')
ip_avaliable_list = file_ip.read().split(",")
file_ip.close()

proxy_now = random.choice(ip_avaliable_list)
csdn = CSDN()
while True:
    print('like, follow, and DM me on CSDN')  # the author's self-promo line
    csdn.beginTO(proxy_now)
    time.sleep(10)
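Note that proxy_now is drawn once, outside the loop, so every iteration reuses the same proxy. A small variation (mine, not the original's) would rotate proxies by moving the draw inside the loop:

while True:
    proxy_now = random.choice(ip_avaliable_list)
    csdn.beginTO(proxy_now)
    time.sleep(10)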
Sample output:
Success, already append 48 to the blogList!
Success, already append 96 to the blogList!
Success, already append 144 to the blogList!
Success, already append 192 to the blogList!
Success, already append 211 to the blogList!
Current post URL: 【https://blog.csdn.net/yezonggang/article/details/105723456】
Current post title: 【数据挖掘算法和实践(一):线性回归和逻辑回归(house_price数据集)_叶子叶来-CSDN博客】
Current proxy IP: 【211.144.213.145:80】
Current view count: 【_351_】
Reposted from: https://blog.csdn.net/yezonggang/article/details/113128082