首先声明:这只是个玩具爬虫——先获取自己所有博客的地址,然后随机访问;最后再增加 TamperMonkey 插件。
python爬虫
思想很简单,包含 2 个类:IPSpyder 和 CSDN。前者保证一周内把 IP 代理抓取到本地一次;后者包含 3 个方法,负责随机读取博客:getBlogList() 以个人博客的列表页地址(保存在 self.my_csdn 中)为输入,输出个人博客的所有文章链接;getBlogTitleAndCount() 的输入是单个博客的 url 地址,拿到当前博客的访问量和标题并输出;
后续优化:
- 增加 tqdm 的进度条显示;
- 考虑多线程方式
IP代理的爬虫参考:爬取IP代理
  
   - 
    
     
    
    
     
      import requests
     
    
- 
    
     
    
    
     
      import lxml
     
    
- 
    
     
    
    
     
      from bs4 
      import BeautifulSoup
     
    
- 
    
     
    
    
     
      import os
     
    
- 
    
     
    
    
     
      import string
     
    
- 
    
     
    
    
     
      import random
     
    
- 
    
     
    
    
     
      import time
     
    
- 
    
     
    
    
     
      import aiohttp
     
    
- 
    
     
    
    
     
      import asyncio
     
    
- 
    
     
    
    
     
      from tqdm 
      import tqdm
     
    
- 
    
     
    
    
     
      import os
     
    
- 
    
     
    
    
     
      import datetime
     
    
- 
    
     
    
    
      
     
    
- 
    
     
    
    
      
     
    
class IPSpyder(object):
    """Collect free HTTP proxies from public listing sites and keep the working ones.

    The ``get_*`` scraper methods append ``ip:port`` strings to
    ``ip_list_all``. ``test_ip`` appends every proxy that fetches ``url``
    with HTTP 200 to ``ip_ok_list_all``. ``run_test_ip_write_to_file`` probes
    all candidates and stores the survivors, comma separated, in
    ``ip_avaliable_file``.
    """

    # Seconds to wait for a proxy-listing page before giving up.
    REQUEST_TIMEOUT = 10
    # Seconds a candidate proxy gets to fetch the probe URL.
    PROXY_TEST_TIMEOUT = 15
    # Upper bound on simultaneous proxy probes.
    MAX_CONCURRENT_TESTS = 50
    # Pause between listing pages; requesting faster returns empty pages.
    PAGE_DELAY = 3

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/71.0.3578.98 Safari/537.36'}
        # Exclusive upper bounds of the page/area indexes walked per site.
        self.sixsix_url_range = 35
        self.kaixin_url_range = 2
        self.kuai_url_range = 2
        # Every scraped candidate, and the subset that passed test_ip().
        self.ip_list_all = []
        self.ip_ok_list_all = []
        # Page each candidate proxy must be able to fetch.
        self.url = 'https://blog.csdn.net/yezonggang/article/details/112991188'
        # Where the comma-separated list of working proxies is stored.
        self.ip_avaliable_file = 'F:/2020-11-04/csdn_sypder_20210124/ip_avaliable.txt'

    def get_html(self, url, flag):
        """Download ``url`` and return its text.

        Args:
            url: page to fetch.
            flag: truthy to decode the body as UTF-8, falsy for GB2312.

        Returns:
            The decoded body, or the literal string '请求异常' when the request
            fails. The scrapers in this class hand that marker to
            BeautifulSoup, where it simply yields no rows.
        """
        try:
            # The timeout keeps one dead listing site from hanging the run.
            response = requests.get(url, headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException:
            return '请求异常'
        response.encoding = 'utf-8' if flag else 'gb2312'
        return response.text

    @staticmethod
    def _join_ip_port(ip, port):
        """Return 'ip:port', or None when either part is missing or blank."""
        ip = (ip or '').strip()
        port = (port or '').strip()
        if not ip or not port:
            return None
        return ip + ':' + port

    @staticmethod
    def _visible_text(child):
        """Return the text a goubanjia IP-cell child contributes to the address.

        Children that carry a style other than ``display:inline-block`` are
        skipped, exactly as the original branch chain did.
        """
        if isinstance(child, str):
            # Bare text node (e.g. the ':' separator); it has no attributes.
            return str(child).strip()
        attrs = child.attrs
        if not attrs or next(iter(attrs)) == 'class':
            return child.get_text()
        style = attrs.get('style', '').replace(' ', '')
        if style == 'display:inline-block;':
            return child.get_text()
        return ''

    def _table_rows(self, url, skip_rows):
        """Fetch ``url`` and return the cell texts of each table row.

        The first ``skip_rows`` ``<tr>`` elements are dropped. Each remaining
        row becomes a list holding the stripped text of its ``<td>`` cells.
        """
        soup = BeautifulSoup(self.get_html(url, False), 'lxml')
        return [[td.get_text(strip=True) for td in tr_.find_all(name='td')]
                for tr_ in soup.find_all(name='tr')[skip_rows:]]

    def _add_proxy(self, cells):
        """Append cells[0]:cells[1] to ``ip_list_all``; return True if added."""
        if len(cells) < 2:
            return False
        ip_port = self._join_ip_port(cells[0], cells[1])
        if ip_port is None:
            return False
        self.ip_list_all.append(ip_port)
        return True

    def get_66ip(self):
        """Scrape the per-area proxy tables of www.66ip.cn."""
        for index in range(1, self.sixsix_url_range):
            count = 0
            province = ''
            url = 'http://www.66ip.cn/areaindex_{}/1.html'.format(index)
            for cells in self._table_rows(url, 2):
                if self._add_proxy(cells):
                    count += 1
                    if len(cells) > 2:
                        # The third column is reported in the progress line.
                        province = cells[2]
            print('Saved {0} {1} ip.'.format(province, count))
            time.sleep(self.PAGE_DELAY)
        print('66 daili Finished!!!')

    def get_kaixinip(self):
        """Scrape the proxy tables of www.kxdaili.com."""
        for index in range(1, self.kaixin_url_range):
            url = 'http://www.kxdaili.com/dailiip/1/{}.html'.format(index)
            count = sum(1 for cells in self._table_rows(url, 2)
                        if self._add_proxy(cells))
            print('Saved {0} page {1} ip.'.format(index, count))
            time.sleep(self.PAGE_DELAY)
        print('kaixindaili Finished!!!')

    def get_goubanjiaip(self):
        """Scrape www.goubanjia.com, whose IP cells are split into child nodes."""
        soup = BeautifulSoup(self.get_html('http://www.goubanjia.com/', False),
                             'lxml')
        for td_ in soup.find_all(class_='ip'):
            ip_ = ''.join(self._visible_text(child) for child in td_.children)
            if ip_:
                self.ip_list_all.append(ip_)
        print('quanwang daili Finished!!!')

    def get_kuaidaili(self):
        """Scrape the free proxy tables of www.kuaidaili.com."""
        for index in range(1, self.kuai_url_range):
            url = 'https://www.kuaidaili.com/free/inha/{}/'.format(index)
            count = sum(1 for cells in self._table_rows(url, 1)
                        if self._add_proxy(cells))
            print('Saved {0} page {1} ip.'.format(index, count))
            time.sleep(self.PAGE_DELAY)
        print('kuaidaili Finished!!!')

    async def test_ip(self, ip_, url):
        """Fetch ``url`` through proxy ``ip_``; record it on an HTTP 200 answer.

        Args:
            ip_: candidate proxy as 'ip:port'.
            url: page to request through the proxy.
        """
        # ssl=False replaces the deprecated verify_ssl=False.
        conn = aiohttp.TCPConnector(ssl=False)
        timeout = aiohttp.ClientTimeout(total=self.PROXY_TEST_TIMEOUT)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                proxy_ip = 'http://' + ip_
                print('正在测试: ' + proxy_ip)
                async with session.get(url=url, headers=self.headers,
                                       proxy=proxy_ip,
                                       timeout=timeout) as response:
                    if response.status == 200:
                        print('代理可用: ' + ip_)
                        self.ip_ok_list_all.append(ip_)
                    else:
                        print('请求响应码不合法 ' + ip_)
            except Exception:
                # Any failure means "unusable proxy". Unlike a bare except,
                # this does not swallow KeyboardInterrupt or SystemExit.
                print('代理请求失败', ip_)

    async def _probe_all(self):
        """Run test_ip() on every candidate, MAX_CONCURRENT_TESTS at a time."""
        gate = asyncio.Semaphore(self.MAX_CONCURRENT_TESTS)

        async def probe(ip_):
            async with gate:
                await self.test_ip(ip_, self.url)

        candidates = list(self.ip_list_all)
        if candidates:
            await asyncio.gather(*[probe(ip_) for ip_ in candidates])

    def run_test_ip_write_to_file(self):
        """Probe all scraped proxies and save the working ones.

        The working proxies are written to ``ip_avaliable_file`` as one
        comma-separated line. The parent directory is created when missing.
        """
        loop = asyncio.new_event_loop()
        try:
            loop.run_until_complete(self._probe_all())
        finally:
            loop.close()
        print('得到了一系列的IP代理,总共有 ' + str(len(self.ip_list_all)) + ' 个;')
        print('经过测试总共有 ' + str(len(self.ip_ok_list_all)) + ' 个IP代理可用;')
        folder = os.path.dirname(self.ip_avaliable_file)
        if folder:
            os.makedirs(folder, exist_ok=True)
        with open(self.ip_avaliable_file, 'w') as file_ip:
            file_ip.write(','.join(self.ip_ok_list_all))
- 
    
     
    
    
      
     
    
- 
    
     
    
    
      
     
    
class CSDN(object):
    """Visit randomly chosen posts of one CSDN blog through an HTTP proxy.

    ``getBlogList`` collects the article links from the paginated list pages.
    ``getBlogTitleAndCount`` fetches ``csdn_url`` and prints its title and
    read counter. ``beginTO`` chains both for one randomly picked article.
    """

    # Seconds to wait for any single HTTP request.
    REQUEST_TIMEOUT = 15

    def __init__(self):
        # Blog list URL; the page number is appended (list/1, list/2, ...).
        self.my_csdn = 'https://blog.csdn.net/yezonggang/article/list/'
        # Number of list pages the blog currently has.
        self.my_list = 5
        # Article visited by getBlogTitleAndCount(), e.g.
        # 'https://blog.csdn.net/yezonggang/article/details/106344148'.
        self.csdn_url = ''
        self.proxies = [{'http': 'socks5://183.195.106.118:8118'}]
        self.blogList = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/71.0.3578.98 Safari/537.36'
        }

    @staticmethod
    def _is_article_link(href):
        """Return True for article links, excluding their comment anchors."""
        return 'details' in href and 'comments' not in href

    def getBlogList(self):
        """Walk list pages 1..my_list and append article links to blogList.

        A list page that cannot be fetched is reported and skipped.
        """
        print('-------------------------------begin----------------------------')
        for page in range(1, self.my_list + 1):
            try:
                response = requests.get(self.my_csdn + str(page),
                                        headers=self.headers,
                                        timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
            except requests.RequestException as err:
                print('Failed to fetch list page {0}: {1}'.format(page, err))
                continue
            conent = response.content.decode('utf-8', errors='replace')
            soup = BeautifulSoup(conent, 'lxml')
            for a_tag in soup.find_all('a'):
                href = a_tag.get('href')
                if href and self._is_article_link(href):
                    self.blogList.append(href)
            print('Success, already append ' +
                  str(len(self.blogList)) + ' to the blogList!')

    def getBlogTitleAndCount(self, proxy):
        """Fetch ``csdn_url`` via ``proxy`` and print its title and read count.

        Args:
            proxy: proxy address as 'ip:port'.

        Raises:
            requests.RequestException: when the request through the proxy fails.
        """
        # Plain HTTP proxies tunnel HTTPS traffic too, so both entries use the
        # http:// scheme. An https:// proxy URL would make requests speak TLS
        # to the proxy itself.
        proxy_url = 'http://' + proxy
        proxy_support = {'http': proxy_url, 'https': proxy_url}
        response = requests.get(self.csdn_url, headers=self.headers,
                                proxies=proxy_support,
                                timeout=self.REQUEST_TIMEOUT)
        conent = response.content.decode('utf-8', errors='replace')
        soup = BeautifulSoup(conent, 'lxml')

        # Title of the current post; 'N/A' when the page has no <title>.
        blog_title = 'N/A'
        if soup.title is not None and soup.title.string:
            blog_title = soup.title.string
        # Read counter of the current post; 'N/A' when the span is missing.
        count_tag = soup.find('span', class_='read-count')
        blog_count_now = 'N/A'
        if count_tag is not None:
            blog_count_now = count_tag.get_text(strip=True) or 'N/A'
        print('当前读取的博客地址是:【{0}】\n'
              '当前读取的博客标题是:【{1}】\n'
              '当前使用的代理IP是:【{2}】\n'
              '当前博客的阅读统计是:【_{3}_次】'.format(
                  self.csdn_url, blog_title, proxy, blog_count_now))

    def beginTO(self, proxy):
        """Refresh the article list, then visit one random article via ``proxy``.

        ``blogList`` is emptied afterwards, even when the visit fails, so the
        next call starts from a fresh list.
        """
        self.getBlogList()
        try:
            if not self.blogList:
                print('No article links found; skipping this round.')
                return
            self.csdn_url = random.choice(self.blogList)
            self.getBlogTitleAndCount(proxy)
        finally:
            self.blogList = []
- 
    
     
    
    
      
     
    
- 
    
     
    
    
      
     
    
# Entry point: rebuild the local proxy file when it is missing, empty or older
# than a week, then keep visiting random posts through the saved proxies.
ip_avaliable = "F:/2020-11-04/csdn_sypder_20210124/ip_avaliable.txt"


def ip_file_needs_refresh(path, max_age_days=7):
    """Return True when the proxy file is missing, empty or too old.

    Age is measured from the last modification time. ``st_ctime`` is the
    creation time on Windows and does not change when the file is rewritten,
    so it would mark a freshly refreshed file as stale forever.
    """
    try:
        info = os.stat(path)
    except OSError:
        # A missing or unreadable file must trigger a refresh, not a crash.
        return True
    if info.st_size == 0:
        return True
    age_days = (time.time() - info.st_mtime) / (3600 * 24)
    return age_days > max_age_days


def load_proxy_list(path):
    """Read the comma-separated proxy file and return its non-empty entries."""
    with open(path, 'r') as file_ip:
        entries = file_ip.read().split(',')
    return [entry.strip() for entry in entries if entry.strip()]


def main():
    """Refresh the proxy file if needed, then visit posts in an endless loop."""
    if ip_file_needs_refresh(ip_avaliable):
        # Collect proxies first, then visit the blog.
        ipSpyder = IPSpyder()
        ipSpyder.ip_avaliable_file = ip_avaliable
        ipSpyder.get_66ip()
        ipSpyder.run_test_ip_write_to_file()

    try:
        ip_avaliable_list = load_proxy_list(ip_avaliable)
    except OSError as err:
        print('Cannot read proxy file {0}: {1}'.format(ip_avaliable, err))
        return
    if not ip_avaliable_list:
        print('No usable proxy in {0}; nothing to do.'.format(ip_avaliable))
        return

    csdn = CSDN()
    while True:
        print('csdn 点赞关注私聊发')
        # Pick a proxy per round so one dead proxy cannot stall every visit.
        proxy_now = random.choice(ip_avaliable_list)
        try:
            csdn.beginTO(proxy_now)
        except requests.RequestException as err:
            print('Visit through {0} failed: {1}'.format(proxy_now, err))
        time.sleep(10)


if __name__ == '__main__':
    main()
- 
    
     
    
    
         
      #csdn 点赞关注私聊发^-^
     
    
- 
    
     
    
    
      
     
    
- 
    
     
    
    
      
     
    
- 
    
     
    
    
     
      #ipSpyder =IPSpyder()
     
    
- 
    
     
    
    
     
      # ipSpyder.get_66ip()
     
    
- 
    
     
    
    
     
      # ipSpyder.get_kaixinip()
     
    
- 
    
     
    
    
     
      # ipSpyder.get_goubanjiaip()
     
    
- 
    
     
    
    
     
      # ipSpyder.get_kuaidaili()
     
    
- 
    
     
    
    
     
      # ipSpyder.run_test_ip()
     
    
- 
    
     
    
    
     
      #
     
    
- 
    
     
    
    
     
      # time.localtime(statinfo)
     
    
- 
    
     
    
    
     
      #print ('得到了一系列的IP代理,总共有 '+str(len(ipSpyder.ip_list_all))+' 个;')
     
    
- 
    
     
    
    
     
      #print ('经过测试总共有 '+str(len(ipSpyder.ip_ok_list_all))+' 个IP代理可用;')
     
    
- 
    
     
    
    
     
      #file = open("ip_avaliable.txt", 'w')
     
    
- 
    
     
    
    
     
      # file.write(ip_ok_list_all)
     
    
- 
    
     
    
    
     
      # file.close()
     
    

输出的范例如下:
  
   - 
    
     
    
    
     
      Success, already append 
      48 to the blogList!
     
    
- 
    
     
    
    
     
      Success, already append 
      96 to the blogList!
     
    
- 
    
     
    
    
     
      Success, already append 
      144 to the blogList!
     
    
- 
    
     
    
    
     
      Success, already append 
      192 to the blogList!
     
    
- 
    
     
    
    
     
      Success, already append 
      211 to the blogList!
     
    
- 
    
     
    
    
     
      当前读取的博客地址是:【https://blog.csdn.net/yezonggang/article/details/105723456】
     
    
- 
    
     
    
    
     
      当前读取的博客地址是:【数据挖掘算法和实践(一):线性回归和逻辑回归(house_price数据集)_叶子叶来-CSDN博客】
     
    
- 
    
     
    
    
     
      当前使用的代理IP是:【211.144.213.145:80】
     
    
- 
    
     
    
    
     
      当前博客的阅读统计是:【_351_次】
     
    
转载:https://blog.csdn.net/yezonggang/article/details/112991188
查看评论
					