爬虫工具 - selenium_飞道的博客

爬虫工具 - selenium

2022-11-19 16:17 507人阅读评论(0)

一、selenium的基本使用

1. 安装（换源安装）：

在CMD窗口中输入：pip install selenium -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com

若提示pip命令不可用或安装失败，先使用命令 python -m pip install --upgrade pip 升级pip后再重试

2. 案例 -- 打开百度输入“爬虫”搜索，并返回网页的一些信息


  
   
    
     
    
    
     
      from selenium 
      import webdriver   
      # 驱动浏览器
     
    
   
    
     
    
    
     
      from selenium.webdriver.common.
      by 
      import By  
      # 选择器
     
    
   
    
     
    
    
     
      from selenium.webdriver.common.keys 
      import Keys   
      # 按钮
     
    
   
    
     
    
    
     
      from selenium.webdriver.support.wait 
      import WebDriverWait   
      # 等待页面加载完毕
     
    
   
    
     
    
    
     
      from selenium.webdriver.support 
      import expected_conditions 
      as EC
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 1.驱动浏览器
     
    
   
    
     
    
    
     
      brower = webdriver.Chrome()
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 2.请求页面， --返回的数据封装在了browser对象里，不需要额外的变量接收
     
    
   
    
     
    
    
     
      brower.get(
      "http://www.baidu.com")   
      # 调用浏览器驱动访问站点
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 3.拿到输入框
     
    
   
    
     
    
    
     
      # text_input = brower.find_element_by_id('kw') # 方法1，不建议使用！！！
     
    
   
    
     
    
    
     
      text_input = brower.find_element(By.ID, 
      'kw')
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 4.向输入框中写入内容
     
    
   
    
     
    
    
     
      text_input.send_keys(
      "爬虫")
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 5.按下回车按钮
     
    
   
    
     
    
    
     
      text_input.send_keys(Keys.ENTER)
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 等待事件，防止网速过慢
     
    
   
    
     
    
    
     
      wait = WebDriverWait(brower, 
      100)     
      # 参数1：浏览器对象，参数2：时间
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 等待某元素出现 presence_of_element_located()传入的参数格式是一个元组
     
    
   
    
     
    
    
     
      wait.
      until(EC.presence_of_element_located((By.ID, 
      'result_tts_player')))
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      print(brower.current_url)    
      # 查看网页url
     
    
   
    
     
    
    
     
      print(brower.get_cookies())     
      # 查看cookie信息
     
    
   
    
     
    
    
     
      print(brower.page_source)      
      # 查看网页原码
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      brower.close()   
      # 关闭浏览器

3. 声明不同的浏览器对象


  
   
    
     
    
    
     
      from selenium import webdriver
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      browser 
      = webdriver.Chrome
      (
      )      
      # 谷歌浏览器，一般都使用Chrome
     
    
   
    
     
    
    
     
      browser 
      = webdriver.Firefox
      (
      )
     
    
   
    
     
    
    
     
      browser 
      = webdriver.PhantomJS
      (
      )
     
    
   
    
     
    
    
     
      browser 
      = webdriver.Safari
      (
      )

4. 访问页面


  
   
    
     
    
    
     
      from selenium 
      import webdriver
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 声明Chrome浏览器对象
     
    
   
    
     
    
    
     
      browser = webdriver.Chrome()
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 请求页面
     
    
   
    
     
    
    
     
      browser.get(
      "https://www.taobao.com")
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 获取网页源代码
     
    
   
    
     
    
    
     
      print(browser.page_source)
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 关闭浏览器
     
    
   
    
     
    
    
     
      browser.close()

5. 查找元素

(1)查找单个元素的方法：

find_element(By.ID,"id") 根据id属性来定位

find_element(By.NAME,"name") 根据name属性来定位

find_element(By.XPATH,"xpath语法") 根据xpath语法来定位

find_element(By.TAG_NAME,"input") 根据标签名来定位

find_element(By.CLASS_NAME,"classname") 根据class的名字来定位

find_element(By.CSS_SELECTOR,"#id") 根据css选择器来定位

find_element(By.LINK_TEXT,"text") 根据链接（a标签）的完整文本来定位

案例1：


  
   
    
     
    
    
     
      from selenium import webdriver
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 单个元素
     
    
   
    
     
    
    
     
      browser = webdriver.Chrome()
     
    
   
    
     
    
    
     
      browser.
      get(
      "http://www.baidu.com")
     
    
   
    
     
    
    
     
      browser.find_element(By.LINK_TEXT, 
      '新闻').click()      
      # 通过.click()点击目标链接
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      browser.close()

案例2：


  
   
    
     
    
    
     
      browser = webdriver.Chrome()
     
    
   
    
     
    
    
     
      browser.get(
      "http://www.taobao.com")
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 1.通过元素ID查找
     
    
   
    
     
    
    
     
      by_id = browser.find_element(By.ID,
      'q')
     
    
   
    
     
    
    
     
      by_id.send_keys(
      '美食')
     
    
   
    
     
    
    
     
      print(by_id)
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 2.通过css选择器查找
     
    
   
    
     
    
    
     
      css_select = browser.find_element(By.CSS_SELECTOR,
      '#q')
     
    
   
    
     
    
    
     
      css_select.send_keys(
      '美食')
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 3.通过xpath查找
     
    
   
    
     
    
    
     
      xpath = browser.find_element(By.XPATH,
      '//*[@id="q"]')
     
    
   
    
     
    
    
     
      xpath.send_keys(
      '美食')
     
    
   
    
     
    
    
     
      browser.close()

(2)查找多个元素：

find_elements(By.ID,"id") 根据id属性来定位

find_elements(By.NAME,"name") 根据name属性来定位

find_elements(By.XPATH,"xpath语法") 根据xpath语法来定位

find_elements(By.TAG_NAME,"input") 根据标签名来定位

find_elements(By.CLASS_NAME,"classname") 根据class的名字来定位

find_elements(By.CSS_SELECTOR,"#id") 根据css选择器来定位

find_elements(By.LINK_TEXT,"text") 根据链接（a标签）的完整文本来定位

案例：


  
   
    
     
    
    
     
      from selenium 
      import webdriver   
      # 驱动浏览器
     
    
   
    
     
    
    
     
      from selenium.webdriver.common.
      by 
      import By  
      # 选择器
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      browser = webdriver.Chrome()
     
    
   
    
     
    
    
     
      browser.get(
      "http://www.taobao.com")
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 通过CSS选择器定位
     
    
   
    
     
    
    
     
      elements = browser.find_elements(By.CSS_SELECTOR, 
      '.service-bd li')
     
    
   
    
     
    
    
     
      # print(elements) # 以列表形式返回
     
    
   
    
     
    
    
     
      for e 
      in elements:
     
    
   
    
     
    
    
         
      print(e)

6. 元素的交互操作

案例：对获取的元素调用交互方法


  
   
    
     
    
    
     
      import time
     
    
   
    
     
    
    
     
      from selenium 
      import webdriver   
      # 驱动浏览器
     
    
   
    
     
    
    
     
      from selenium.webdriver.common.
      by 
      import By  
      # 选择器
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      browser = webdriver.Chrome()
     
    
   
    
     
    
    
     
      browser.get(
      "http://www.jd.com")
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      text_input = browser.find_element(By.ID, 
      'key')
     
    
   
    
     
    
    
     
      text_input.send_keys(
      "iphone")
     
    
   
    
     
    
    
     
      time.sleep(
      2)
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 清空原来的文本内容
     
    
   
    
     
    
    
     
      text_input.clear()
     
    
   
    
     
    
    
     
      text_input.send_keys(
      'iPad')
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 找到按钮并单击
     
    
   
    
     
    
    
     
      button = browser.find_element(By.CLASS_NAME, 
      'button')
     
    
   
    
     
    
    
     
      button.click()
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      browser.close()

7. 交互动作ActionChains，将动作附加到动作链中串行执行


  
   
    
     
    
    
     
      from selenium 
      import webdriver   
      # 驱动浏览器
     
    
   
    
     
    
    
     
      from selenium.webdriver.common.by 
      import By  
      # 选择器
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      browser = webdriver.Chrome()
     
    
   
    
     
    
    
     
      url = 
      "http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable"
     
    
   
    
     
    
    
     
      browser.get(url)
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      """
     
    
   
    
     
    
    
     
       webDriver只能在一个页面上对元素识别和定位
     
    
   
    
     
    
    
     
       对于frame/iframe表单内嵌页面上的元素无法直接定位，
     
    
   
    
     
    
    
     
       此时就需要通过switch_to.frame()方法将当前定位的主题切换为iframe表单的内嵌页面中，
     
    
   
    
     
    
    
     
       switch_to.frame()默认可以直接提取表单的id和name属性
     
    
   
    
     
    
    
     
      """
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      #.switch_to.frame()
     
    
   
    
     
    
    
     
      from selenium.webdriver 
      import ActionChains
     
    
   
    
     
    
    
     
      browser.switch_to.frame(
      'iframeResult')   
      # 将当前定位的主题切换为iframe表单的内嵌页面中
     
    
   
    
     
    
    
     
      A = browser.find_element(By.CSS_SELECTOR, 
      '#draggable')
     
    
   
    
     
    
    
     
      B = browser.find_element(By.CSS_SELECTOR, 
      '#droppable')
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 产生一个动作执行器
     
    
   
    
     
    
    
     
      actions = ActionChains(browser)
     
    
   
    
     
    
    
     
      actions.drag_and_drop(A, B)     
      # A移动到B
     
    
   
    
     
    
    
     
      actions.perform()   
      # 执行动作链
     
    
   
    
     
    
    
     
      browser.close()

8. 执行JavaScript

selenium并不是万能的，有时候页面上的某些操作无法通过它直接实现，这时候就需要借助JS来完成了

滚动页面方法execute_script() 该方法可调用原生JavaScript的api

滚动到底部：window.scrollTo(0,document.body.scrollHeight)

滚动到顶部：window.scrollTo(0,0)

说明：

window：js的window对象

scrollTo：window的方法，可以滚到页面的任何位置

scrollHeight：是dom元素的通用属性，document.body.scrollHeight会返回body元素的高度，基本上就是页面的高度

scrollLeft：获取位于对象左边界和窗口目前可见内容的最左端之间的距离

scrollTop：获取位于对象最顶端和窗口中可见内容的最顶端之间的距离

scrollWidth：获取对象滚动的宽度

案例：


  
   
    
     
    
    
     
      import time
     
    
   
    
     
    
    
     
      from selenium 
      import webdriver   
      # 驱动浏览器
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      driver = webdriver.Chrome()
     
    
   
    
     
    
    
     
      driver.get(
      'http://news.baidu.com')
     
    
   
    
     
    
    
     
      time.sleep(
      2)
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 滚动到浏览器底部
     
    
   
    
     
    
    
     
      js = 
      "window.scrollTo(0,document.body.scrollHeight)"
     
    
   
    
     
    
    
     
      driver.execute_script(js)   
      # 执行js代码
     
    
   
    
     
    
    
     
      time.sleep(
      2)
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 回到浏览器顶部
     
    
   
    
     
    
    
     
      js2 = 
      "window.scrollTo(0,0)"
     
    
   
    
     
    
    
     
      driver.execute_script(js2)

9. 获取元素信息

获取文本及属性：.text 获取元素的文本值，get_attribute() 根据属性名获取属性值


  
   
    
     
    
    
     
      from selenium 
      import webdriver   
      # 驱动浏览器
     
    
   
    
     
    
    
     
      from selenium.webdriver.common.
      by 
      import By  
      # 选择器
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      browser = webdriver.Chrome()
     
    
   
    
     
    
    
     
      url = 
      "https://www.zhihu.com/explore"
     
    
   
    
     
    
    
     
      browser.get(url)
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 知乎，圆桌讨论
     
    
   
    
     
    
    
     
      l = browser.find_element(By.CSS_SELECTOR, 
      '.ExploreRoundtableCard.ExploreHomePage-roundtableCard .ExploreRoundtableCard-header .ExploreRoundtableCard-title')
     
    
   
    
     
    
    
     
      print(l)
     
    
   
    
     
    
    
     
      print(
      "--------------------------------------")
     
    
   
    
     
    
    
     
      # 若改用find_elements，返回的l是列表类型，可以遍历返回
     
    
   
    
     
    
    
     
      # for i in l:
     
    
   
    
     
    
    
     
      # print(i)
     
    
   
    
     
    
    
     
      # print(i.text)
     
    
   
    
     
    
    
     
      # print(i.get_attribute('href'))
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 找单个元素
     
    
   
    
     
    
    
     
      logo = browser.find_element(By.XPATH,
      '//*[@id="special"]/div[2]/div/div[3]/div[1]/div[1]/a')
     
    
   
    
     
    
    
     
      print(logo)
     
    
   
    
     
    
    
     
      print(logo.text)
     
    
   
    
     
    
    
     
      print(logo.get_attribute(
      'href'))

10. 等待

等待特定元素出现后做某事，通常用于等待某个网页元素加载完毕后进行后续操作，避免出现异常。

EC模块的使用方法:

导包：from selenium.webdriver.support import expected_conditions as EC

title_is 标题是某内容

title_contains 标题包含某内容

presence_of_element_located 元素加载出，传入定位元组，如(By.ID, 'p')

visibility_of_element_located 元素可见，传入定位元组

visibility_of 可见，传入元素对象

presence_of_all_elements_located 所有元素加载出

text_to_be_present_in_element 某个元素文本包含某文字

text_to_be_present_in_element_value 某个元素值包含某文字

frame_to_be_available_and_switch_to_it frame加载并切换

invisibility_of_element_located 元素不可见

element_to_be_clickable 元素可点击

staleness_of 判断一个元素是否仍在DOM，可判断页面是否已经刷新

element_to_be_selected 元素已被选中，传入元素对象

element_located_to_be_selected 元素已被选中，传入定位元组

element_selection_state_to_be 传入元素对象以及状态，相等返回True，否则返回False

element_located_selection_state_to_be 传入定位元组以及状态，相等返回True，否则返回False

alert_is_present 是否出现Alert

案例：


  
   
    
     
    
    
     
      from selenium 
      import webdriver   
      # 驱动浏览器
     
    
   
    
     
    
    
     
      from selenium.webdriver.common.
      by 
      import By  
      # 选择器
     
    
   
    
     
    
    
     
      from selenium.webdriver.support.wait 
      import WebDriverWait   
      # 等待页面加载完毕
     
    
   
    
     
    
    
     
      from selenium.webdriver.support 
      import expected_conditions 
      as EC
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      browser = webdriver.Chrome()
     
    
   
    
     
    
    
     
      browser.get(
      "http://www.taobao.com")
     
    
   
    
     
    
    
     
      wait = WebDriverWait(browser, 
      100)
     
    
   
    
     
    
    
     
      # 等待特定元素加载完
     
    
   
    
     
    
    
     
      input = wait.
      until(EC.presence_of_element_located((By.ID, 
      'J_Toolkit')))
     
    
   
    
     
    
    
     
      print(input)

11. 前进后退


  
   
    
     
    
    
     
      import time
     
    
   
    
     
    
    
     
      from selenium import webdriver   
      # 驱动浏览器
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      browser = webdriver.Chrome()
     
    
   
    
     
    
    
     
      browser.
      get(
      "http://www.baidu.com")
     
    
   
    
     
    
    
     
      browser.
      get(
      "http://www.taobao.com")
     
    
   
    
     
    
    
     
      browser.
      get(
      "http://www.jd.com")
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      browser.back()          
      # 后退 淘宝
     
    
   
    
     
    
    
     
      time.sleep(
      3)
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      browser.forward()       
      # 前进 京东
     
    
   
    
     
    
    
     
      time.sleep(
      3)

12. 选项卡管理

窗口切换 switch_to.window(窗口ID)（旧写法 switch_to_window(窗口ID) 已被弃用，并在 Selenium 4 中移除；能否使用取决于 Selenium 版本，而不是 Python 版本，建议统一使用 switch_to.window）

查看所有窗口ID window_handles

FAQ：只有先切换到目标窗口，才能操作该窗口（比如翻页、获取源代码等等）

案例：


  
   
    
     
    
    
     
      import time
     
    
   
    
     
    
    
     
      from selenium 
      import webdriver   # 驱动浏览器
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      browser = webdriver.Chrome()
     
    
   
    
     
    
    
     
      browser.
      get(
      "http://www.baidu.com")
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 调用原生JavaScript的api接口
     
    
   
    
     
    
    
     
      browser.execute_script(
      'window.open()')   # 选项卡
      1 窗口
      1
     
    
   
    
     
    
    
     
      time.sleep(
      1)
     
    
   
    
     
    
    
     
      browser.execute_script(
      'window.open()')   # 选项卡
      2 窗口
      2
     
    
   
    
     
    
    
     
      print(browser.window_handles)  # 查看当前浏览器所有窗口ID
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # 给新选项卡窗口访问目标站点 .
      switch.to.
      window
     
    
   
    
     
    
    
     
      browser.switch_to.
      window(browser.window_handles[
      0])
     
    
   
    
     
    
    
     
      browser.
      get(
      'https://www.mi.com/')
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      browser.switch_to.
      window(browser.window_handles[
      1]) #加载窗口
      2 切换到窗口
      2
     
    
   
    
     
    
    
     
      browser.
      get(
      'https://www.taobao.com') # 窗口
      2  打开淘宝
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      browser.switch_to.
      window(browser.window_handles[
      2])
     
    
   
    
     
    
    
     
      browser.
      get(
      'https://jd.com')
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      browser.switch_to.
      window(browser.window_handles[
      1])
     
    
   
    
     
    
    
     
      browser.page_source

13. 异常处理

异常处理模块所在位置：from selenium.common.exceptions import TimeoutException, NoSuchElementException

案例：


  
   
    
     
    
    
     
      from selenium 
      import webdriver   
      # 驱动浏览器
     
    
   
    
     
    
    
     
      from selenium.webdriver.common.by 
      import By  
      # 选择器
     
    
   
    
     
    
    
     
      from selenium.common.exceptions 
      import TimeoutException, NoSuchElementException
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      browser = webdriver.Chrome()
     
    
   
    
     
    
    
     
      try:
     
    
   
    
     
    
    
     
          browser.get(
      'https://www.baidu.com')
     
    
   
    
     
    
    
     
      except TimeoutException:
     
    
   
    
     
    
    
         
      print(
      'Time out')
     
    
   
    
     
    
    
     
      try:
     
    
   
    
     
    
    
     
          browser.find_element(By.ID,
      'hello')
     
    
   
    
     
    
    
     
      except NoSuchElementException:
     
    
   
    
     
    
    
         
      print(
      'No Element')
     
    
   
    
     
    
    
     
      finally:   
      #无论try语句中是否抛出异常，finally中的语句一定会被执行
     
    
   
    
     
    
    
     
          browser.close()

补充：

设为开发者模式（注意：这并不是无头模式，无头模式需另外添加 --headless 参数），去除浏览器的自动化控制标记，降低被网站检测而导致爬虫失败的概率，只需添加参数：
        options = webdriver.ChromeOptions() # 配置对象
        options.add_experimental_option('excludeSwitches', ['enable-automation']) # 写入参数
        browser = webdriver.Chrome(options=options)
        browser.get('https://www.baidu.com')

转载：https://blog.csdn.net/qq_48051316/article/details/127929257

查看评论

飞道的博客

飞道的博客

个人资料

文章分类

文章存档

阅读排行

评论排行

推荐文章

爬虫工具 - selenium

* 以上用户言论只代表其个人观点，不代表本网站的观点或立场