
Eleven Popular Hands-On Python Crawler Projects, with Source Code (You're Welcome)


Contents

Python crawler: scrape cast photos of The Fate of the Furious from Douban Movies

Python crawler: save Douyu danmu (live-chat) data to MongoDB

Python crawler: grab Ximalaya FM audio

Python crawler: scrape all Shixiseng job listings via packet-capture analysis

Python crawler: batch-download and save high-resolution images from Huaban

Python crawler: scrape v2ex data and save it as CSV

Python crawler: Wandoujia Design Award, a speed comparison of three crawling methods

Python crawler: parse HTML with lxml and print the matching values

Python crawler: scrape dynamic data from Yidian Zixun with Selenium

Python crawler: scrape Amazon data with Selenium + XPath + bs4 and save to MongoDB

Python crawler: fetch the Heilongjiang University captcha and log in


Python crawler: scrape cast photos of The Fate of the Furious from Douban Movies


  
import urllib.request
import os
import re


def douban(url):
    r = urllib.request.urlopen(url)
    html = r.read().decode('utf-8')
    # Portrait URLs and the names inside the title="..." attributes; the two lists
    # are assumed to line up one-to-one on this page.
    result = re.findall(r'https://img\d\.doubanio\.com/img/celebrity/medium/.*?\.jpg', html)
    result2 = re.findall(r'(?<=title=").\S+', html)
    result2.pop()
    result3 = sorted(set(result2), key=result2.index)  # de-duplicate while keeping page order
    result3.pop(-3)  # drop a stray match that is not an actor name
    if not os.path.exists('douban'):
        os.makedirs('douban')
    i = 0
    for link in result:
        filename = 'douban\\' + str(result3[i]) + '.jpg'
        i += 1
        urllib.request.urlretrieve(link, filename)  # urlretrieve writes the file itself


url = 'https://movie.douban.com/subject/26260853/celebrities'

if __name__ == '__main__':
    douban(url)
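
Douban has been known to reject requests that carry urllib's default User-Agent. Below is a minimal sketch of fetching the page with a browser-like header instead; fetch_html is a hypothetical helper and the UA string is an illustrative assumption, not part of the original script.

import urllib.request

def fetch_html(url):
    # Wrap the URL in a Request so a custom User-Agent header can be attached.
    req = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}  # assumed UA string
    )
    with urllib.request.urlopen(req) as r:
        return r.read().decode('utf-8')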

Python crawler: save Douyu danmu (live-chat) data to MongoDB


  
# Grab danmu from a Douyu room and save each user's uid, nickname, level and
# message text to MongoDB.
__author__ = '布咯咯_rieuse'
__time__ = '2017.6.2'
__github__ = 'https://github.com/rieuse'

import multiprocessing
import re
import socket
import time

import pymongo
import requests
from bs4 import BeautifulSoup

clients = pymongo.MongoClient('localhost')
db = clients["DouyuTV_danmu"]
col = db["info"]

client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
host = socket.gethostbyname("openbarrage.douyutv.com")
port = 8601
client.connect((host, port))

danmu_path = re.compile(b'txt@=(.+?)/cid@')
uid_path = re.compile(b'uid@=(.+?)/nn@')
nickname_path = re.compile(b'nn@=(.+?)/txt@')
level_path = re.compile(b'level@=([1-9][0-9]?)/sahf')


def sendmsg(msgstr):
    # Douyu packet: two 4-byte little-endian length fields, a 4-byte message type,
    # then the payload.
    msg = msgstr.encode('utf-8')
    data_length = len(msg) + 8
    code = 689
    msgHead = int.to_bytes(data_length, 4, 'little') \
              + int.to_bytes(data_length, 4, 'little') + int.to_bytes(code, 4, 'little')
    client.send(msgHead)
    sent = 0
    while sent < len(msg):
        tn = client.send(msg[sent:])
        sent = sent + tn


def start(roomid):
    msg = 'type@=loginreq/username@=rieuse/password@=douyu/roomid@={}/\0'.format(roomid)
    sendmsg(msg)
    msg_more = 'type@=joingroup/rid@={}/gid@=-9999/\0'.format(roomid)
    sendmsg(msg_more)
    print('---------------欢迎连接到{}的直播间---------------'.format(get_name(roomid)))
    while True:
        data = client.recv(1024)
        uid_more = uid_path.findall(data)
        nickname_more = nickname_path.findall(data)
        level_more = level_path.findall(data)
        danmu_more = danmu_path.findall(data)
        if not level_more:
            level_more = [b'0']  # keep it a list so level_more[0] stays bytes
        if not data:
            break
        else:
            for i in range(0, len(danmu_more)):
                try:
                    product = {
                        'uid': uid_more[0].decode(encoding='utf-8'),
                        'nickname': nickname_more[0].decode(encoding='utf-8'),
                        'level': level_more[0].decode(encoding='utf-8'),
                        'danmu': danmu_more[0].decode(encoding='utf-8')
                    }
                    print(product)
                    col.insert_one(product)  # insert() is deprecated in newer pymongo
                    print('成功导入mongodb')
                except Exception as e:
                    print(e)


def keeplive():
    # Douyu drops the connection without a heartbeat; send one every 15 seconds.
    while True:
        msg = 'type@=keeplive/tick@=' + str(int(time.time())) + '/\0'
        sendmsg(msg)
        time.sleep(15)


def get_name(roomid):
    r = requests.get("http://www.douyu.com/" + roomid)
    soup = BeautifulSoup(r.text, 'lxml')
    return soup.find('a', {'class': 'zb-name'}).string


if __name__ == '__main__':
    room_id = input('请输入房间ID: ')
    p1 = multiprocessing.Process(target=start, args=(room_id,))
    p2 = multiprocessing.Process(target=keeplive)
    p1.start()
    p2.start()
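
The records land in the DouyuTV_danmu.info collection. Below is a minimal sketch, reusing that collection, of summarizing who chatted most with a pymongo aggregation; it is an illustration added here, not part of the original script.

import pymongo

col = pymongo.MongoClient('localhost')['DouyuTV_danmu']['info']

# Count danmu per nickname and print the ten most active chatters.
pipeline = [
    {'$group': {'_id': '$nickname', 'count': {'$sum': 1}}},
    {'$sort': {'count': -1}},
    {'$limit': 10},
]
for row in col.aggregate(pipeline):
    print(row['_id'], row['count'])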

Python crawler: grab Ximalaya FM audio


  
__author__ = '布咯咯_rieuse'

import json
import random
import time

import pymongo
import requests
from bs4 import BeautifulSoup
from lxml import etree

clients = pymongo.MongoClient('localhost')
db = clients["XiMaLaYa"]
col1 = db["album2"]
col2 = db["detaile2"]

# Pool of desktop User-Agents; one is picked at random for each header set.
UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]

headers1 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'Proxy-Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(UA_LIST)
}

headers2 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'http://www.ximalaya.com/dq/all/2',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(UA_LIST)
}


def get_url():
    # Walk the 84 category listing pages and record every album found on them.
    start_urls = ['http://www.ximalaya.com/dq/all/{}'.format(num) for num in range(1, 85)]
    for start_url in start_urls:
        html = requests.get(start_url, headers=headers1).text
        soup = BeautifulSoup(html, 'lxml')
        for item in soup.find_all(class_="albumfaceOutter"):
            content = {
                'href': item.a['href'],
                'title': item.img['alt'],
                'img_url': item.img['src']
            }
            col1.insert_one(content)  # insert() is deprecated in newer pymongo
            print('写入一个频道' + item.a['href'])
            print(content)
            another(item.a['href'])
        time.sleep(1)


def another(url):
    # Check whether the album has extra pages and crawl each of them.
    html = requests.get(url, headers=headers2).text
    ifanother = etree.HTML(html).xpath('//div[@class="pagingBar_wrapper"]/a[last()-1]/@data-page')
    if len(ifanother):
        num = ifanother[0]
        print('本频道资源存在' + num + '个页面')
        for n in range(1, int(num)):
            print('开始解析{}个中的第{}个页面'.format(num, n))
            url2 = url + '?page={}'.format(n)
            get_m4a(url2)
    get_m4a(url)


def get_m4a(url):
    # Each track id maps to a JSON document that is stored in MongoDB as-is.
    time.sleep(1)
    html = requests.get(url, headers=headers2).text
    numlist = etree.HTML(html).xpath('//div[@class="personal_body"]/@sound_ids')[0].split(',')
    for i in numlist:
        murl = 'http://www.ximalaya.com/tracks/{}.json'.format(i)
        html = requests.get(murl, headers=headers1).text
        dic = json.loads(html)
        col2.insert_one(dic)
        print(murl + '中的数据已被成功插入mongodb')


if __name__ == '__main__':
    get_url()
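
With roughly 84 listing pages plus one JSON request per track, occasional timeouts are likely. Below is a small hedged retry helper that could wrap the requests.get calls above; get_with_retry is a hypothetical addition, not in the original.

import time
import requests

def get_with_retry(url, headers, retries=3, pause=2):
    # Return the response body, or None after `retries` failed attempts.
    for attempt in range(retries):
        try:
            r = requests.get(url, headers=headers, timeout=10)
            r.raise_for_status()
            return r.text
        except requests.RequestException as e:
            print('request failed ({}), retry {}/{}'.format(e, attempt + 1, retries))
            time.sleep(pause)
    return None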

Python crawler: scrape all Shixiseng job listings via packet-capture analysis


  
import json
import time

import pymongo
import requests

clients = pymongo.MongoClient('localhost')
db = clients["Shixiseng"]
col = db["detail_info"]

# The app endpoint returns JSON; c=%E5%85%A8%E5%9B%BD is the URL-encoded "全国" (nationwide).
urls = ['http://www.shixiseng.com/app/internsvt?c=%E5%85%A8%E5%9B%BD&p={}&t=hot'.format(n) for n in range(1, 3487)]

for url in urls:
    print(url)
    r = requests.get(url)
    html = r.content.decode('utf-8')
    content = json.loads(html)['msg']['b']
    for i in content:
        print('插入一条数据:')
        print(i)
        col.insert_one(i)  # insert() is deprecated in newer pymongo
        time.sleep(0.01)
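
Over 3,486 pages some responses will come back malformed or throttled. Below is a minimal hardening sketch that skips a bad page instead of crashing; fetch_page is a hypothetical helper, not part of the original script.

import json
import requests

def fetch_page(url):
    # Return the list of job entries on one page, or [] if the request or parse fails.
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        return json.loads(r.content.decode('utf-8'))['msg']['b']
    except (requests.RequestException, json.JSONDecodeError, KeyError) as e:
        print('skipping {}: {}'.format(url, e))
        return []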

Python crawler: batch-download and save high-resolution images from Huaban


  
__author__ = '布咯咯_rieuse'

import os

import lxml.html
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# browser = webdriver.Firefox()
wait = WebDriverWait(browser, 5)
browser.set_window_size(1400, 900)


def parser(url, param):
    # Load the page, wait until the element selected by `param` is present,
    # then hand the rendered HTML to lxml.
    browser.get(url)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, param)))
    html = browser.page_source
    doc = lxml.html.fromstring(html)
    return doc


def get_main_url():
    print('打开主页搜寻链接中...')
    try:
        doc = parser('http://huaban.com/boards/favorite/beauty/', '#waterfall')
        name = doc.xpath('//*[@id="waterfall"]/div/a[1]/div[2]/h3/text()')
        u = doc.xpath('//*[@id="waterfall"]/div/a[1]/@href')
        for item, fileName in zip(u, name):
            main_url = 'http://huaban.com' + item
            print('主链接已找到' + main_url)
            if '*' in fileName:
                fileName = fileName.replace('*', '')
            download(main_url, fileName)
    except Exception as e:
        print(e)


def download(main_url, fileName):
    print('-------准备下载中-------')
    try:
        doc = parser(main_url, '#waterfall')
        if not os.path.exists('image\\' + fileName):
            print('创建文件夹...')
            os.makedirs('image\\' + fileName)
        link = doc.xpath('//*[@id="waterfall"]/div/a/@href')
        # print(link)
        i = 0
        for item in link:
            i += 1
            minor_url = 'http://huaban.com' + item
            doc = parser(minor_url, '#pin_view_page')
            img_url = doc.xpath('//*[@id="baidu_image_holder"]/a/img/@src')
            img_url2 = doc.xpath('//*[@id="baidu_image_holder"]/img/@src')
            img_url += img_url2
            try:
                url = 'http:' + str(img_url[0])
                print('正在下载第' + str(i) + '张图片,地址:' + url)
                r = requests.get(url)
                filename = 'image\\{}\\'.format(fileName) + str(i) + '.jpg'
                with open(filename, 'wb') as fo:
                    fo.write(r.content)
            except Exception:
                print('出错了!')
    except Exception:
        print('出错啦!')


if __name__ == '__main__':
    get_main_url()
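
PhantomJS is no longer maintained and newer Selenium releases have dropped support for it. Below is a minimal sketch of the same headless setup with Chrome instead; it is an alternative, not the author's original choice, and the image-disabling flag is a rough equivalent of the --load-images=false argument.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

opts = Options()
opts.add_argument('--headless')                             # render pages without opening a window
opts.add_argument('--blink-settings=imagesEnabled=false')   # skip image downloads, like --load-images=false
browser = webdriver.Chrome(options=opts)
browser.set_window_size(1400, 900)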

Python crawler: scrape v2ex data and save it as CSV


  
import csv
import re

import requests
from bs4 import BeautifulSoup

url = 'https://www.v2ex.com/?tab=all'
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')

articles = []
for article in soup.find_all(class_='cell item'):
    title = article.find(class_='item_title').get_text()
    category = article.find(class_='node').get_text()
    author = re.findall(r'(?<=<a href="/member/).+(?="><img)', str(article))[0]
    u = article.select('.item_title > a')
    link = 'https://www.v2ex.com' + re.findall(r'(?<=href=").+(?=")', str(u))[0]
    articles.append([title, category, author, link])

# The `document` folder must exist; newline='' keeps csv from writing blank rows on Windows.
with open(r'document\v2ex.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['文章标题', '分类', '作者', '文章地址'])
    for row in articles:
        writer.writerow(row)

Python crawler: Wandoujia Design Award, a speed comparison of three crawling methods


  
__author__ = '布咯咯_rieuse'

import asyncio
import multiprocessing
import random
import time

import aiohttp
import pymongo
import requests
from bs4 import BeautifulSoup

# Shared setup for all three methods
clients = pymongo.MongoClient('localhost')
db = clients["wandoujia"]
col = db["info"]

urls = ['http://www.wandoujia.com/award?page={}'.format(num) for num in range(1, 46)]

UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Connection': 'keep-alive',
    'Host': 'www.wandoujia.com',
    'User-Agent': random.choice(UA_LIST)
}

proxies = {
    'http': 'http://123.206.6.17:3128',
    'https': 'http://123.206.6.17:3128'
}


# Method 1: plain requests, one page after another
def method_1():
    start = time.time()
    for url in urls:
        html = requests.get(url, headers=headers, proxies=proxies).text
        soup = BeautifulSoup(html, 'lxml')
        title = soup.find_all(class_='title')
        app_title = soup.find_all(class_='app-title')
        item_cover = soup.find_all(class_='item-cover')
        icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
        for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
            content = {
                'title': title_i.get_text(),
                'app_title': app_title_i.get_text(),
                'item_cover': item_cover_i['data-original'],
                'icon_cover': icon_cover_i['data-original']
            }
            col.insert_one(content)  # insert() is deprecated in newer pymongo
            print('成功插入一组数据' + str(content))
    print('一共用时:' + str(time.time() - start))


# if __name__ == '__main__':
#     method_1()


# Method 2: requests + multiprocessing.Pool
def method_2(url):
    html = requests.get(url, headers=headers, proxies=proxies).text
    soup = BeautifulSoup(html, 'lxml')
    title = soup.find_all(class_='title')
    app_title = soup.find_all(class_='app-title')
    item_cover = soup.find_all(class_='item-cover')
    icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
    for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
        content = {
            'title': title_i.get_text(),
            'app_title': app_title_i.get_text(),
            'item_cover': item_cover_i['data-original'],
            'icon_cover': icon_cover_i['data-original']
        }
        # time.sleep(1)
        col.insert_one(content)
        print('成功插入一组数据' + str(content))


# if __name__ == '__main__':
#     start = time.time()
#     pool = multiprocessing.Pool(4)
#     pool.map(method_2, urls)
#     pool.close()
#     pool.join()
#     print('一共用时:' + str(time.time() - start))


# Method 3: asyncio + aiohttp, the async I/O modules introduced from Python 3.4 onward
def method_3():
    async def get_url(url):
        # async declares a coroutine function; calling it returns a coroutine object.
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as html:
                # await suspends the coroutine until the async I/O result is ready.
                response = await html.text(encoding="utf-8")
                return response

    async def parser(url):
        html = await get_url(url)
        soup = BeautifulSoup(html, 'lxml')
        title = soup.find_all(class_='title')
        app_title = soup.find_all(class_='app-title')
        item_cover = soup.find_all(class_='item-cover')
        icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
        for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
            content = {
                'title': title_i.get_text(),
                'app_title': app_title_i.get_text(),
                'item_cover': item_cover_i['data-original'],
                'icon_cover': icon_cover_i['data-original']
            }
            col.insert_one(content)
            print('成功插入一组数据' + str(content))

    start = time.time()
    loop = asyncio.get_event_loop()
    tasks = [parser(url) for url in urls]
    loop.run_until_complete(asyncio.gather(*tasks))
    print(time.time() - start)


if __name__ == '__main__':
    method_3()
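
method_3 fires all 45 page requests at once. Below is a minimal sketch of capping the concurrency with asyncio.Semaphore; fetch_all is a hypothetical helper and the limit of 5 is an arbitrary assumption, not something the original measures.

import asyncio
import aiohttp

async def fetch_all(urls, limit=5):
    # Allow at most `limit` requests in flight at any moment.
    sem = asyncio.Semaphore(limit)

    async def fetch(url):
        async with sem:
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as resp:
                    return await resp.text(encoding='utf-8')

    return await asyncio.gather(*(fetch(u) for u in urls))

# Example: pages = asyncio.get_event_loop().run_until_complete(fetch_all(urls))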

Python crawler: parse HTML with lxml and print the matching values


  
import requests
import lxml.html

url = 'http://news.ifeng.com/listpage/11502/0/1/rtlist.shtml'
html = requests.get(url).text
doc = lxml.html.fromstring(html)
titles = doc.xpath('//div[@class="newsList"]/ul/li/a/text()')
href = doc.xpath('//div[@class="newsList"]/ul/li/a/@href')
i = 0
for content in titles:
    results = {
        '标题': titles[i],
        '链接': href[i]
    }
    i += 1
    print(results)
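
A compact variant of the same loop, reusing the titles and href lists from the listing above: zip pairs each title with its link, so no manual index is needed.

# Equivalent output without the counter variable.
for title, link in zip(titles, href):
    print({'标题': title, '链接': link})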

Python crawler: scrape dynamic data from Yidian Zixun with Selenium


  
import csv

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Firefox()
driver.implicitly_wait(3)

first_url = 'http://www.yidianzixun.com/channel/c6'
driver.get(first_url)
driver.find_element_by_class_name('icon-refresh').click()
# Press DOWN repeatedly so the channel keeps loading more articles.
for i in range(1, 90):
    driver.find_element_by_class_name('icon-refresh').send_keys(Keys.DOWN)

soup = BeautifulSoup(driver.page_source, 'lxml')
articles = []
for article in soup.find_all(class_='item doc style-small-image style-content-middle'):
    title = article.find(class_='doc-title').get_text()
    source = article.find(class_='source').get_text()
    comment = article.find(class_='comment-count').get_text()
    link = 'http://www.yidianzixun.com' + article.get('href')
    articles.append([title, source, comment, link])
driver.quit()

# The `document` folder must exist; newline='' keeps csv from writing blank rows on Windows.
with open(r'document\yidian.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['文章标题', '作者', '评论数', '文章地址'])
    for row in articles:
        writer.writerow(row)
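
If sending DOWN keys to the refresh icon stops triggering new content, scrolling the window itself is a common alternative. Below is a minimal sketch; scroll_to_load is a hypothetical helper that reuses the driver created in the listing above, and the round count and pause are assumptions.

import time

def scroll_to_load(driver, rounds=30, pause=0.5):
    # Scroll to the bottom repeatedly so the page keeps appending new articles.
    for _ in range(rounds):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(pause)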

Python crawler: scrape Amazon data with Selenium + XPath + bs4 and save to MongoDB


  
import re

import lxml.html
import pymongo
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

MONGO_URL = 'localhost'
MONGO_DB = 'amazon'
MONGO_TABLE = 'amazon-python'
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
KEYWORD = 'python'

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# browser = webdriver.Firefox()
wait = WebDriverWait(browser, 10)
browser.set_window_size(1400, 900)


def search():
    # Open amazon.cn, search for KEYWORD and return the total number of result pages.
    print('正在搜索')
    try:
        browser.get('https://www.amazon.cn/')
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#twotabsearchtextbox'))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#nav-search > form > div.nav-right > div > input')))
        input.send_keys(KEYWORD)
        submit.click()
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#pagn > span.pagnDisabled')))
        get_products()
        print('一共' + total.text + '页')
        return total.text
    except TimeoutException:
        return search()


def next_page(number):
    print('正在翻页', number)
    try:
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#pagnNextString'), '下一页'))
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#pagnNextString')))
        submit.click()
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '.pagnCur'), str(number)))
        get_products()
    except TimeoutException:
        next_page(number)


def get_products():
    # bs4 picks out each result block; lxml pulls the publication dates in parallel.
    try:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#s-results-list-atf')))
        html = browser.page_source
        soup = BeautifulSoup(html, 'lxml')
        doc = lxml.html.fromstring(html)
        date = doc.xpath('//*[@class="s-result-item celwidget "]/div/div[2]/div[1]/span[2]/text()')
        content = soup.find_all(attrs={"id": re.compile(r'result_\d+')})
        for item, time in zip(content, date):
            product = {
                'title': item.find(class_='s-access-title').get_text(),
                'image': item.find(class_='s-access-image cfMarker').get('src'),
                'price': item.find(class_='a-size-base a-color-price s-price a-text-bold').get_text(),
                'date': time
            }
            # save_to_mongo(product)
            print(product)
    except Exception as e:
        print(e)


def save_to_mongo(result):
    try:
        if db[MONGO_TABLE].insert_one(result):  # insert() is deprecated in newer pymongo
            print('存储到mongodb成功', result)
    except Exception:
        print('存储到mongodb失败', result)


def main():
    try:
        total = int(search())
        for i in range(2, total + 1):
            next_page(i)
    except Exception as e:
        print('出错啦', e)
    finally:
        browser.close()


if __name__ == '__main__':
    main()

Python crawler: fetch the Heilongjiang University captcha and log in


  
import requests
from PIL import Image
from bs4 import BeautifulSoup

url1 = 'http://my.hlju.edu.cn/captchaGenerate.portal?'
url2 = 'http://my.hlju.edu.cn/userPasswordValidate.portal'
url3 = 'http://my.hlju.edu.cn/index.portal'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}

# A session keeps the cookies from the captcha request for the login POST that follows.
s = requests.session()
response = s.get(url1, headers=headers)
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# Save the captcha image and show it so it can be typed in by hand; the `img`
# folder must exist, and the raw strings keep '\c' from being read as an escape.
with open(r'img\code.jpg', 'wb') as f:
    f.write(response.content)
img = Image.open(r'img\code.jpg')
img.show()

data = {}
data['Login.Token1'] = '20154433'
data['Login.Token2'] = ''
data['captcha'] = input('输入验证码:')
data['goto'] = 'http://my.hlju.edu.cn/loginSuccess.portal'
data['gotoOnFail'] = 'http://my.hlju.edu.cn/loginFailure.portal'

response2 = s.post(url=url2, data=data, headers=headers)
response3 = s.get(url3, headers=headers)
print(response3.text)


Reposted from: https://blog.csdn.net/ChengYin1124/article/details/116944941