Contents

Python crawler: downloading cast photos of Fast & Furious 8 from Douban
Python crawler: saving Douyu danmu (live comments) to MongoDB
Python crawler: scraping Ximalaya radio audio data
Python crawler: scraping all Shixiseng job listings via packet-capture analysis
Python crawler: batch-downloading high-resolution images from Huaban
Python crawler: scraping V2EX posts and saving them to CSV
Python crawler: comparing the speed of three ways to scrape the Wandoujia Design Award pages
Python crawler: parsing HTML with lxml and printing the extracted values
Python crawler: scraping dynamic content from Yidianzixun with Selenium
Python crawler: Selenium + XPath + bs4, scraping Amazon data into MongoDB
Python crawler: fetching the HLJU captcha and logging in
Python crawler: downloading cast photos of Fast & Furious 8 from Douban

import os
import re
import urllib.request


def douban(url):
    # Fetch the celebrities page and pull out image links and actor names with regexes.
    r = urllib.request.urlopen(url)
    html = r.read().decode('utf-8')
    result = re.findall(r'https://img\d.doubanio.com/img/celebrity/medium/.*.jpg', html)
    result2 = re.findall(r'(?<=title=").\S+', html)
    result2.pop()
    # Deduplicate the names while keeping their original order.
    result3 = sorted(set(result2), key=result2.index)
    result3.pop(-3)
    if not os.path.exists('douban'):
        os.makedirs('douban')
    i = 0
    for link in result:
        # urlretrieve writes the file itself, so there is no need to open it first.
        filename = os.path.join('douban', str(result3[i]) + '.jpg')
        i += 1
        urllib.request.urlretrieve(link, filename)


url = 'https://movie.douban.com/subject/26260853/celebrities'
if __name__ == '__main__':
    douban(url)
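The index-based pairing above raises an IndexError as soon as the name list is shorter than the image list. A minimal alternative (a sketch, reusing the result and result3 lists defined above) is to walk both lists together:

for name, link in zip(result3, result):
    # zip stops at the shorter list, so mismatched lengths no longer crash the loop
    urllib.request.urlretrieve(link, os.path.join('douban', name + '.jpg'))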
Python crawler: saving Douyu danmu (live comments) to MongoDB

# Grabs the danmu stream and saves each user's uid, nickname, level and message text to MongoDB.
__author__ = '布咯咯_rieuse'
__time__ = '2017.6.2'
__github__ = 'https://github.com/rieuse'

import multiprocessing
import re
import socket
import time

import pymongo
import requests
from bs4 import BeautifulSoup

clients = pymongo.MongoClient('localhost')
db = clients["DouyuTV_danmu"]
col = db["info"]

# Connect to the Douyu open barrage server.
client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
host = socket.gethostbyname("openbarrage.douyutv.com")
port = 8601
client.connect((host, port))

# Regexes for picking fields out of the raw key@=value/ packets.
danmu_path = re.compile(b'txt@=(.+?)/cid@')
uid_path = re.compile(b'uid@=(.+?)/nn@')
nickname_path = re.compile(b'nn@=(.+?)/txt@')
level_path = re.compile(b'level@=([1-9][0-9]?)/sahf')


def sendmsg(msgstr):
    # Frame a client message: the 4-byte length twice, then message type code 689, all little-endian.
    msg = msgstr.encode('utf-8')
    data_length = len(msg) + 8
    code = 689
    msgHead = int.to_bytes(data_length, 4, 'little') \
        + int.to_bytes(data_length, 4, 'little') + int.to_bytes(code, 4, 'little')
    client.send(msgHead)
    sent = 0
    while sent < len(msg):
        tn = client.send(msg[sent:])
        sent = sent + tn


def start(roomid):
    msg = 'type@=loginreq/username@=rieuse/password@=douyu/roomid@={}/\0'.format(roomid)
    sendmsg(msg)
    msg_more = 'type@=joingroup/rid@={}/gid@=-9999/\0'.format(roomid)
    sendmsg(msg_more)

    print('--------------- Connected to the live room of {} ---------------'.format(get_name(roomid)))
    while True:
        data = client.recv(1024)
        if not data:
            break
        uid_more = uid_path.findall(data)
        nickname_more = nickname_path.findall(data)
        level_more = level_path.findall(data)
        danmu_more = danmu_path.findall(data)
        if not level_more:
            # Fall back to level 0 when the packet carries no level field.
            level_more = [b'0'] * len(danmu_more)
        for i in range(len(danmu_more)):
            try:
                product = {
                    'uid': uid_more[i].decode('utf-8'),
                    'nickname': nickname_more[i].decode('utf-8'),
                    'level': level_more[i].decode('utf-8'),
                    'danmu': danmu_more[i].decode('utf-8')
                }
                print(product)
                col.insert_one(product)
                print('inserted into MongoDB')
            except Exception as e:
                print(e)


def keeplive():
    # The server drops idle connections, so send a heartbeat every 15 seconds.
    while True:
        msg = 'type@=keeplive/tick@=' + str(int(time.time())) + '/\0'
        sendmsg(msg)
        time.sleep(15)


def get_name(roomid):
    # Look up the streamer's display name from the room page.
    r = requests.get("http://www.douyu.com/" + roomid)
    soup = BeautifulSoup(r.text, 'lxml')
    return soup.find('a', {'class': 'zb-name'}).string


if __name__ == '__main__':
    room_id = input('Enter the room ID: ')
    p1 = multiprocessing.Process(target=start, args=(room_id,))
    p2 = multiprocessing.Process(target=keeplive)
    p1.start()
    p2.start()
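The four regexes above each fish one field out of Douyu's key@=value/ message format. As a rough illustration, the same packets could also be unpacked generically into a dict; this helper is a sketch, not part of the original script, and assumes the simple key@=value/ layout seen in the patterns above:

def parse_stt(payload):
    """Split one key@=value/ message body into a {key: value} dict."""
    fields = {}
    for pair in payload.decode('utf-8', errors='ignore').split('/'):
        if '@=' in pair:
            key, _, value = pair.partition('@=')
            fields[key] = value
    return fields

# Example:
# parse_stt(b'type@=chatmsg/uid@=123/nn@=alice/txt@=hello/level@=12/')
# -> {'type': 'chatmsg', 'uid': '123', 'nn': 'alice', 'txt': 'hello', 'level': '12'}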
Python crawler: scraping Ximalaya radio audio data

__author__ = '布咯咯_rieuse'

import json
import random
import time

import pymongo
import requests
from bs4 import BeautifulSoup
from lxml import etree

clients = pymongo.MongoClient('localhost')
db = clients["XiMaLaYa"]
col1 = db["album2"]
col2 = db["detaile2"]

UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
headers1 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'Proxy-Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(UA_LIST)
}
headers2 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'http://www.ximalaya.com/dq/all/2',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(UA_LIST)
}


def get_url():
    # Walk the 84 channel-listing pages and record every album found on them.
    start_urls = ['http://www.ximalaya.com/dq/all/{}'.format(num) for num in range(1, 85)]
    for start_url in start_urls:
        html = requests.get(start_url, headers=headers1).text
        soup = BeautifulSoup(html, 'lxml')
        for item in soup.find_all(class_="albumfaceOutter"):
            content = {
                'href': item.a['href'],
                'title': item.img['alt'],
                'img_url': item.img['src']
            }
            col1.insert_one(content)
            print('saved one album: ' + item.a['href'])
            print(content)
            another(item.a['href'])
        time.sleep(1)


def another(url):
    # Check whether the album is paginated; if it is, crawl every page.
    html = requests.get(url, headers=headers2).text
    ifanother = etree.HTML(html).xpath('//div[@class="pagingBar_wrapper"]/a[last()-1]/@data-page')
    if len(ifanother):
        num = ifanother[0]
        print('this album has ' + num + ' pages')
        for n in range(1, int(num)):
            print('parsing page {} of {}'.format(n, num))
            url2 = url + '?page={}'.format(n)
            get_m4a(url2)
    get_m4a(url)


def get_m4a(url):
    # Pull the sound ids from the album page, then fetch each track's JSON metadata.
    time.sleep(1)
    html = requests.get(url, headers=headers2).text
    numlist = etree.HTML(html).xpath('//div[@class="personal_body"]/@sound_ids')[0].split(',')
    for i in numlist:
        murl = 'http://www.ximalaya.com/tracks/{}.json'.format(i)
        html = requests.get(murl, headers=headers1).text
        dic = json.loads(html)
        col2.insert_one(dic)
        print('data from ' + murl + ' inserted into MongoDB')


if __name__ == '__main__':
    get_url()
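headers1 and headers2 pick one random User-Agent at import time, so every request in a run goes out with the same UA string. If per-request rotation is wanted, a small helper along these lines would do it (a sketch, reusing UA_LIST and the header dicts defined above):

def random_headers(base):
    # Copy the base headers and swap in a freshly chosen User-Agent.
    h = dict(base)
    h['User-Agent'] = random.choice(UA_LIST)
    return h

# html = requests.get(start_url, headers=random_headers(headers1)).text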
Python crawler: scraping all Shixiseng job listings via packet-capture analysis

import json
import time

import pymongo
import requests

clients = pymongo.MongoClient('localhost')
db = clients["Shixiseng"]
col = db["detail_info"]

# The mobile API found by packet capture; c=%E5%85%A8%E5%9B%BD is the URL-encoded "全国" (nationwide).
urls = ['http://www.shixiseng.com/app/internsvt?c=%E5%85%A8%E5%9B%BD&p={}&t=hot'.format(n) for n in range(1, 3487)]
for url in urls:
    print(url)
    r = requests.get(url)
    html = r.content.decode('utf-8')
    content = json.loads(html)['msg']['b']
    for i in content:
        print('inserting one record:')
        print(i)
        col.insert_one(i)
        time.sleep(0.01)
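The loop above assumes every request returns valid JSON. A slightly more defensive fetch, with a status check and simple exponential backoff, might look like this (a sketch; the helper name and retry count are illustrative, not part of the original):

def fetch_json(url, retries=3):
    for attempt in range(retries):
        r = requests.get(url, timeout=10)
        if r.status_code == 200:
            return r.json()
        time.sleep(2 ** attempt)  # back off before trying again
    return None

# data = fetch_json(url)
# if data:
#     for i in data['msg']['b']:
#         col.insert_one(i)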
Python crawler: batch-downloading high-resolution images from Huaban

__author__ = '布咯咯_rieuse'

import os

import lxml.html
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# browser = webdriver.Firefox()
wait = WebDriverWait(browser, 5)
browser.set_window_size(1400, 900)


def parser(url, param):
    # Load the page, wait for the selector in `param` to appear, then hand the HTML to lxml.
    browser.get(url)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, param)))
    html = browser.page_source
    doc = lxml.html.fromstring(html)
    return doc


def get_main_url():
    print('Opening the home page and looking for board links...')
    try:
        doc = parser('http://huaban.com/boards/favorite/beauty/', '#waterfall')
        name = doc.xpath('//*[@id="waterfall"]/div/a[1]/div[2]/h3/text()')
        u = doc.xpath('//*[@id="waterfall"]/div/a[1]/@href')
        for item, fileName in zip(u, name):
            main_url = 'http://huaban.com' + item
            print('found board link ' + main_url)
            if '*' in fileName:
                # '*' is not allowed in Windows folder names, so strip it.
                fileName = fileName.replace('*', '')
            download(main_url, fileName)
    except Exception as e:
        print(e)


def download(main_url, fileName):
    print('------- preparing download -------')
    try:
        doc = parser(main_url, '#waterfall')
        # Paths below are Windows-style; images are stored under image\<board name>\.
        if not os.path.exists('image\\' + fileName):
            print('creating folder...')
            os.makedirs('image\\' + fileName)
        link = doc.xpath('//*[@id="waterfall"]/div/a/@href')
        # print(link)
        i = 0
        for item in link:
            i += 1
            minor_url = 'http://huaban.com' + item
            doc = parser(minor_url, '#pin_view_page')
            img_url = doc.xpath('//*[@id="baidu_image_holder"]/a/img/@src')
            img_url2 = doc.xpath('//*[@id="baidu_image_holder"]/img/@src')
            img_url += img_url2
            try:
                url = 'http:' + str(img_url[0])
                print('downloading image ' + str(i) + ' from ' + url)
                r = requests.get(url)
                filename = 'image\\{}\\'.format(fileName) + str(i) + '.jpg'
                with open(filename, 'wb') as fo:
                    fo.write(r.content)
            except Exception:
                print('failed to save this image!')
    except Exception:
        print('failed to process this board!')


if __name__ == '__main__':
    get_main_url()
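The script drives PhantomJS, which is no longer supported by recent Selenium releases. A headless Firefox (or Chrome) works as a drop-in replacement for the browser object above; a sketch, assuming geckodriver is installed and on PATH:

from selenium import webdriver

options = webdriver.FirefoxOptions()
options.add_argument('--headless')        # run without opening a window
browser = webdriver.Firefox(options=options)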
Python crawler: scraping V2EX posts and saving them to CSV

import csv
import re

import requests
from bs4 import BeautifulSoup

url = 'https://www.v2ex.com/?tab=all'
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')
articles = []
for article in soup.find_all(class_='cell item'):
    title = article.find(class_='item_title').get_text()
    category = article.find(class_='node').get_text()
    author = re.findall(r'(?<=<a href="/member/).+(?="><img)', str(article))[0]
    u = article.select('.item_title > a')
    link = 'https://www.v2ex.com' + re.findall(r'(?<=href=").+(?=")', str(u))[0]
    articles.append([title, category, author, link])

# newline='' keeps csv from writing blank lines on Windows; the document folder must already exist.
with open(r'document\v2ex.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Title', 'Category', 'Author', 'Link'])
    for row in articles:
        writer.writerow(row)
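A quick way to confirm what ended up in the file is to read it back with csv.DictReader (a sketch, using the path and column names written above):

import csv

with open(r'document\v2ex.csv', encoding='utf-8', newline='') as f:
    for row in csv.DictReader(f):
        print(row['Title'], '->', row['Link'])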
Python crawler: comparing the speed of three ways to scrape the Wandoujia Design Award pages

__author__ = '布咯咯_rieuse'

import asyncio
import multiprocessing
import random
import time

import aiohttp
import pymongo
import requests
from bs4 import BeautifulSoup

# Shared setup
clients = pymongo.MongoClient('localhost')
db = clients["wandoujia"]
col = db["info"]

urls = ['http://www.wandoujia.com/award?page={}'.format(num) for num in range(1, 46)]
UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Connection': 'keep-alive',
    'Host': 'www.wandoujia.com',
    'User-Agent': random.choice(UA_LIST)
}

proxies = {
    'http': 'http://123.206.6.17:3128',
    'https': 'http://123.206.6.17:3128'
}


# Method 1: plain requests, one page at a time
def method_1():
    start = time.time()
    for url in urls:
        html = requests.get(url, headers=headers, proxies=proxies).text
        soup = BeautifulSoup(html, 'lxml')
        title = soup.find_all(class_='title')
        app_title = soup.find_all(class_='app-title')
        item_cover = soup.find_all(class_='item-cover')
        icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
        for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
            content = {
                'title': title_i.get_text(),
                'app_title': app_title_i.get_text(),
                'item_cover': item_cover_i['data-original'],
                'icon_cover': icon_cover_i['data-original']
            }
            col.insert_one(content)
            print('inserted one record: ' + str(content))
    print('total time: ' + str(time.time() - start))

# if __name__ == '__main__':
#     method_1()


# Method 2: requests + multiprocessing Pool
def method_2(url):
    html = requests.get(url, headers=headers, proxies=proxies).text
    soup = BeautifulSoup(html, 'lxml')
    title = soup.find_all(class_='title')
    app_title = soup.find_all(class_='app-title')
    item_cover = soup.find_all(class_='item-cover')
    icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
    for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
        content = {
            'title': title_i.get_text(),
            'app_title': app_title_i.get_text(),
            'item_cover': item_cover_i['data-original'],
            'icon_cover': icon_cover_i['data-original']
        }
        # time.sleep(1)
        col.insert_one(content)
        print('inserted one record: ' + str(content))

# if __name__ == '__main__':
#     start = time.time()
#     pool = multiprocessing.Pool(4)
#     pool.map(method_2, urls)
#     pool.close()
#     pool.join()
#     print('total time: ' + str(time.time() - start))


# Method 3: asyncio + aiohttp, the asynchronous IO stack available since Python 3.4
def method_3():
    async def get_url(url):
        # `async def` declares a coroutine function; calling it returns a coroutine object.
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as html:
                # `await` suspends the coroutine until the asynchronous IO finishes.
                response = await html.text(encoding="utf-8")
                return response

    async def parser(url):
        html = await get_url(url)
        soup = BeautifulSoup(html, 'lxml')
        title = soup.find_all(class_='title')
        app_title = soup.find_all(class_='app-title')
        item_cover = soup.find_all(class_='item-cover')
        icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
        for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
            content = {
                'title': title_i.get_text(),
                'app_title': app_title_i.get_text(),
                'item_cover': item_cover_i['data-original'],
                'icon_cover': icon_cover_i['data-original']
            }
            col.insert_one(content)
            print('inserted one record: ' + str(content))

    start = time.time()
    loop = asyncio.get_event_loop()
    tasks = [parser(url) for url in urls]
    loop.run_until_complete(asyncio.gather(*tasks))
    print(time.time() - start)


if __name__ == '__main__':
    method_3()
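Method 3 launches all 45 requests at once, which a server may throttle or reject. A bounded-concurrency variant (a sketch, not from the original post; the limit of 5 is an arbitrary choice) keeps the async structure but caps the number of requests in flight:

async def fetch(session, url, sem):
    async with sem:                       # at most `limit` requests at a time
        async with session.get(url) as resp:
            return await resp.text(encoding='utf-8')

async def crawl(limit=5):
    sem = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession(headers=headers) as session:
        return await asyncio.gather(*(fetch(session, u, sem) for u in urls))

# pages = asyncio.get_event_loop().run_until_complete(crawl())
# each page can then be parsed with BeautifulSoup exactly as in parser() above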
Python crawler: parsing HTML with lxml and printing the extracted values

import requests
import lxml.html

url = 'http://news.ifeng.com/listpage/11502/0/1/rtlist.shtml'
html = requests.get(url).text
doc = lxml.html.fromstring(html)
titles = doc.xpath('//div[@class="newsList"]/ul/li/a/text()')
href = doc.xpath('//div[@class="newsList"]/ul/li/a/@href')
# Pair each headline with its link and print them one by one.
for title, link in zip(titles, href):
    results = {
        'title': title,
        'link': link
    }
    print(results)
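If the extracted hrefs turn out to be relative paths rather than full URLs, urllib.parse.urljoin can resolve them against the listing page (a sketch, reusing url and href from above):

from urllib.parse import urljoin

full_links = [urljoin(url, h) for h in href]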
Python crawler: scraping dynamic content from Yidianzixun with Selenium

import csv

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Firefox()
driver.implicitly_wait(3)
first_url = 'http://www.yidianzixun.com/channel/c6'
driver.get(first_url)
driver.find_element_by_class_name('icon-refresh').click()
# Send the Down key repeatedly so the page keeps lazy-loading more articles.
for i in range(1, 90):
    driver.find_element_by_class_name('icon-refresh').send_keys(Keys.DOWN)
soup = BeautifulSoup(driver.page_source, 'lxml')
articles = []
for article in soup.find_all(class_='item doc style-small-image style-content-middle'):
    title = article.find(class_='doc-title').get_text()
    source = article.find(class_='source').get_text()
    comment = article.find(class_='comment-count').get_text()
    link = 'http://www.yidianzixun.com' + article.get('href')
    articles.append([title, source, comment, link])
driver.quit()

# newline='' avoids blank lines on Windows; the document folder must already exist.
with open(r'document\yidian.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Title', 'Source', 'Comments', 'Link'])
    for row in articles:
        writer.writerow(row)
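Sending Down-key presses to the refresh icon is one way to force the page to load more items; scrolling with JavaScript is another and does not depend on a focusable element (a sketch; the number of scrolls and the pause are arbitrary choices):

import time

for _ in range(30):
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(0.5)   # give the page a moment to append new items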
Python crawler: Selenium + XPath + bs4, scraping Amazon data into MongoDB

import re

import lxml.html
import pymongo
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

MONGO_URL = 'localhost'
MONGO_DB = 'amazon'
MONGO_TABLE = 'amazon-python'
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
KEYWORD = 'python'
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# browser = webdriver.Firefox()
wait = WebDriverWait(browser, 10)
browser.set_window_size(1400, 900)


def search():
    print('searching...')
    try:
        browser.get('https://www.amazon.cn/')
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#twotabsearchtextbox'))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#nav-search > form > div.nav-right > div > input')))
        input.send_keys(KEYWORD)
        submit.click()
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#pagn > span.pagnDisabled')))
        get_products()
        print(total.text + ' pages in total')
        return total.text
    except TimeoutException:
        return search()


def next_page(number):
    print('turning to page', number)
    try:
        # '下一页' is the "Next page" label shown on amazon.cn, so it is matched as-is.
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#pagnNextString'), '下一页'))
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#pagnNextString')))
        submit.click()
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '.pagnCur'), str(number)))
        get_products()
    except TimeoutException:
        next_page(number)


def get_products():
    try:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#s-results-list-atf')))
        html = browser.page_source
        soup = BeautifulSoup(html, 'lxml')
        doc = lxml.html.fromstring(html)
        date = doc.xpath('//*[@class="s-result-item celwidget "]/div/div[2]/div[1]/span[2]/text()')
        content = soup.find_all(attrs={"id": re.compile(r'result_\d+')})
        for item, time in zip(content, date):
            product = {
                'title': item.find(class_='s-access-title').get_text(),
                'image': item.find(class_='s-access-image cfMarker').get('src'),
                'price': item.find(class_='a-size-base a-color-price s-price a-text-bold').get_text(),
                'date': time
            }
            # save_to_mongo(product)
            print(product)
    except Exception as e:
        print(e)


def save_to_mongo(result):
    try:
        if db[MONGO_TABLE].insert_one(result):
            print('saved to MongoDB:', result)
    except Exception:
        print('failed to save to MongoDB:', result)


def main():
    try:
        total = int(search())
        for i in range(2, total + 1):
            next_page(i)
    except Exception as e:
        print('something went wrong:', e)
    finally:
        browser.close()


if __name__ == '__main__':
    main()
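After a run with the save_to_mongo(product) call uncommented, a one-line check shows how many products were stored (a sketch, reusing the db and MONGO_TABLE defined above):

print(db[MONGO_TABLE].count_documents({}), 'products stored')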
Python crawler: fetching the HLJU captcha and logging in

import os

import requests
from PIL import Image

url1 = 'http://my.hlju.edu.cn/captchaGenerate.portal?'
url2 = 'http://my.hlju.edu.cn/userPasswordValidate.portal'
url3 = 'http://my.hlju.edu.cn/index.portal'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}
s = requests.session()

# Fetch the captcha image with the session so its cookie matches the login request.
response = s.get(url1, headers=headers)
os.makedirs('img', exist_ok=True)
with open(r'img\code.jpg', 'wb') as f:
    f.write(response.content)
img = Image.open(r'img\code.jpg')
img.show()

data = {}
data['Login.Token1'] = '20154433'   # account used in the original post
data['Login.Token2'] = ''           # left blank in the original post
data['captcha'] = input('Enter the captcha: ')
data['goto'] = 'http://my.hlju.edu.cn/loginSuccess.portal'
data['gotoOnFail'] = 'http://my.hlju.edu.cn/loginFailure.portal'
response2 = s.post(url=url2, data=data, headers=headers)
response3 = s.get(url3, headers=headers)
print(response3.text)
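The script just dumps the portal home page at the end; a simpler success check is to look at where the login POST was redirected, since the form's goto/gotoOnFail fields name the success and failure pages (a sketch, assuming the portal redirects as those fields suggest):

if 'loginFailure' in response2.url:
    print('login failed - check the account, password, or captcha')
else:
    print('login looks successful:', response2.url)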
Reprinted from: https://blog.csdn.net/ChengYin1124/article/details/116944941