Product information on JD.com is fairly easy to scrape. The idea:
1. JD product pages are rendered with JavaScript, so use the selenium library to fetch the rendered page source.
2. With the source in hand, parse out the product attributes you need (name, price, review count, and so on) using the regular-expression module (re) and the famous "Beautiful Soup" (BeautifulSoup) library.
3. Save the parsed data to a CSV file with pandas (all three steps are sketched below).
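Before the full script, here is a minimal sketch of the three steps on a toy scale. It assumes Chrome and a matching chromedriver are installed; the URL and the parsed fields are placeholders, not JD's real markup:

import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()                   # step 1: a real browser executes the JS...
driver.get('https://example.com/list')        # placeholder URL
html = driver.page_source                     # ...then hands over the rendered HTML
driver.quit()

soup = BeautifulSoup(html, 'html.parser')     # step 2: parse out the fields you need
names = [em.get_text().strip() for em in soup.find_all('em')]
print(re.findall(r'[1-9][0-9]+', html)[:5])   # crude digit-run match, as in the post

pd.DataFrame({'name': names}).to_csv('out.csv', index=False)  # step 3: persist with pandas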
The complete code follows:
# Scrape JD mobile phone listings
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import pandas as pd

options = webdriver.ChromeOptions()
# This matters: hiding the automation switch makes it harder for sites to
# detect that Selenium is driving the browser (enable it if you get blocked)
# options.add_experimental_option('excludeSwitches', ['enable-automation'])
# Skip image loading to speed up page fetches
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
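# (Optional tweak, not in the original post) run Chrome without a visible
# window; '--headless' is a standard Chrome command-line switch:
# options.add_argument('--headless')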
driver = webdriver.Chrome(options=options)
driver.maximize_window()
def get_detil(url):
    # Fetch one product detail page with requests (the spec list is served in
    # the static HTML, so no JS rendering is needed) and pull ten spec fields.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        # Replace with a fresh cookie from your own session; this one expires
        'cookie': '__jdu=1503776177; shshshfpa=fc731dcb-bbe0-5ef8-a758-feb8361f1279-1558793417; shshshfpb=jXoIibDZ2Cg1j2c7AzOnLpQ%3D%3D; unpl=V2_ZzNtbUEFRhV1Wk8Dch1ZAGIAFl0RAxcWc1gTVi5OXQVnBhFdclRCFX0URlVnGlgUZAEZXkpcQBNFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHsdXQdhBRVdRFFzJXI4dmR%2fH1wBZwQiXHJWc1chVERScxldSGcHE19EUUQVcw52VUsa; user-key=72ca3dde-5dc8-457b-a8cc-732a9ad2944e; cn=0; PCSYCityID=CN_500000_500100_0; areaId=4; ipLoc-djd=4-113-9786-0; __jdv=122270672|baidu|-|organic|%25E7%2588%25AC%25E5%258F%2596%25E4%25BA%25AC%25E4%25B8%259C%25E5%2595%2586%25E5%2593%2581%25E4%25BF%25A1%25E6%2581%25AF|1573982428458; mt_xid=V2_52007VwMWU19eVF0fTx9sV28ARwcJWFBGSxlJVRliAhtWQVAAD09VSVQMZwUVW11RBlsYeRpdBW8fElJBW1NLHksSXAZsAhdiX2hSahZKGlQCbwUWU21YVF4b; shshshfp=502935447455162f98afcb9bb4fbd4fc; shshshsID=cb05e3feb787fd0b6a52a993c47979a2_12_1574193435659; __jda=122270672.1503776177.1546445332.1574186022.1574190631.44; __jdb=122270672.12.1503776177|44.1574190631; __jdc=122270672; 3AB9D23F7A4B3C9B=F3DVZOZZIY4HWG2IDIZMN2EKWAM7OPZYR7EZBWT5HFZGUUV7UQTHSXYEW6A55TWAQVO3KOVJP7G64CNVJK4ABS4GCQ'
    }
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    # The spec table lives in this <ul> on a JD product page
    all_detil = soup.find_all('ul', class_="parameter2 p-parameter-list")
    # Each value sits between its Chinese label and the closing </li>
    good_weight = re.findall(r'(?<=商品毛重:)(.+?)(?=</li>)', str(all_detil))    # gross weight
    good_cpu = re.findall(r'(?<=CPU型号:)(.+?)(?=</li>)', str(all_detil))       # CPU model
    good_yunhnc = re.findall(r'(?<=运行内存:)(.+?)(?=</li>)', str(all_detil))   # RAM
    good_jscc = re.findall(r'(?<=机身存储:)(.+?)(?=</li>)', str(all_detil))     # on-board storage
    good_cck = re.findall(r'(?<=存储卡:)(.+?)(?=</li>)', str(all_detil))        # memory card
    good_hzsxt = re.findall(r'(?<=后摄主摄像素:)(.+?)(?=</li>)', str(all_detil))  # rear main camera
    good_qzsxt = re.findall(r'(?<=前摄主摄像素:)(.+?)(?=</li>)', str(all_detil))  # front main camera
    # The parentheses in this label must be escaped, or re treats them as a group
    good_dcrl = re.findall(r'(?<=电池容量\(mAh\):)(.+?)(?=</li>)', str(all_detil))  # battery capacity
    good_ccxt = re.findall(r'(?<=操作系统:)(.+?)(?=</li>)', str(all_detil))     # operating system
    good_jsys = re.findall(r'(?<=机身颜色:)(.+?)(?=</li>)', str(all_detil))     # body colour
    detil_list = [good_weight, good_cpu, good_yunhnc, good_jscc, good_cck,
                  good_hzsxt, good_qzsxt, good_dcrl, good_ccxt, good_jsys]
    return detil_list
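# The lookbehind/lookahead pairs above do the heavy lifting: (?<=label:)
# anchors just past a spec label, (.+?) captures the value lazily, and
# (?=</li>) stops at the closing tag without consuming it. A toy
# demonstration on made-up HTML (only the labels mirror JD's spec list):
#
#   >>> import re
#   >>> snippet = '<li>商品毛重:200.00g</li><li>CPU型号:骁龙855</li>'
#   >>> re.findall(r'(?<=商品毛重:)(.+?)(?=</li>)', snippet)
#   ['200.00g']
#   >>> re.findall(r'(?<=CPU型号:)(.+?)(?=</li>)', snippet)
#   ['骁龙855']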
for i in range(43, 53):  # listing pages 43-52
    url = 'https://list.jd.com/list.html?cat=9987,653,655&page={}&sort=sort_rank_asc&trans=1&JL=6_0_0&ms=10#J_main'.format(str(i))
    # print("Visiting {}".format(url))
    driver.get(url)
    data = driver.page_source
    soup1 = BeautifulSoup(data, 'html.parser')
    # Every product on the listing page is an <li class="gl-item">
    all_list = soup1.find_all('li', class_="gl-item")
    div_name, div_price, div_ljpj, div_inurl = [], [], [], []
    name, price, ljpj, inurl = [], [], [], []
    # One accumulator per spec field returned by get_detil()
    d1, d2, d3, d4, d5, d6, d7, d8, d9, d10 = ([] for _ in range(10))
    for good in all_list:
        div_name.append(good.find_all('div', class_="p-name"))
        div_price.append(good.find_all('div', class_="p-price"))
        div_ljpj.append(good.find_all('div', class_="p-commit"))
        div_inurl.append(good.find_all('div', class_="p-img"))
    # Re-parse the stringified fragments so one search covers all items at once
    soup_name = BeautifulSoup(str(div_name), 'html.parser')
    # soup_price = BeautifulSoup(str(div_price), 'html.parser')
    soup_ljpj = BeautifulSoup(str(div_ljpj), 'html.parser')
    soup_inurl = BeautifulSoup(str(div_inurl), 'html.parser')
    all_name = soup_name.find_all('em')
    # all_price = soup_price.find_all('i')
    all_ljpj = soup_ljpj.find_all('a', class_="comment")
    all_inurl = soup_inurl.find_all('a')
    for names in all_name:
        name.append(names.get_text().strip())
    for ljpjs in all_ljpj:
        ljpj.append(ljpjs.get_text().strip())
    for prices in div_price:
        # Take the first multi-digit run (no leading zero) as the price
        p = re.findall(r'[1-9][0-9]{1,}', str(prices))
        if p == []:
            price.append([])  # placeholder keeps the columns aligned
        else:
            price.append(p[0])
    for inurls in all_inurl:
        inurl.append(inurls['href'])
    for link in inurl:
        # Listing links are usually protocol-relative ("//item.jd.com/..."),
        # so prepend the scheme when it is missing
        if link.startswith(('http://', 'https://')):
            indetil = get_detil(link)
        else:
            indetil = get_detil('https:' + link)
        # Fan the ten returned fields out into their accumulators
        for lst, val in zip((d1, d2, d3, d4, d5, d6, d7, d8, d9, d10), indetil):
            lst.append(val)
    dataframe = pd.DataFrame({'name': name, 'price': price, 'ljpj': ljpj,
                              'd1': d1, 'd2': d2, 'd3': d3, 'd4': d4, 'd5': d5,
                              'd6': d6, 'd7': d7, 'd8': d8, 'd9': d9, 'd10': d10})
    # Append this page's rows to the CSV; index=False drops the row index,
    # header=False because we append page by page
    dataframe.to_csv("D:\\新桌面\\1.csv", index=False, sep=',', mode='a', header=False, encoding="gbk")
    print('Page', i, 'scraped')

driver.quit()
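One caveat about the to_csv call: because header=False is passed on every page, the output file never gets a column header row. A small guard (a hypothetical tweak, not in the original post) writes the header only on the first append:

import os

out = "D:\\新桌面\\1.csv"
dataframe.to_csv(out, index=False, mode='a',
                 header=not os.path.exists(out), encoding="gbk")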
Reposted from: https://blog.csdn.net/Dig_DD/article/details/106165589