
Python crawler mini-project: scraping JD.com product information


JD.com product data is fairly easy to scrape. The approach, with a minimal end-to-end sketch after this list:
1. JD's listing pages are rendered with JavaScript, so use the selenium library to fetch the fully rendered page source.
2. With the source in hand, use the regular-expression library (re) and the well-known BeautifulSoup library to parse out the product attributes you need, such as the name, price, and review count.
3. Save the parsed data to a CSV file with the pandas library.
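
Here is that pipeline in miniature. The listing URL and the gl-item / p-name class names come from the full script below; the output filename is an illustrative placeholder, and it assumes Chrome plus a matching chromedriver are installed:

from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

driver = webdriver.Chrome()
driver.get('https://list.jd.com/list.html?cat=9987,653,655&page=1')
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Each product on a JD listing page is an <li class="gl-item">
names = []
for li in soup.find_all('li', class_='gl-item'):
    name_div = li.find('div', class_='p-name')
    if name_div:    # skip cards that lack a name block
        names.append(name_div.get_text(strip=True))

pd.DataFrame({'name': names}).to_csv('phones.csv', index=False)
driver.quit()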

The full code is as follows:

# Scrape JD.com mobile phone information
import requests
from bs4 import BeautifulSoup 
from selenium import webdriver
import re
import pandas as pd
options = webdriver.ChromeOptions()
# This step matters: it enables "developer mode" so that major sites cannot
# detect that Selenium is driving the browser (uncomment the next line if needed)
# options.add_experimental_option('excludeSwitches', ['enable-automation'])
# Disable image loading to speed up page rendering
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
driver = webdriver.Chrome(options=options)
driver.maximize_window()
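# Optional (not in the original post): if you do not need to watch the
# browser work, Chrome can run headless instead, e.g.
#   options.add_argument('--headless')
# added before webdriver.Chrome(options=options) is created.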

def get_detil(url):
    # Fetch a product detail page with requests (the spec list is in the
    # static HTML, so selenium is not needed here) and extract ten attributes.
    detil_list=[]
    # Replace the user-agent and cookie with values copied from your own browser session
    headers={
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
                'cookie': '__jdu=1503776177; shshshfpa=fc731dcb-bbe0-5ef8-a758-feb8361f1279-1558793417; shshshfpb=jXoIibDZ2Cg1j2c7AzOnLpQ%3D%3D; unpl=V2_ZzNtbUEFRhV1Wk8Dch1ZAGIAFl0RAxcWc1gTVi5OXQVnBhFdclRCFX0URlVnGlgUZAEZXkpcQBNFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHsdXQdhBRVdRFFzJXI4dmR%2fH1wBZwQiXHJWc1chVERScxldSGcHE19EUUQVcw52VUsa; user-key=72ca3dde-5dc8-457b-a8cc-732a9ad2944e; cn=0; PCSYCityID=CN_500000_500100_0; areaId=4; ipLoc-djd=4-113-9786-0; __jdv=122270672|baidu|-|organic|%25E7%2588%25AC%25E5%258F%2596%25E4%25BA%25AC%25E4%25B8%259C%25E5%2595%2586%25E5%2593%2581%25E4%25BF%25A1%25E6%2581%25AF|1573982428458; mt_xid=V2_52007VwMWU19eVF0fTx9sV28ARwcJWFBGSxlJVRliAhtWQVAAD09VSVQMZwUVW11RBlsYeRpdBW8fElJBW1NLHksSXAZsAhdiX2hSahZKGlQCbwUWU21YVF4b; shshshfp=502935447455162f98afcb9bb4fbd4fc; shshshsID=cb05e3feb787fd0b6a52a993c47979a2_12_1574193435659; __jda=122270672.1503776177.1546445332.1574186022.1574190631.44; __jdb=122270672.12.1503776177|44.1574190631; __jdc=122270672; 3AB9D23F7A4B3C9B=F3DVZOZZIY4HWG2IDIZMN2EKWAM7OPZYR7EZBWT5HFZGUUV7UQTHSXYEW6A55TWAQVO3KOVJP7G64CNVJK4ABS4GCQ'
            }
    r=requests.get(url,headers=headers)
    soup=BeautifulSoup(r.text,'html.parser')
    # The spec list lives in <ul class="parameter2 p-parameter-list">
    all_detil=soup.find_all('ul',class_="parameter2 p-parameter-list")
    # Each attribute is cut out with a lookbehind/lookahead pair: everything
    # between the Chinese spec label and the closing </li> tag
    good_weight=re.findall('(?<=商品毛重:)(.+?)(?=</li>)',str(all_detil))    # gross weight
    good_cpu=re.findall('(?<=CPU型号:)(.+?)(?=</li>)',str(all_detil))    # CPU model
    good_yunhnc=re.findall('(?<=运行内存:)(.+?)(?=</li>)',str(all_detil))    # RAM
    good_jscc=re.findall('(?<=机身存储:)(.+?)(?=</li>)',str(all_detil))    # storage
    good_cck=re.findall('(?<=存储卡:)(.+?)(?=</li>)',str(all_detil))    # memory card
    good_hzsxt=re.findall('(?<=后摄主摄像素:)(.+?)(?=</li>)',str(all_detil))    # rear camera
    good_qzsxt=re.findall('(?<=前摄主摄像素:)(.+?)(?=</li>)',str(all_detil))    # front camera
    # The parentheses in 电池容量(mAh) must be escaped; unescaped they form a
    # capture group and re.findall would return tuples instead of strings
    good_dcrl=re.findall(r'(?<=电池容量\(mAh\):)(.+?)(?=</li>)',str(all_detil))    # battery capacity
    good_ccxt=re.findall('(?<=操作系统:)(.+?)(?=</li>)',str(all_detil))    # operating system
    good_jsys=re.findall('(?<=机身颜色:)(.+?)(?=</li>)',str(all_detil))    # body color
    detil_list.append(good_weight)
    detil_list.append(good_cpu)
    detil_list.append(good_yunhnc)
    detil_list.append(good_jscc)
    detil_list.append(good_cck)
    detil_list.append(good_hzsxt)
    detil_list.append(good_qzsxt)
    detil_list.append(good_dcrl)
    detil_list.append(good_ccxt)
    detil_list.append(good_jsys)
    return detil_list
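
# Example call (hypothetical item URL, for illustration only):
#   get_detil('https://item.jd.com/100000000000.html')
# Each element of the returned list is itself a re.findall result list,
# and a field comes back empty when that spec is missing from the page.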

for i in range(43,53):    # listing pages 43-52; adjust the range as needed
    url = 'https://list.jd.com/list.html?cat=9987,653,655&page={}&sort=sort_rank_asc&trans=1&JL=6_0_0&ms=10#J_main'.format(str(i))
    # print("Visiting {}".format(url))
    driver.get(url)
    # Parse the rendered page source and collect one <li> per product card
    data = driver.page_source
    soup1 = BeautifulSoup(data, 'html.parser')
    all_list = soup1.find_all('li', class_="gl-item")
    # Raw tag fragments pulled from each product card
    div_name=[]
    div_price=[]
    div_ljpj=[]
    div_inurl=[]
    
    # Parsed values: name, price, review count (ljpj) and detail-page URL
    name=[]
    price=[]
    ljpj=[]
    inurl=[]
    indetil=[]
    
    # d1-d10 collect the ten spec fields returned by get_detil
    d1=[]
    d2=[]
    d3=[]
    d4=[]
    d5=[]
    d6=[]
    d7=[]
    d8=[]
    d9=[]
    d10=[]
    
    for good in all_list:
        div_name.append(good.find_all('div',class_="p-name"))
        div_price.append(good.find_all('div',class_="p-price"))
        div_ljpj.append(good.find_all('div',class_="p-commit"))
        div_inurl.append(good.find_all('div',class_="p-img"))
    # Re-parse the stringified fragments so find_all can run across all of them
    soup_name=BeautifulSoup(str(div_name),'html.parser')
    # soup_price=BeautifulSoup(str(div_price),'html.parser')
    soup_ljpj=BeautifulSoup(str(div_ljpj),'html.parser')
    soup_inurl=BeautifulSoup(str(div_inurl),'html.parser')
    all_name=soup_name.find_all('em')
    # all_price=soup_price.find_all('i')    # price is pulled out with a regex below instead
    all_ljpj=soup_ljpj.find_all('a',class_="comment")
    all_inurl=soup_inurl.find_all('a')

    for names in all_name:
        name.append(names.get_text().strip())
    for ljpjs in all_ljpj:
        ljpj.append(ljpjs.get_text().strip())
    for prices in div_price:
        # Crude price extraction: take the first run of two or more digits.
        # This drops the decimal part and misses single-digit prices
        p=re.findall('[1-9][0-9]{1,}',str(prices))
        if p==[]:
            price.append([])
        else:
            price.append(p[0])
    for inurls in all_inurl:
        inurl.append(inurls['href'])
    
    for link in inurl:
        # Listing pages use protocol-relative links ("//item.jd.com/..."),
        # so prepend "https:" when the scheme is missing
        if link[0:7]=='http://' or link[0:8]=='https://':
            indetil=get_detil(link)
        else:
            indetil=get_detil('https:'+link)
        d1.append(indetil[0])
        d2.append(indetil[1])
        d3.append(indetil[2])
        d4.append(indetil[3])
        d5.append(indetil[4])
        d6.append(indetil[5])
        d7.append(indetil[6])
        d8.append(indetil[7])
        d9.append(indetil[8])
        d10.append(indetil[9])
    dataframe = pd.DataFrame({'name':name,'price':price,'ljpj':ljpj,'d1':d1,'d2':d2,'d3':d3,'d4':d4,'d5':d5,'d6':d6,'d7':d7,'d8':d8,'d9':d9,'d10':d10})
    # Append this page to the CSV. index=False drops the row index and
    # header=False keeps the column names from repeating on every append;
    # gbk encoding lets Excel on Chinese Windows open the file directly
    dataframe.to_csv("D:\\新桌面\\1.csv",index=False,sep=',',mode='a',header=False,encoding="gbk")
    print('Page', i, 'scraped')
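
Two practical notes on the script above. First, because each page is appended with header=False, the finished CSV has no column names. A minimal sketch (same column names and path as the script; note it overwrites any existing file) that writes the header row once before the main loop runs:

import pandas as pd

columns = ['name','price','ljpj','d1','d2','d3','d4','d5','d6','d7','d8','d9','d10']
# Write only the header row; the main loop then appends data rows beneath it
pd.DataFrame(columns=columns).to_csv("D:\\新桌面\\1.csv", index=False, encoding="gbk")

Second, the detail-page loop fires one request per product with no pause. Adding something like time.sleep(random.uniform(1, 3)) between get_detil calls makes the crawler much less likely to be throttled or blocked.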
    

Reposted from: https://blog.csdn.net/Dig_DD/article/details/106165589