import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # Load the raw car-price table from the working directory.
      csv_path = "./car_price.csv"
      car_price = pd.read_csv(csv_path)

      # First rows, then column dtypes and non-null counts.
      car_price.head()
      car_price.info()

      # Optional duplicate-row check (disabled).
      # car_price.duplicated().sum()

数据特征具体可区分为3大类：

第一类：汽车ID类属性

1 Car_ID 车号

3 CarName 车名

第二类：类别型变量（10个）

2 Symboling 保险风险评级

4 fueltype 燃料类型

5 aspiration 发动机吸气形式

6 doornumber 车门数

7 carbody 车身型式

8 drivewheel 驱动轮

9 enginelocation 发动机位置

15 enginetype 发动机型号

16 cylindernumber 气缸数

18 fuelsystem 燃油系统

第三类：连续数值型变量（14个）

10 wheelbase 轴距

11 carlength 车长

12 carwidth 车宽

13 carheight 车高

14 curbweight 整备质量（汽车净重）

17 enginesize 发动机尺寸

19 boreratio 气缸横截面面积与冲程比

20 stroke 发动机冲程

21 compressionratio 压缩比

22 horsepower 马力

23 peakrpm 最大功率转速

24 citympg 城市里程（每加仑英里数）

25 highwaympg 高速公路里程（每加仑英里数）

26 price(Dependent variable) 价格（因变量）

查看类别型变量


  
   
    
     
    
    
     
      # Names of the columns treated as categorical features.
      cate_columns = [
          'symboling',
          'fueltype',
          'aspiration',
          'doornumber',
          'carbody',
          'drivewheel',
          'enginelocation',
          'enginetype',
          'fuelsystem',
          'cylindernumber',
      ]

      # Print the distinct values each categorical column takes.
      for column in cate_columns:
          print(column)
          print(set(car_price[column]))

symboling
{0, 1, 2, 3, -2, -1}
fueltype
{'gas', 'diesel'}
aspiration
{'std', 'turbo'}
doornumber
{'two', 'four'}
carbody
{'convertible', 'hatchback', 'wagon', 'sedan', 'hardtop'}
drivewheel
{'4wd', 'fwd', 'rwd'}
enginelocation
{'rear', 'front'}
enginetype
{'ohcv', 'ohcf', 'dohc', 'ohc', 'l', 'rotor', 'dohcv'}
fuelsystem
{'idi', 'mfi', '4bbl', '2bbl', 'mpfi', 'spfi', '1bbl', 'spdi'}
cylindernumber
{'eight', 'six', 'five', 'two', 'four', 'three', 'twelve'}

查看数值型变量


  
   
    
     
    
    
     
      # Feature table: everything except the identifier columns
      # 'car_ID' and 'CarName'.
      id_columns = ['car_ID', 'CarName']
      car_df = car_price.drop(columns=id_columns)

      # Descriptive statistics, used to inspect the numeric columns and to
      # look for abnormal values.
      car_df.describe()
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # Continuous columns are the feature columns that are not listed as
      # categorical; they are box-plotted afterwards to look for outliers.
      all_feature_columns = car_df.columns
      num_cols = all_feature_columns.drop(cate_columns)
      print(num_cols)
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # One box plot per continuous column, to check for outliers.
      import seaborn as sns

      fig = plt.figure(figsize=(12, 8))
      # 3 x 5 grid; subplot positions are 1-based.
      for position, col in enumerate(num_cols, start=1):
          ax = fig.add_subplot(3, 5, position)
          sns.boxplot(data=car_df[col], ax=ax)
          ax.set_title(col)

      plt.subplots_adjust(wspace=0.4, hspace=0.3)
      plt.show()


  
   
    
     
    
    
     
      # Pairwise correlations between the numeric features.
      # Text columns are filtered out explicitly: older pandas dropped them
      # silently inside DataFrame.corr(), newer releases raise instead.
      df_corr = car_df.select_dtypes(include='number').corr()

      # Correlation of each numeric feature with price, largest first.
      df_corr['price'].sort_values(ascending=False)

price               1.000000
enginesize          0.874145
curbweight          0.835305
horsepower          0.808139
carwidth            0.759325
carlength           0.682920
wheelbase           0.577816
boreratio           0.553173
carheight           0.119336
stroke              0.079443
compressionratio    0.067984
symboling          -0.079978
peakrpm            -0.085267
citympg            -0.685751
highwaympg         -0.697599
Name: price, dtype: float64


  
   
    
     
    
    
     
      # Heat map of the correlation matrix; the colour scale is capped at 0.8.
      f, ax = plt.subplots(figsize=(7, 7))
      ax.set_title('Correlation of Numeric Features with Price', y=1, size=16)
      sns.heatmap(df_corr, square=True, vmax=0.8, ax=ax)

二、数据处理

cylindernumber

# Convert the spelled-out cylinder counts to integers.  'two' is part of the
# mapping because it occurs in the data (see the value listing printed for
# cylindernumber earlier); without it that value stayed a string and the
# whole column remained non-numeric.
cylinder_words = {'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'eight': 8, 'twelve': 12}
# astype(int) guarantees an integer column and fails loudly on an unmapped word.
car_price['cylindernumber'] = car_price['cylindernumber'].replace(cylinder_words).astype(int)

CarName


  
   
    
     
    
    
     
      # List the distinct car names.
      print(car_price['CarName'].drop_duplicates())

      # The brand is the first whitespace-separated word of the car name.
      carBrand = car_price['CarName'].str.split(expand=True)[0]

      # Normalise brand spellings so one maker is not counted as several.
      # 'vokswagen' occurs in this data (e.g. 'vokswagen rabbit').
      # NOTE(review): the remaining entries are presumed misspellings/aliases;
      # confirm them against the set printed below and extend as needed.
      brand_fixes = {
          'vokswagen': 'volkswagen',
          'vw': 'volkswagen',
          'maxda': 'mazda',
          'porcshce': 'porsche',
          'toyouta': 'toyota',
          'Nissan': 'nissan',
      }
      carBrand = carBrand.replace(brand_fixes)
      print(set(carBrand))

      # Store the brand on the main table: later cells group and pivot on a
      # 'carBrand' column of the frame copied from car_price.
      car_price['carBrand'] = carBrand

由 carlength构建新特征carSize


  
   
    
     
    
    
     
      # Per the descriptive statistics the car length spans about 141.1-208.1
      # (inches, according to the author's note); it is split into six size
      # classes.  The outer edges are padded by 0.01 so that the shortest and
      # longest cars fall inside the first and last interval.
      shortest = min(car_df.carlength)
      longest = max(car_df.carlength)
      bins = [shortest - 0.01, 145.67, 169.29, 181.10, 192.91, 200.79, longest + 0.01]
      label = ['A00', 'A0', 'A', 'B', 'C', 'D']

      carSize = pd.cut(car_df.carlength, bins, labels=label)
      print(carSize)
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      # Attach the size class to both the full table and the feature table.
      for frame in (car_price, car_df):
          frame['carSize'] = carSize

      # carSize now stands in for the length, so drop the raw column.
      features = car_df.drop(columns=['carlength'])

处理类别型特征

对于类别型特征的取值，有大小意义的数据转换为数值型映射，没有大小意义（不同取值表示类别不同），进行独热编码。

LabelEncoder


  
   
    
     
    
    
     
      # Work on a copy so `features` keeps its original columns.
      features1 = features.copy()

      # Integer-encode the size class.
      # NOTE(review): LabelEncoder numbers the classes in sorted label order
      # ('A' < 'A0' < 'A00' < 'B' < ...), which is not the physical size
      # order A00 < A0 < A; confirm whether an ordered mapping was intended.
      from sklearn.preprocessing import LabelEncoder

      carSize1 = LabelEncoder().fit_transform(features1['carSize'])
      features1['carSize'] = carSize1

      # Display the encoded values (this line previously referenced the
      # undefined name `carSize1a`).
      carSize1

one-hot


  
   
    
     
    
    
     
      # Categorical columns whose values carry no order are one-hot encoded.
      cate = features1.select_dtypes(include='object').columns
      print(cate)

      # Append the indicator columns, then remove the text columns they replace.
      dummies = pd.get_dummies(features1[cate])
      features1 = features1.join(dummies).drop(cate, axis=1)
      features1.head()

特征归一化

获取的原始特征，必须对每一特征分别进行归一化，比如，特征A的取值范围是[-1000,1000]，特征B的取值范围是[-1,1].
如果使用logistic回归，w1*x1+w2*x2，因为x1的取值太大了，所以x2基本起不了作用。
所以，必须进行特征的归一化，每个特征都单独进行归一化。

连续型特征归一化：

1、标准化（z-score，均值为0，方差为1）

2、最大最小值归一化（0-1）

3、线性放缩到 [-1,1]：$x' = \dfrac{2x - x_{\max} - x_{\min}}{x_{\max} - x_{\min}}$

离散型特征（类别型特征）：

离散特征进行one-hot编码后，编码后的特征，其实每一维度的特征都可以看做是连续的特征。就可以跟对连续型特征的归一化方法一样，对每一维特征再进行归一化。比如归一化到[-1,1]或归一化到均值为0,方差为1

因为之前对类别型特征分别进行标签和独热编码，类别型特征已经可以看做连续特征，所以统一对所有特征进行归一化


  
   
    
     
    
    
     
      # Min-max scale every feature column.
      from sklearn import preprocessing

      scaler = preprocessing.MinMaxScaler()
      scaled_values = scaler.fit_transform(features1)
      # Wrap the scaled values in a DataFrame again (no column names are
      # passed, so the columns get default integer labels).
      features1 = pd.DataFrame(scaled_values)
      features1.head()

PCA降维


  
   
    
     
    
    
     
      # PCA keeping enough components to explain 99.99% of the variance.
      from sklearn.decomposition import PCA

      # A float n_components is the fraction of variance to retain
      # (0.9 would keep 90%; 0.9999 keeps 99.99%).
      pca = PCA(n_components=0.9999)
      features2 = pca.fit_transform(features1)

      # Share of the variance explained by each principal component.
      ratio = pca.explained_variance_ratio_
      print('各主成分的解释方差占比：', ratio)

      # Number of components retained.
      print('降维后有几个成分：', len(ratio))

      # Running total of the explained variance (requires numpy as `np`,
      # imported at the top of the file).
      cum_ratio = np.cumsum(ratio)
      print('累计解释方差占比：', cum_ratio)

各主成分的解释方差占比： [2.34835648e-01 1.89291914e-01 1.11193502e-01 6.41024136e-02
 5.90453139e-02 4.54763783e-02 4.21689429e-02 3.65477617e-02
 2.97528000e-02 2.24095237e-02 1.98458305e-02 1.95803021e-02
 1.70780800e-02 1.47611074e-02 1.32208566e-02 1.19093756e-02
 9.01434709e-03 8.74908243e-03 7.28321292e-03 6.65001057e-03
 5.68867886e-03 4.89870846e-03 4.50894857e-03 3.81422315e-03
 3.45197486e-03 2.23759951e-03 2.14676779e-03 1.84529725e-03
 1.56025958e-03 1.22067828e-03 1.12126257e-03 1.03278716e-03
 8.30359553e-04 6.87972243e-04 5.63679041e-04 4.64609849e-04
 3.33065301e-04 2.76366954e-04 1.67241531e-04 1.07861538e-04
 7.49681455e-05]
降维后有几个成分： 41
累计解释方差占比： [0.23483565 0.42412756 0.53532106 0.59942348 0.65846879 0.70394517
 0.74611411 0.78266187 0.81241467 0.8348242  0.85467003 0.87425033
 0.89132841 0.90608952 0.91931037 0.93121975 0.9402341  0.94898318
 0.95626639 0.9629164  0.96860508 0.97350379 0.97801274 0.98182696
 0.98527894 0.98751654 0.9896633  0.9915086  0.99306886 0.99428954
 0.9954108  0.99644359 0.99727395 0.99796192 0.9985256  0.99899021
 0.99932327 0.99959964 0.99976688 0.99987474 0.99994971]


  
   
    
     
    
    
     
      # Bar chart of each component's variance share, overlaid with the same
      # values as a red line and the cumulative share as a blue line.
      plt.figure(figsize=(8, 6))
      X = range(1, len(ratio) + 1)  # 1-based component number
      Y = ratio
      plt.bar(X, Y, edgecolor='black')
      for series, line_format in ((Y, 'r.-'), (cum_ratio, 'b.-')):
          plt.plot(X, series, line_format)
      plt.ylabel('explained_variance_ratio')
      plt.xlabel('PCA')
      plt.show()


  
   
    
     
    
    
     
      # Refit PCA with a fixed number of components (8).
      pca = PCA(n_components=8)
      features3 = pca.fit_transform(features1)

      # Total share of the variance kept by these components
      # (0.7826618733273734 in the recorded run).
      kept_variance = sum(pca.explained_variance_ratio_)
      print(kept_variance)

      features3

三、K-means进行聚类

肘方法看k值


  
   
    
     
    
    
     
      # Elbow method: fit K-Means for k = 1..14, record the inertia (SSE) of
      # each fit, then plot SSE against k.
      from sklearn.cluster import KMeans

      k_values = range(1, 15)
      sse = []
      for k in k_values:
          km = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=300,
                      random_state=0)
          sse.append(km.fit(features3).inertia_)

      plt.plot(k_values, sse, marker='*')
      plt.xlabel('n_clusters')
      plt.ylabel('distortions')
      plt.title("The Elbow Method")
      plt.show()

选择5个聚类点进行聚类


  
   
    
     
    
    
     
      # K-Means with five clusters on the PCA-reduced features.
      kmeans = KMeans(n_clusters=5, init='k-means++', n_init=10, max_iter=300,
                      random_state=0)
      # Fit first, then assign every row to its nearest centre.
      lab = kmeans.fit(features3).predict(features3)
      print(lab)

聚类结果可视化


  
   
    
     
    
    
     
      # 2-D scatter of the clustering result on the first two components.
      plt.figure(figsize=(8, 8))
      plt.scatter(features3[:, 0], features3[:, 1], c=lab)

      # Label every point with its car_ID.  Points and IDs are paired by
      # position, so this works for any number of rows and any index on
      # car_price (the row count used to be hard-coded as 205).
      for (pc1, pc2), car_id in zip(features3[:, :2], car_price.car_ID):
          plt.text(pc1, pc2, s=str(car_id))

      plt.xlabel('PC1')
      plt.ylabel('PC2')
      plt.title('K-Means PCA')
      plt.show()


  
   
    
     
    
    
     
      # 3-D scatter of the clustering result on the first three components.
      from mpl_toolkits.mplot3d import Axes3D

      plt.figure(figsize=(8, 8))
      ax = plt.subplot(111, projection='3d')
      pc1, pc2, pc3 = features3[:, 0], features3[:, 1], features3[:, 2]
      ax.scatter(pc1, pc2, pc3, c=lab)

      # Change the viewing angle; the clusters are easier to tell apart.
      ax.view_init(30, 45)
      ax.set_xlabel('PC1')
      ax.set_ylabel('PC2')
      ax.set_zlabel('PC3')
      plt.show()

轮廓系数判断k值


  
   
    
     
    
    
     
      # For each candidate k, draw a silhouette plot (left) and a 3-D scatter
      # of the clustered points (right).
      from sklearn.datasets import make_blobs
      from sklearn.metrics import silhouette_samples, silhouette_score
      import matplotlib.cm as cm
      from mpl_toolkits.mplot3d import Axes3D

      for n_clusters in range(2, 9):
          fig = plt.figure(figsize=(12, 6))
          ax1 = fig.add_subplot(121)
          ax2 = fig.add_subplot(122, projection='3d')

          # Only the coefficient range [-0.1, 1] is shown on the x axis.
          ax1.set_xlim([-0.1, 1])
          # Room for every sample plus a 10-row gap around each cluster band.
          ax1.set_ylim([0, len(features3) + (n_clusters + 1) * 10])

          km = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10,
                      max_iter=300, random_state=0)
          y_km = km.fit_predict(features3)

          silhouette_avg = silhouette_score(features3, y_km)
          print('n_cluster=', n_clusters, 'The average silhouette_score is :', silhouette_avg)

          # Per-sample coefficients.  (An unused `np.unique(y_km)` lookup was
          # removed here; it was dead code.)
          silhouette_vals = silhouette_samples(features3, y_km, metric='euclidean')

          y_ax_lower = 10
          for i in range(n_clusters):
              # Sorted coefficients of the samples assigned to cluster i.
              c_silhouette_vals = silhouette_vals[y_km == i]
              c_silhouette_vals.sort()
              cluster_i = c_silhouette_vals.shape[0]
              y_ax_upper = y_ax_lower + cluster_i

              color = cm.nipy_spectral(float(i) / n_clusters)
              ax1.fill_betweenx(range(y_ax_lower, y_ax_upper), 0, c_silhouette_vals,
                                edgecolor='none', color=color)
              # Cluster number, placed at the middle of its band.
              ax1.text(-0.05, y_ax_lower + 0.5 * cluster_i, str(i))
              # The next band starts 10 rows above this one.
              y_ax_lower = y_ax_upper + 10

          ax1.set_title('The silhouette plot for the various clusters')
          ax1.set_xlabel('The silhouette coefficient values')
          ax1.set_ylabel('Cluster label')

          # Dashed red line at the average silhouette score.
          ax1.axvline(x=silhouette_avg, color='red', linestyle='--')

          ax1.set_yticks([])
          ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1.0])

          # Right panel: points coloured with the same colour map as the bands.
          colors = cm.nipy_spectral(y_km.astype(float) / n_clusters)
          ax2.scatter(features3[:, 0], features3[:, 1], features3[:, 2], marker='.',
                      s=30, lw=0, alpha=0.7, c=colors, edgecolor='k')

          # Cluster centres: a white disc with the cluster number drawn on it.
          centers = km.cluster_centers_
          ax2.scatter(centers[:, 0], centers[:, 1], centers[:, 2], marker='o',
                      c='white', alpha=1, s=200, edgecolor='k')
          for i, c in enumerate(centers):
              ax2.scatter(c[0], c[1], c[2], marker='$%d$' % i, alpha=1, s=50,
                          edgecolor='k')

          ax2.set_title("The visualization of the clustered data.")
          ax2.set_xlabel("Feature space for the 1st feature")
          ax2.set_ylabel("Feature space for the 2nd feature")
          ax2.view_init(30, 45)

          plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                        "with n_clusters = %d" % n_clusters),
                       fontsize=14, fontweight='bold')

      # Show all figures once every k has been drawn.
      plt.show()

结合轮廓图和3d散点图：当k太小时，单独的集群会合并；而当k太大时，某些集群会被分成多个。

当k=2，每个集群很大且很大部分实例系数接近0，表明集群内很大部分实例接近边界，一些单独的集群被合并了，模型效果不好；

当k=3时，集群‘0’大部分实例轮廓系数低于集群的轮廓分数，且有小部分实例系数小于0趋向-1，说明该部分实例可能已分配给错误的集群；

k=4时，集群‘0’大部分实例轮廓系数低于集群的轮廓分数且接近0，说明这些实例接近边界，该集群可能分为2个单独集群更合适；

k=7或8时，某些集群被分成多个，中心非常接近，导致非常糟糕的模型；

当k为5或6时，大多数实例的轮廓系数都超过虚线（平均轮廓系数），集群看起来很好，聚类效果都不错。按平均轮廓系数排序，k=6 优于 k=5；当k=5时，集群‘3’很大，而k=6时各个集群的分布更均衡一些；

综上所述，k值选取5或6都可以，聚类模型效果都可以，但考虑各集群均衡些，所以选取k=6。


  
   
    
     
    
    
     
      # Final clustering with k=6.
      kmeans = KMeans(n_clusters=6, init='k-means++', n_init=10, max_iter=300,
                      random_state=0)
      y_pred = kmeans.fit_predict(features3)
      print(y_pred)

      # Copy of the full table with the cluster label added as 'km_result'.
      car_df_km = car_price.assign(km_result=y_pred)

[4 4 4 1 5 3 5 5 5 0 4 5 4 5 5 5 4 5 3 3 1 3 3 0 1 1 1 0 1 0 3 3 3 3 3 1 1
 3 3 1 1 1 3 1 3 1 3 5 5 4 3 3 3 1 1 4 4 4 4 3 1 3 1 2 1 5 2 2 2 2 2 5 4 5
 4 0 3 3 3 0 0 3 0 0 0 1 1 1 1 3 2 3 1 1 3 3 1 1 3 1 1 5 5 5 4 4 4 5 2 5 2
 5 2 5 2 5 2 5 3 0 1 1 1 1 0 4 4 4 4 4 1 3 3 1 3 1 0 5 3 3 3 1 1 1 1 5 1 1
 1 5 3 3 1 1 1 1 1 1 2 2 1 1 1 3 3 4 4 4 4 4 4 4 4 1 2 1 1 1 4 4 5 5 2 3 2
 1 1 2 1 3 3 5 2 1 5 5 5 5 5 5 5 5 5 2 5]

四、分析聚类结果


  
   
    
     
    
    
     
      # Number of car models in each cluster.
      ids_by_cluster = car_df_km.groupby('km_result')['car_ID']
      ids_by_cluster.count()

km_result
0    13
1    59
2    20
3    43
4    31
5    39
Name: car_ID, dtype: int64


  
   
    
     
    
    
     
      import pandas as pd

      # Display every column and every row when showing frames.
      for option in ('display.max_columns', 'display.max_rows'):
          pd.set_option(option, None)

      # Number of models of each brand within each cluster.
      car_df_km.groupby(by=['km_result', 'carBrand'])['car_ID'].count()

      # Number of models in each cluster for each brand.
      car_df_km.groupby(by=['carBrand', 'km_result'])['car_ID'].count()


  
   
    
     
    
    
     
      # Narrow view: identifiers, brand and assigned cluster.
      df = car_df_km.loc[:, ['car_ID', 'CarName', 'carBrand', 'km_result']]

      # Rows whose car name contains the spelling 'vokswagen'.
      is_vokswagen = df['CarName'].str.contains("vokswagen")
      print(df.loc[is_vokswagen])
      # In the recorded run this is 'vokswagen rabbit', car_ID 183, cluster 2.

      # Its competitors: every model assigned to cluster 2.
      df.loc[df['km_result'] == 2]


  
   
    
     
    
    
     
      # Competitors of the volkswagen brand across clusters.  Per the author's
      # note, volkswagen models fall into clusters 1, 2, 3 and 5.
      li = [1, 2, 3, 5]
      in_listed_clusters = df['km_result'].isin(li)
      df_volk = df[in_listed_clusters].sort_values(by=['km_result', 'carBrand'])
      df_volk

在全量数据里提取‘vokswagen’车型的竞品车型


  
   
    
     
    
    
     
      # All columns for the models assigned to cluster 2.
      in_cluster_2 = car_df_km['km_result'] == 2
      df0 = car_df_km.loc[in_cluster_2]
      df0.head()


  
   
    
     
    
    
     
      # Feature columns only: drop the identifiers and the cluster label.
      df0_1 = df0.drop(['car_ID', 'CarName', 'km_result'], axis=1)

      # Distribution of every feature for the models in cluster 2.
      fig = plt.figure(figsize=(20, 20))
      # 7 x 4 grid; subplot positions are 1-based.
      for position, c in enumerate(df0_1.columns, start=1):
          ax = fig.add_subplot(7, 4, position)
          # is_numeric_dtype does not depend on the platform's default integer
          # width, unlike comparing the dtype with the strings 'int'/'float'.
          if pd.api.types.is_numeric_dtype(df0_1[c]):
              # Numeric feature: histogram.
              sns.histplot(df0_1[c], ax=ax)
          else:
              # Categorical feature: bar chart of the value counts.  x and y
              # are passed by keyword (recent seaborn no longer accepts them
              # positionally) and y as a plain array so it is matched to x by
              # position rather than by index label.
              counts = df0_1[c].value_counts()
              sns.barplot(x=counts.index, y=counts.to_numpy(), ax=ax)
          ax.set_xlabel('')
          ax.set_title(c)

      plt.subplots_adjust(top=1.2)
      plt.show()

类别型变量取值只有一种的有：
fueltype : {‘diesel’}；enginelocation : {‘front’}；fuelsystem:{'idi'}

这些共性的特征在竞品分析时可不考虑


  
   
    
     
    
    
     
      # Pivot the cluster-2 models by size class, body style, brand and name
      # so they can be compared by size class.
      # pivot_table averages by default, so only the numeric columns are passed
      # as values: older pandas silently dropped text columns, newer releases
      # raise on them.
      group_keys = ['carSize', 'carbody', 'carBrand', 'CarName']
      numeric_cols = df0.select_dtypes(include='number').columns.tolist()
      df2 = df0.pivot_table(index=group_keys, values=numeric_cols)
      df2

				boreratio	car_ID	carheight	carlength	carwidth	citympg	compressionratio	curbweight	enginesize	highwaympg	horsepower	km_result	peakrpm	price	stroke	symboling	wheelbase
carSize	carbody	carBrand	CarName
A0	hatchback	toyota	toyota corolla	3.27	160	52.8	166.3	64.4	38	22.5	2275	110	47	56	2	4500	7788.0	3.35	0	95.7
	sedan	nissan	nissan gt-r	2.99	91	54.5	165.3	63.8	45	21.9	2017	103	50	55	2	4800	7099.0	3.47	1	94.5
	sedan	toyota	toyota corona	3.27	159	53.0	166.3	64.4	34	22.5	2275	110	36	56	2	4500	7898.0	3.35	0	95.7
A	sedan	mazda	mazda glc deluxe	3.39	64	55.5	177.8	66.5	36	22.7	2443	122	42	64	2	4650	10795.0	3.39	0	98.8
		mazda	mazda rx-7 gs	3.43	67	54.4	175.0	66.1	31	22.0	2700	134	39	72	2	4200	18344.0	3.64	0	104.9
		toyota	toyota celica gt	3.27	175	54.9	175.6	66.5	30	22.5	2480	110	33	73	2	4500	10698.0	3.35	-1	102.4
		volkswagen	vokswagen rabbit	3.01	183	55.7	171.7	65.5	37	23.0	2261	97	46	52	2	4800	7775.0	3.40	2	97.3
			volkswagen model 111	3.01	185	55.7	171.7	65.5	37	23.0	2264	97	46	52	2	4800	7995.0	3.40	2	97.3
			volkswagen rabbit custom	3.01	193	55.1	180.2	66.9	33	23.0	2579	97	38	68	2	4500	13845.0	3.40	0	100.4
			volkswagen super beetle	3.01	188	55.7	171.7	65.5	37	23.0	2319	97	42	68	2	4500	9495.0	3.40	2	97.3
B	hardtop	buick	buick century	3.58	70	54.9	187.5	70.3	22	21.5	3495	183	25	123	2	4350	28176.0	3.64	0	106.7
	sedan	buick	buick electra 225 custom	3.58	68	56.5	190.9	70.3	22	21.5	3515	183	25	123	2	4350	25552.0	3.64	-1	110.0
		peugeot	peugeot 304	3.70	109	56.7	186.7	68.4	28	21.0	3197	152	33	95	2	4150	13200.0	3.52	0	107.9
			peugeot 504	3.70	117	56.7	186.7	68.4	28	21.0	3252	152	33	95	2	4150	17950.0	3.52	0	107.9
			peugeot 604sl	3.70	113	56.7	186.7	68.4	28	21.0	3252	152	33	95	2	4150	16900.0	3.52	0	107.9
		volvo	volvo 246	3.01	204	55.5	188.8	68.9	26	23.0	3217	145	27	106	2	4800	22470.0	3.40	-1	109.1
	wagon	buick	buick century luxus (sw)	3.58	69	58.7	190.9	70.3	22	21.5	3750	183	25	123	2	4350	28248.0	3.64	-1	110.0
C	wagon	peugeot	peugeot 504	3.70	111	58.7	198.9	68.4	25	21.0	3430	152	25	95	2	4150	13860.0	3.52	0	114.2
C	wagon	peugeot	peugeot 505s turbo diesel	3.70	115	58.7	198.9	68.4	25	21.0	3485	152	25	95	2	4150	17075.0	3.52	0	114.2
D	sedan	buick	buick skyhawk	3.58	71	56.3	202.6	71.7	22	21.5	3770	183	25	123	2	4350	31600.0	3.64	-1	115.6

集群2中所有的车型大小级别为：A0小型车、A紧凑型车、B中型车、C中大型车、D豪华型车。
car_ID 为 183 的 vokswagen rabbit 属于 A 级紧凑型车；集群2中的 A 级车共 7 辆（含其本身），因此其最直接的细分竞品是同属 A 级的其余 6 辆车。


  
   
    
     
    
    
     
      # Size class 'A' models within cluster 2.
      is_class_a = df0['carSize'] == 'A'
      df0_A = df0.loc[is_class_a]
      df0_A

      # Their object-dtype (text-valued categorical) columns.
      ate_col = df0_A.select_dtypes(include='object').columns
      df3 = df0_A[ate_col]
      df3


  
   
    
     
    
    
     
      # Pivot the class-A models of cluster 2 (df0_A is filtered from the
      # km_result == 2 rows) by brand, name, door count, aspiration and
      # drive wheel.
      # pivot_table averages by default, so only the numeric columns are passed
      # as values: older pandas silently dropped text columns, newer releases
      # raise on them.
      group_keys = ['carBrand', 'CarName', 'doornumber', 'aspiration', 'drivewheel']
      numeric_cols = df0_A.select_dtypes(include='number').columns.tolist()
      df4 = df0_A.pivot_table(index=group_keys, values=numeric_cols)
      df4

包含‘vokswagen rabbit’在内的7辆A级车均有4个气缸；按上面的透视表，冲程范围在3.35-3.64，最大功率转速范围在4200-4800，压缩比范围在22.0-23.0，车身宽范围在65.5-66.9，车高范围在54.4-55.7，气缸横截面面积与冲程比范围在3.01-3.43；以上这些数据都是比较相似的。

一般汽车关注点在：车型级别（carSize）、品牌（carBrand）、动力性能（马力horsepower）、质量安全（Symboling ）、油耗（citympg、highwaympg）、空间体验（轴距wheelbase）、车身（carbody、curbweight）等等。

下面提取其他一些不同关键特征进行考量‘vokswagen rabbit’与其他竞品之间的差异化：

基本信息：‘carBrand’，‘doornumber’, ‘curbweight’

油耗：‘highwaympg’、‘citympg’

安全性：‘symboling’

底盘制动：‘drivewheel’

动力性能：‘aspiration’, ‘enginesize’, ‘horsepower’

空间体验：‘wheelbase’

价格： ‘price’


  
   
    
     
    
    
     
      # Fuel economy of the class-A models ('citympg', 'highwaympg').
      lab = df0_A['CarName']
      bar_rows = range(len(lab))

      fig, ax = plt.subplots(figsize=(10, 8))
      # Highway bars first, city bars drawn over them at the same positions.
      for column, bar_colour in (('highwaympg', 'red'), ('citympg', 'blue')):
          ax.barh(bar_rows, df0_A[column], tick_label=lab, color=bar_colour)

      # Write each value at the end of its bar.
      for i, (highway, city) in enumerate(zip(df0_A['highwaympg'], df0_A['citympg'])):
          ax.text(highway, i, highway, ha='right')
          ax.text(city, i, city, ha='right')

      plt.legend(('highwaympg', 'citympg'), loc='upper right')
      plt.title('miles per gallon')
      plt.show()


  
   
    
     
    
    
     
      # Bar charts for six further features of the class-A models.
      colors = ['yellow', 'blue', 'green', 'red', 'gray', 'tan', 'darkviolet']
      col2 = ['symboling', 'wheelbase', 'enginesize', 'horsepower', 'curbweight', 'price']
      data = df0_A[col2]

      fig = plt.figure(figsize=(10, 8))
      # 3 x 2 grid, one panel per feature; positions are 1-based.
      for position, c in enumerate(data.columns, start=1):
          ax = fig.add_subplot(3, 2, position)
          ax.barh(range(len(lab)), data[c], tick_label=lab, color=colors)
          # Print each value at the end of its bar.
          for y, x in enumerate(data[c].values):
              ax.text(x, y, "%s" % x)
          ax.set_xlabel('')
          ax.set_title(c)

      plt.subplots_adjust(top=1.2, wspace=0.7)
      plt.show()

由上面条形图，‘vokswagen rabbit’与其他竞品相比：

质量安全方面：其保险风险评级为2，比马自达品牌和丰田品牌车型相对更具有风险；

车身空间方面：轴距是最小的；

动力方面：发动机尺寸和马力都是最小的；

车重方面：整备质量最小的；

价格方面：价格是最小的；
综上所述，‘vokswagen rabbit’与集群2中同是A级的竞品相比：

劣势：质量安全性偏低、车身空间偏小、动力马力偏小

优势：车身轻、油耗低、价格低（在类似的配置中性价比非常高）

设计特点：双车门三厢车

产品定位：“经济适用、城市代步紧凑型A级轿车”

建议：在销售推广时，可偏重于：①同类配置车型中超高的性价比；②油耗低，城市代步非常省油省钱；③车身小巧，停车方便；④双车门设计，个性独特

【算法竞赛学习】数据分析达人赛3:汽车产品聚类分析

转载：https://blog.csdn.net/m0_51933492/article/details/127397390

查看评论

小言_互联网的博客

小言_互联网的博客

个人资料

文章分类

文章存档

阅读排行

评论排行

推荐文章

【数据分析】数据分析达人赛3:汽车产品聚类分析

赛题简介

赛题背景

赛题数据

一、查看数据