"pythonic生物人"的第155篇分享
往期精彩戳:NGS精进 | 统计精进 | py基础 | py绘图 | perl基础 | R绘图
❝本文分享最常用的「11个分布(Distribution)关系图」。
续前篇:
python可视化45|最常用10个关联(Correlation)关系图
❞
目录

四、分布(Distribution)关系图
 
 21、连续变量堆积直方图(Stacked Histogram for Continuous Variable)
该图展示给定连续变量的频率分布。
   
    - 
     
      
     
     
      
       # Import Data
      
     
df = pd.read_csv("./datasets/mpg_ggplot2.csv")

# Prepare data: one list of `x_var` values per `groupby_var` group.
x_var = 'displ'
groupby_var = 'class'
df_agg = df.loc[:, [x_var, groupby_var]].groupby(groupby_var)
# Labels and values come from the same groupby iteration, so every legend
# entry is tied to its own dataset instead of relying on two separately
# computed orderings.
group_names = [name for name, _ in df_agg]
vals = [frame[x_var].values.tolist() for _, frame in df_agg]

# Draw
plt.figure(figsize=(10, 6), dpi=80)
# Spread the groups evenly over the Set1 colormap. max(..., 1) avoids a
# ZeroDivisionError when the data contains a single group.
color_span = float(max(len(vals) - 1, 1))
colors = [plt.cm.Set1(i / color_span) for i in range(len(vals))]
n, bins, patches = plt.hist(vals,
                            30,
                            stacked=True,
                            density=False,
                            color=colors,
                            label=group_names)

# Decoration
plt.legend()  # picks up the per-dataset labels passed to hist()
plt.title(f"Stacked Histogram of ${x_var}$ colored by ${groupby_var}$",
          fontsize=22)
plt.xlabel(x_var)
plt.ylabel("Frequency")
# Label every third bin edge to keep the axis readable.
plt.xticks(ticks=bins[::3], labels=[round(b, 1) for b in bins[::3]])
plt.show()
      
     
 
 
 22、类别变量堆积直方图(Stacked Histogram for Categorical Variable)
该图展示给定类别变量的频率分布。
   
    - 
     
      
     
     
      
       # Import Data
      
     
df = pd.read_csv("./datasets/mpg_ggplot2.csv")

# Prepare data
x_var = 'manufacturer'
groupby_var = 'class'
# Encode the categories as integers ourselves (alphabetical order) so the
# bar positions and the tick labels are guaranteed to agree.
# NOTE(review): when raw strings are handed to hist(), Matplotlib appears to
# assign category positions in order of first appearance, which would not
# match alphabetically sorted labels - confirm against the category docs.
categories = np.unique(df[x_var]).tolist()
code_of = {name: code for code, name in enumerate(categories)}
df_agg = df.loc[:, [x_var, groupby_var]].groupby(groupby_var)
group_names = [name for name, _ in df_agg]
vals = [frame[x_var].map(code_of).tolist() for _, frame in df_agg]

# Draw
plt.figure(figsize=(10, 6), dpi=80)
# max(..., 1) avoids a ZeroDivisionError when there is a single group.
color_span = float(max(len(vals) - 1, 1))
colors = [plt.cm.Set1(i / color_span) for i in range(len(vals))]
# One unit-wide bin per category, centred on its integer code.
bin_edges = np.arange(len(categories) + 1) - 0.5
n, bins, patches = plt.hist(vals,
                            bin_edges,
                            stacked=True,
                            density=False,
                            color=colors,
                            label=group_names)

# Decoration
plt.legend()  # picks up the per-dataset labels passed to hist()
plt.title(f"Stacked Histogram of ${x_var}$ colored by ${groupby_var}$",
          fontsize=22)
plt.xlabel(x_var)
plt.ylabel("Frequency")
plt.ylim(0, 40)
# Exactly one tick per category. The original passed all bin edges (one more
# than the number of labels), which recent Matplotlib rejects.
plt.xticks(ticks=range(len(categories)),
           labels=categories,
           rotation=90,
           horizontalalignment='center')
plt.show()
      
     
了解更多直方图:
23、密度图(Density Plot)
该图展示连续变量的分布情况。
   
    - 
     
      
     
     
      
       # Import Data
      
     
df = pd.read_csv("./datasets/mpg_ggplot2.csv")

# Apply the theme before the figure is created.
# NOTE(review): seaborn themes work through Matplotlib rcParams, so they are
# expected to affect only figures/axes created afterwards - the original
# called this after drawing.
sns.set(style="whitegrid", font_scale=1.1)

# Draw Plot: one filled density curve of city mileage per cylinder count.
plt.figure(figsize=(10, 8), dpi=80)
cyl_palette = {4: "#01a2d9", 5: "#dc2624", 6: "#C89F91", 8: "#649E7D"}
for cyl, color in cyl_palette.items():
    sns.kdeplot(df.loc[df['cyl'] == cyl, "cty"],
                fill=True,  # `shade=` is the deprecated spelling of `fill=`
                color=color,
                label=f"Cyl={cyl}",
                alpha=.7)

# Decoration
plt.title('Density Plot of City Mileage by n_Cylinders', fontsize=18)
plt.legend()
plt.show()
      
     
 
 
 24、带直方图的密度图(Density Curves with Histogram)
   
    - 
     
      
     
     
      
       # Import Data
      
     
df = pd.read_csv("./datasets/mpg_ggplot2.csv")

# Apply the theme before the figure is created.
# NOTE(review): seaborn themes are expected to affect only figures created
# afterwards - the original called this after drawing.
sns.set(style="whitegrid", font_scale=1.1)

# Draw Plot: histogram + density curve of city mileage per vehicle class.
plt.figure(figsize=(10, 8), dpi=80)
# (value in the `class` column, colour, legend label)
# The minivan entry originally had its colour and label mixed up
# (color="g", label="#C89F91"), so the legend showed a hex code.
series = [
    ('compact', "#01a2d9", "Compact"),
    ('suv', "#dc2624", "SUV"),
    ('minivan', "#C89F91", "Minivan"),
]
for vehicle_class, color, label in series:
    # NOTE(review): distplot is deprecated in recent seaborn releases;
    # histplot(..., kde=True) is the documented successor - migrate when the
    # seaborn version in use is upgraded.
    sns.distplot(df.loc[df['class'] == vehicle_class, "cty"],
                 color=color,
                 label=label,
                 hist_kws={'alpha': .7},
                 kde_kws={'linewidth': 3})
plt.ylim(0, 0.35)

# Decoration
plt.title('Density Plot of City Mileage by Vehicle Type', fontsize=18)
plt.legend()
plt.show()
      
     
更多核密度图:
25、山峰叠峦图(Joy Plot)
该图展示大量分组之间的关系,比heatmap形象。
   
    - 
     
      
     
     
      
       !pip install joypy#安装依赖包
      
     
# Draws one kernel density curve per group (R has the equivalent ggjoy).
import joypy

# Import Data
mpg = pd.read_csv("./datasets/mpg_ggplot2.csv")

# Draw Plot
# joyplot returns its own figure (sized through `figsize`), so no
# plt.figure() call is made first - the original one only left an empty
# extra figure behind.
fig, axes = joypy.joyplot(mpg,
                          column=['hwy', 'cty'],
                          by="class",
                          ylim='own',
                          colormap=plt.cm.Set1,
                          figsize=(10, 6))

# Decoration
plt.title('Joy Plot of City and Highway Mileage by Class', fontsize=18)
plt.show()
      
     
 
 
 26、分布点图(Distributed Dot Plot)
分布点图显示了按组划分的点的单变量分布。点的颜色越深,该区域中数据点的集中度越高(半透明的点相互重叠)。通过对中位数使用不同的颜色,各组的实际位置一目了然。
   
    - 
     
      
     
     
      
       import matplotlib.patches as mpatches
      
     
# Prepare Data
df_raw = pd.read_csv("./datasets/mpg_ggplot2.csv")
cyl_colors = {4: 'tab:red', 5: 'tab:green', 6: 'tab:blue', 8: 'tab:orange'}
df_raw['cyl_color'] = df_raw.cyl.map(cyl_colors)

# Mean and median city mileage by make.
# Aggregate only the numeric `cty` column: calling mean()/median() on a
# frame that still contains the string `manufacturer` column raises a
# TypeError on pandas >= 2.
cty_by_make = df_raw.groupby('manufacturer')[['cty']]
df = cty_by_make.mean()
df.sort_values('cty', ascending=False, inplace=True)
df.reset_index(inplace=True)  # columns: manufacturer, cty (mean)
df_median = cty_by_make.median()  # indexed by manufacturer

# Draw horizontal lines, one guide line per make
fig, ax = plt.subplots(figsize=(11, 7), dpi=80)
ax.hlines(y=df.index,
          xmin=0,
          xmax=40,
          color='#01a2d9',
          alpha=0.5,
          linewidth=.5,
          linestyles='dashdot')

# Draw the Dots
for i, make in enumerate(df.manufacturer):
    df_make = df_raw.loc[df_raw.manufacturer == make, :]
    # Every observation of this make, semi-transparent so overlaps darken.
    ax.scatter(y=np.repeat(i, df_make.shape[0]),
               x='cty',
               data=df_make,
               s=75,
               edgecolors='#01a2d9',
               c='w',
               alpha=0.5)
    # The median of this make, highlighted in red.
    ax.scatter(y=i,
               x='cty',
               data=df_median.loc[df_median.index == make, :],
               s=75,
               c='#dc2624')

# Annotate
# Raw string: the mathtext spacing commands (\; and \:) are not valid Python
# escapes and trigger warnings in a normal string literal.
ax.text(33,
        13,
        r"$red \; dots \; are \; the \: median$",
        fontdict={'size': 12},
        color='#dc2624')

# Decorations
# Empty line plot used purely as a legend handle for the median marker.
red_patch = plt.plot([], [],
                     marker="o",
                     ms=10,
                     ls="",
                     mec=None,
                     color='#dc2624',
                     label="Median")
plt.legend(handles=red_patch)
ax.set_title('Distribution of City Mileage by Make', fontdict={'size': 18})
ax.set_xlabel('Miles Per Gallon (City)')
ax.set_yticks(df.index)
ax.set_yticklabels(df.manufacturer.str.title(),
                   fontdict={'horizontalalignment': 'right'})
ax.set_xlim(1, 40)
for side in ("top", "bottom", "right", "left"):
    plt.gca().spines[side].set_visible(False)
plt.grid(axis='both', alpha=.4, linewidth=.1)
plt.show()
      
     
 
 
 27、箱图(boxplot)
很好的展示数据的分布情况~
   
    - 
     
      
     
     
      
       # Import Data
      
     
df = pd.read_csv("./datasets/mpg_ggplot2.csv")

# Draw Plot: highway mileage distribution for each vehicle class.
plt.figure(figsize=(10, 6), dpi=80)
box_options = {
    'x': 'class',
    'y': 'hwy',
    'data': df,
    'notch': False,
    'palette': "Set1",
}
sns.boxplot(**box_options)
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
      
       # Add N Obs inside boxplot (optional)
      
     
def add_n_obs(df, group_col, y, ax=None):
    """Write the number of observations just above each box's median.

    Args:
        df: DataFrame the box plot was drawn from.
        group_col: Column that defines the boxes (the x-axis categories).
        y: Numeric column shown on the y-axis.
        ax: Axes holding the box plot; defaults to the current axes.

    The count and median for each box are looked up by its tick label text.
    The original paired counts (in sorted groupby order) with tick labels
    (in plot order) by position, which attaches counts to the wrong boxes
    whenever the two orders differ. Tick labels with no matching group are
    skipped.
    """
    if ax is None:
        ax = plt.gca()

    grouped = df.groupby(group_col)[y]
    # Key by str(...) because tick labels are always text, even when the
    # group values themselves are numbers.
    medians = {str(key): value for key, value in grouped.median().items()}
    counts = {str(key): value for key, value in grouped.size().items()}

    for x, tick in enumerate(ax.get_xticklabels()):
        label = tick.get_text()
        if label not in counts:
            continue
        ax.text(x,
                medians[label] * 1.01,  # nudge the text just above the median
                f"#obs : {counts[label]}",
                horizontalalignment='center',
                fontdict={'size': 12},
                color='black')
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
       
      
     
# Annotate every box with its observation count.
add_n_obs(df, 'class', 'hwy')

# Decoration
sns.set(font_scale=1.1, style="whitegrid")
plt.title('Box Plot of Highway Mileage by Vehicle Class', fontsize=16)
plt.ylim(10, 40)
plt.show()
      
     
 
 
 28、箱图结合点图(Dot + Box Plot)
该图展示箱图及箱图绘制所用的详细点。
   
    - 
     
      
     
     
      
       # Import Data
      
     
df = pd.read_csv("./datasets/mpg_ggplot2.csv")

# Draw Plot: boxes split by cylinder count, raw observations drawn on top.
plt.figure(figsize=(13, 10), dpi=80)
shared_axes = {'x': 'class', 'y': 'hwy', 'data': df}
sns.boxplot(hue='cyl', palette="Set1", **shared_axes)
plt.legend(loc=9)
sns.stripplot(color='#dc2624', size=5, jitter=1, **shared_axes)

# Light vertical separators halfway between neighbouring classes.
class_count = len(df['class'].unique())
for boundary in (position + .5 for position in range(class_count - 1)):
    plt.vlines(boundary,
               10,
               45,
               linestyles='solid',
               colors='gray',
               alpha=0.2)

# Decoration
plt.title('Box Plot of Highway Mileage by Vehicle Class', fontsize=18)

plt.show()
      
     
更多关于箱图:
29、小提琴图(Violin Plot)
比箱图更美观,但不如箱图常用;小提琴在某一取值处的宽度由该处数据的分布密度(出现频次)决定。
   
    - 
     
      
     
     
      
       # Import Data
      
     
df = pd.read_csv("./datasets/mpg_ggplot2.csv")

# Draw Plot: one violin of highway mileage per vehicle class, with the
# quartiles drawn inside each violin.
plt.figure(figsize=(13, 10), dpi=80)
violin_options = {
    'x': 'class',
    'y': 'hwy',
    'data': df,
    'scale': 'width',
    'palette': 'Set1',
    'inner': 'quartile',
}
sns.violinplot(**violin_options)

# Decoration
plt.title('Violin Plot of Highway Mileage by Vehicle Class', fontsize=18)
plt.show()
      
     
 
 
 30、金字塔图(Population Pyramid)
可以理解为一种排过序的分组水平柱状图(barplot),能很好地展示不同分组之间的差异,也可用于可视化逐级过滤(漏斗)过程中的每个阶段。
   
    - 
     
      
     
     
      
       # Read data
      
     
df = pd.read_csv("./datasets/email_campaign_funnel.csv")

# Draw Plot: one horizontal bar series per group, drawn on shared axes.
plt.figure(figsize=(12, 8), dpi=80)
group_col = 'Gender'
order_of_bars = df.Stage.unique()[::-1]  # stages in reverse order of appearance
groups = df[group_col].unique()
# Spread the groups evenly over the Set1 colormap.
colors = [
    plt.cm.Set1(position / float(len(groups) - 1))
    for position in range(len(groups))
]

for c, group in zip(colors, groups):
    group_rows = df.loc[df[group_col] == group, :]
    sns.barplot(x='Users',
                y='Stage',
                data=group_rows,
                order=order_of_bars,
                color=c,
                label=group)

# Decorations
plt.xlabel("$Users$")
plt.ylabel("Stage of Purchase")
plt.yticks(fontsize=12)
plt.title("Population Pyramid of the Marketing Funnel", fontsize=18)
plt.legend()
plt.show()
      
     
 
 
 31、分类图(Categorical Plots)
展示彼此相关多个(>=2个)分类变量的计数分布,其实就是seaborn的分面图。
   
    - 
     
      
     
     
      
       # Load Dataset
      
     
titanic = pd.read_csv('./datasets/titanic.csv')

# Plot: count of each `alive` value, one facet per deck (rows without a deck
# are dropped).
# `x` is passed by keyword: on seaborn >= 0.12 the first positional argument
# of catplot is `data`, so the original positional "alive" collides with
# `data=`.
g = sns.catplot(x="alive",
                col="deck",
                col_wrap=4,
                data=titanic[titanic.deck.notnull()],
                kind="count",
                height=3.5,
                aspect=.8,
                palette='Set1')

plt.show()
      
     
 
 
 
   
    - 
     
      
     
     
      
       # Plot
      
     
# Horizontal violins of age per embarkation town, split by sex, with one
# facet per passenger class. Rows without an embarkation town are dropped.
with_town = titanic[titanic.embark_town.notnull()]
violin_facets = {
    'x': "age",
    'y': "embark_town",
    'hue': "sex",
    'col': "class",
    'data': with_town,
    'orient': "h",
    'height': 5,
    'aspect': 1,
    'palette': "Set1",
    'kind': "violin",
    'dodge': True,
    'cut': 0,
    'bw': .2,
}
sns.catplot(**violin_facets)
      
     
更多关于分面图:

有用请“点赞”“在看”“分享”


转载:https://blog.csdn.net/qq_21478261/article/details/113750415
查看评论
					 
					