Reference: TensorFlow official guide, "Classification on imbalanced data" (https://www.tensorflow.org/tutorials/structured_data/imbalanced_data)
1. Classification on imbalanced data
Following the official TensorFlow quick guide, this article shows how to classify a highly imbalanced dataset, using the Kaggle credit card fraud detection dataset as the running example: a binary classification dataset with 492 positive examples and 284,315 negative examples.
Clearly, accuracy alone cannot evaluate a model here: predicting every sample as negative already yields 99.83% accuracy. This article introduces several techniques for handling class-imbalanced data and evaluates them with multiple metrics, such as Recall, Precision, AUC, the ROC curve, and the confusion matrix.
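To make the point concrete, a minimal sketch (class counts taken from the dataset statistics printed below): a predictor that always outputs "negative" reaches 99.83% accuracy yet has zero recall.
import numpy as np
from sklearn.metrics import accuracy_score, recall_score

y_true = np.array([1] * 492 + [0] * 284315)  # the dataset's class counts
y_pred = np.zeros_like(y_true)               # always predict "negative"
print(accuracy_score(y_true, y_pred))        # ~0.9983
print(recall_score(y_true, y_pred))          # 0.0: every fraud case is missed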
The full code is available on GitHub.
Initial bias
The output layer has a single neuron with a sigmoid (logistic) activation, so the model outputs the class likelihood
$$p = \sigma(w \cdot x + b) = \frac{1}{1 + e^{-(w \cdot x + b)}}$$
Assume the initial weights $w$ are close to 0; the output probability is then dominated by the bias $b$. For a balanced dataset, initializing $b = 0$ gives each class a probability of 1/2, as expected.
For a class-imbalanced dataset, if the model's initial predictions already lean toward the majority class, then during training the model concentrates on correcting its mistakes on the minority class, i.e. on learning the distinguishing features of the rare class, rather than spending the first few epochs merely learning that positives are unlikely.
For example, on the imbalanced dataset above, if the initial predicted probability of the negative class is 99.83%, then during training the model focuses on correcting actual positives that were predicted as negative, i.e. on improving recall.
How should the initial bias be set? At initialization we want the predicted class probabilities to be proportional to the class counts, still assuming $w \approx 0$. If the fraction of positive examples in the training set is
$$p_0 = \frac{pos}{pos + neg}$$
then the initial bias of the output neuron (binary classification needs only one neuron) should satisfy $p_0 = \sigma(b_0)$, i.e.
$$b_0 = \log\frac{p_0}{1 - p_0} = \log\frac{pos}{neg}$$
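As a quick sanity check, a minimal sketch (using this dataset's class counts) of the bias value and the probability it induces:
import numpy as np

pos, neg = 492, 284315       # class counts from the dataset
b0 = np.log(pos / neg)       # initial output-layer bias
p0 = 1 / (1 + np.exp(-b0))   # sigmoid(b0)
print(b0, p0)                # ~-6.36, ~0.0017 = pos / (pos + neg)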
Class Weight
Weight each example according to the size of its class: the smaller the class, the larger the weight, so misclassifying a rare example incurs a larger loss and the model pays more attention to high-weight examples during training.
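A minimal numpy sketch of how such weights enter the loss; weighted_bce is a hypothetical helper, not a Keras API, but fit(class_weight=...) applies the same per-example scaling internally:
import numpy as np

def weighted_bce(y_true, y_prob, class_weight):
    """Binary cross-entropy with each example's loss scaled by its class weight."""
    w = np.where(y_true == 1, class_weight[1], class_weight[0])
    bce = -(y_true * np.log(y_prob) + (1 - y_true) * np.log(1 - y_prob))
    return np.mean(w * bce)

# A misclassified positive now costs class_weight[1] / class_weight[0]
# times as much as an equally misclassified negative.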
Oversampling
During training, oversample the minority class.
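The implementation below oversamples with tf.data; the same idea as a minimal numpy sketch (oversample is a hypothetical helper), resampling positives with replacement until the classes are balanced:
import numpy as np

def oversample(x, y, seed=0):
    """Repeat positive examples (with replacement) until pos == neg."""
    rng = np.random.default_rng(seed)
    pos_idx = np.flatnonzero(y == 1)
    neg_idx = np.flatnonzero(y == 0)
    pos_idx = rng.choice(pos_idx, size=len(neg_idx), replace=True)
    idx = rng.permutation(np.concatenate([pos_idx, neg_idx]))
    return x[idx], y[idx]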
TensorFlow implementation
Import dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
Load the dataset: drop/transform columns, split, and standardize
# dataset processing
df = pd.read_csv('https://storage.googleapis.com/download.tensorflow.org/data/creditcard.csv')
# You don't want the `Time` column.
df.pop('Time')
# The `Amount` column covers a huge range. Convert to log-space.
eps = 0.001 # 0 => 0.1¢
df['Amount'] = np.log(df.pop('Amount') + eps)
(neg, pos), total = np.bincount(df['Class']), df.shape[0]
print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format(total, pos, 100 * pos / total))
y = df.pop('Class').values
x = df.values
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2)
# standardize features; clip to [-5, 5] to bound extreme outliers
scaler = StandardScaler()
train_x = np.clip(scaler.fit_transform(train_x), -5, 5)
test_x = np.clip(scaler.transform(test_x), -5, 5)
"""
Examples:
Total: 284807
Positive: 492 (0.17% of total)
"""
Baseline DNN model
metrics = [
keras.metrics.TruePositives(name='tp'),
keras.metrics.FalsePositives(name='fp'),
keras.metrics.TrueNegatives(name='tn'),
keras.metrics.FalseNegatives(name='fn'),
keras.metrics.BinaryAccuracy(name='accuracy'),
keras.metrics.Precision(name='precision'),
keras.metrics.Recall(name='recall'),
keras.metrics.AUC(name='auc')
]
BATCH_SIZE = 2048
BUFFER_SIZE = 100000
EPOCHS = 30
model = keras.Sequential([
keras.layers.Dense(16, activation='relu', input_shape=(train_x.shape[-1], )),
keras.layers.Dropout(0.5),
keras.layers.Dense(1, activation='sigmoid')
])
# 'max' mode: stop when the monitored metric has stopped increasing
early_stopping = tf.keras.callbacks.EarlyStopping(
monitor='val_auc',
verbose=1,
patience=20,
mode='max',
restore_best_weights=True
)
model.compile(
optimizer=keras.optimizers.Adam(learning_rate=1e-3),
loss=keras.losses.BinaryCrossentropy(),
metrics=metrics
)
# checkpoint the initial weights
initial_weights = model.get_weights()
1. Training the baseline (zero-bias) model
print('Training Zero Bias Model')
model.set_weights(initial_weights)
model.layers[-1].bias.assign([0.0])
zero_bias_history = model.fit(
train_x,
train_y,
batch_size=BATCH_SIZE,
epochs=EPOCHS,
validation_data=(test_x, test_y),
verbose=0,
callbacks=[early_stopping],
)
zero_predictions = model.predict(test_x, batch_size=BATCH_SIZE)
2. Training the initial-bias model
print('Training Care Bias Model')
model.set_weights(initial_weights)
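# initialize the output bias to b0 = log(pos/neg) so the initial prediction matches the base rate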
model.layers[-1].bias.assign([np.log(pos/neg, dtype=np.float32)])
care_bias_history = model.fit(
train_x,
train_y,
batch_size=BATCH_SIZE,
epochs=EPOCHS,
validation_data=(test_x, test_y),
verbose=0,
callbacks=[early_stopping],
)
care_predictions = model.predict(test_x, batch_size=BATCH_SIZE)
3. Training the class-weighted model
print('Training Weighted Model')
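# scaling by total / 2 keeps the average example weight near 1, so the loss magnitude stays comparable to the unweighted run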
weight_for_0 = total / neg / 2.0
weight_for_1 = total / pos / 2.0
class_weight = {0: weight_for_0, 1: weight_for_1}
model.set_weights(initial_weights)
weighted_history = model.fit(
train_x,
train_y,
batch_size=BATCH_SIZE,
epochs=EPOCHS,
validation_data=(test_x, test_y),
verbose=0,
callbacks=[early_stopping],
class_weight=class_weight,
)
weighted_predictions = model.predict(test_x, batch_size=BATCH_SIZE)
4. Training the oversampling model
print('Training Oversampling Model')
train_ds = tf.data.Dataset.from_tensor_slices((train_x, train_y))
# Dataset.filter is slow here; split the arrays with numpy instead
# pos_ds = train_ds.filter(lambda x,y: y == 1).shuffle(100000).repeat()
# neg_ds = train_ds.filter(lambda x,y: y == 0).shuffle(100000).repeat()
pos_indices = train_y == 1
pos_x, neg_x = train_x[pos_indices], train_x[~pos_indices]
pos_y, neg_y = train_y[pos_indices], train_y[~pos_indices]
pos_ds = tf.data.Dataset.from_tensor_slices((pos_x, pos_y)).shuffle(BUFFER_SIZE).repeat()
neg_ds = tf.data.Dataset.from_tensor_slices((neg_x, neg_y)).shuffle(BUFFER_SIZE).repeat()
resampled_ds = tf.data.experimental.sample_from_datasets([pos_ds, neg_ds], weights=[0.5, 0.5])
resampled_ds = resampled_ds.batch(BATCH_SIZE).prefetch(2)
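# with 50/50 sampling, an "epoch" of 2*neg examples shows each negative example about once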
resampled_steps_per_epoch = np.ceil(2.0*neg/BATCH_SIZE)
test_ds = tf.data.Dataset.from_tensor_slices((test_x, test_y)).cache()
test_ds = test_ds.batch(BATCH_SIZE).prefetch(2)
model.set_weights(initial_weights)
oversampling_history = model.fit(
resampled_ds,
epochs=EPOCHS,
steps_per_epoch=resampled_steps_per_epoch,
validation_data=test_ds,
verbose=0,
callbacks=[early_stopping]
)
oversampling_predictions = model.predict(test_x, batch_size=BATCH_SIZE)
Summary of training curves across the models
def plot_metrics(histories, targets, predictions, labels, p=0.5):
"""打印训练过程和预测结果"""
# evaluation index
plt.figure(figsize=(16.0, 9.0))
for i, metric in enumerate(['loss', 'precision', 'recall', 'auc']):
plt.subplot(2,2,i+1)
for n, history in enumerate(histories):
f_plot = plt.semilogy if metric == 'loss' else plt.plot
f_plot(history.epoch, history.history[metric], color=colors[n], label='Train ' + labels[n])
f_plot(history.epoch, history.history['val_' + metric], color=colors[n], label='Val '+labels[n], linestyle="--")
plt.xlabel('Epoch')
plt.ylabel(metric.capitalize())
plt.legend()
# roc curve
plt.figure(figsize=(12.0, 8.0))
for i, (target, prediction) in enumerate(zip(targets, predictions)):
fp, tp, _ = roc_curve(target, prediction)
plt.plot(100*fp, 100*tp, label=labels[i], linewidth=1.5, color=colors[i])
plt.xlabel('False positives [%]')
plt.ylabel('True positives [%]')
plt.xlim([-0.5, 30])  # zoom into the low false-positive region
plt.ylim([80,100.5])
plt.legend()
plt.grid(True)
ax = plt.gca()
ax.set_aspect('equal')
# confusion matrix
plt.figure(figsize=(16.0, 9.0))
for i, (target, prediction) in enumerate(zip(targets, predictions)):
plt.subplot(2, 2, i+1)
cm = confusion_matrix(target, prediction > p)
sns.heatmap(cm, annot=True, fmt='d')
plt.title(labels[i] + ' Confusion Matrix @{:.2f}'.format(p))
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plot_metrics(
histories=[zero_bias_history, care_bias_history, weighted_history, oversampling_history],
targets=[test_y, test_y, test_y, test_y],
predictions=[zero_predictions, care_predictions, weighted_predictions, oversampling_predictions],
labels=['Zero Bias', 'Care Bias', 'Weighted', 'Oversampling']
)
Reposted from: https://blog.csdn.net/sinat_34072381/article/details/106221893