小言_互联网的博客

stacking-具体代码

329人阅读  评论(0)

1、问题描述:

使用 stacking 方法将5个弱学习器融合成一个强学习器,实现对 DDoS 攻击的识别。

# coding=utf8
import pandas as pd                         #导入pandas包
import numpy as np
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.datasets.samples_generator import make_blobs
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder



# --- Build the data set ---
# Read the training CSV into a DataFrame and echo it for inspection.
data = pd.read_csv(
    "E:/电科/CIC-IDS-2017/MachineLearningCVE/trainddos_0910.csv",
    low_memory=False,
)
print(data)

# Convert the DataFrame to a NumPy matrix so rows/columns can be sliced
# by position below.
data = np.array(data)

# The first 200000 rows form the (large) training set:
# columns 0-19 are used as features, column 78 holds the label.
X_train = data[:200000, :20]
Y_train_O = data[:200000, 78]
print("大训练集X_train:", X_train.shape)
print("大训练集Y_train:", Y_train_O.shape)

# Map the raw label values to integer class ids; the fitted encoder is kept
# so the same mapping can be reused later.
label_encoder = LabelEncoder()
Y_train = label_encoder.fit_transform(Y_train_O)


# --- Test set ---
# X_train / Y_train are the large training set; X_test / Y_test are the
# large hold-out test set, taken from rows 200000-225699 of the same matrix.
# Columns 0-19 are the features, column 78 holds the raw label.
X_test = data[200000:225700, 0:20]
Y_test_O = data[200000:225700, 78]

# Encode the test labels with the encoder that was fitted on the training
# labels. Without this, Y_test keeps the raw label values while the model
# predicts integer class ids, so the final accuracy comparison is between
# two different label spaces. transform() raises ValueError if the test set
# contains a label that never occurred in the training labels.
Y_test = label_encoder.transform(Y_test_O)
print("大测试集X_test:",X_test.shape)
print("大测试集Y_test:",Y_test.shape)

# --- Base learners combined by the stacking ensemble ---
# The four tree ensembles share the same size / parallelism settings and
# differ only in estimator type and split criterion.
forest_params = dict(n_estimators=5, n_jobs=-1)
clfs = [
    RandomForestClassifier(criterion='gini', **forest_params),
    RandomForestClassifier(criterion='entropy', **forest_params),
    ExtraTreesClassifier(criterion='gini', **forest_params),
    ExtraTreesClassifier(criterion='entropy', **forest_params),
    GradientBoostingClassifier(
        learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5
    ),
]

# Zero-filled matrices that will hold the second-level features:
# one row per sample, one column per base learner.
n_models = len(clfs)
blend_train = np.zeros((X_train.shape[0], n_models))
print("新训练集blend_train大小:", blend_train.shape)
blend_test = np.zeros((X_test.shape[0], n_models))
print("新测试集blend_test大小:", blend_test.shape)

# --- K-fold stacking ---
# Number of folds; every fold-dependent size and index below derives from it.
n_folds = 5

# Stratified K-fold splitter. shuffle=True is used without a random_state,
# so the fold assignment differs from run to run.
skf = StratifiedKFold(n_folds,shuffle=True)
print(skf)

for j, clf in enumerate(clfs):
    # Train each base learner in turn; j selects its column in the blend
    # matrices.
    print("模型",j, clf)
    # blend_test_j collects this learner's predictions on the large test set,
    # one column per fold ([b1, ..., b_n_folds]); they are averaged afterwards.
    blend_test_j = np.zeros((X_test.shape[0], n_folds))
    print("储存【b1,b2,b3,b4,b5】的blend_test_j大小:",blend_test_j.shape)

    # enumerate() supplies the fold index, so it restarts at 0 for every
    # learner and always stays within the n_folds columns of blend_test_j.
    for i, (train_index, test_index) in enumerate(skf.split(X_train, Y_train)):
        # Fold i is held out for prediction; the remaining folds train the
        # model. The hold-out predictions become the new feature for those rows.
        print("第",i,"折")
        print("小TRAIN行编号:", train_index, "小TEST行编号:", test_index)
        x_smatrain, y_smatrain = X_train[train_index], Y_train[train_index]
        x_smatest = X_train[test_index]

        print("生成的小train大小:",x_smatrain.shape)
        print("生成的小test大小:",x_smatest.shape   )

        clf.fit(x_smatrain, y_smatrain)

        # Out-of-fold predictions: predict once and reuse the result for both
        # the blend matrix and the log output. They are written to the rows
        # of the held-out samples, column j.
        fold_pred = clf.predict(x_smatest)
        blend_train[test_index, j] = fold_pred
        print("小test的预测结果矩阵a",i,"大小:",fold_pred.shape)
        print("a", i, fold_pred)
        print("模型",j,"对小test的预测结果矩阵blend_train 【A1,A2,A3,A4,A5】:", blend_train)

        # Predictions of this fold's model on the large test set (column i).
        test_pred = clf.predict(X_test)
        blend_test_j[:, i] = test_pred
        print("b",i,test_pred)
        print("模型",j,"对大test的预测结果矩阵blend_test_j [b1,b2,b3,b4,b5]:",blend_test_j)

    # Average the per-fold test predictions into this learner's column of the
    # second-level test matrix.
    blend_test[:, j] = np.mean(blend_test_j, axis =  1)


print("大test的预测结果矩阵[B1,B2,B3,B4,B5]:",blend_test)

# Second-level (meta) learner: logistic regression fitted on the base
# learners' out-of-fold predictions (blend_train) against the training labels.
bclf = LogisticRegression().fit(blend_train, Y_train)

# Use the stacked model to predict the test set from the averaged base-learner
# predictions, then report accuracy against Y_test.
Y_test_predict = bclf.predict(blend_test)
score = metrics.accuracy_score(Y_test, Y_test_predict)
print("Accuracy = %s" % score)

 


转载:https://blog.csdn.net/Longtermevolution/article/details/101029002
查看评论
* 以上用户言论只代表其个人观点,不代表本网站的观点或立场