1、问题描述:
使用 Stacking 将 5 个弱学习器融合为一个强学习器,实现对 DDoS 攻击的识别。
# coding=utf8
import pandas as pd #导入pandas包
import numpy as np
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.datasets.samples_generator import make_blobs
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
# ---- Build the data set ----
# Load the CSV into a DataFrame; low_memory=False makes pandas read the file
# in one pass instead of in chunks.
data = pd.read_csv("E:/电科/CIC-IDS-2017/MachineLearningCVE/trainddos_0910.csv",low_memory=False)
print(data)

# Convert to a NumPy matrix so rows and columns can be sliced by position.
data = np.array(data)

# Split used below:
#   rows    0..199999       -> training set
#   rows    200000..225699  -> test set
#   columns 0..19           -> features
#   column  78              -> class label
# NOTE(review): assumes the CSV has at least 79 columns and 225700 rows, and
# that the first 20 columns are finite numeric values -- confirm against the
# file, since the estimators reject NaN/inf.
X_train = data[0:200000, 0:20]   # training feature matrix
Y_train_O = data[0:200000, 78]   # raw (un-encoded) training labels
print("大训练集X_train:",X_train.shape)
print("大训练集Y_train:",Y_train_O.shape)

# Map the raw labels (presumably "BENIGN" / "DDoS") to integer class codes.
label_encoder = LabelEncoder()
Y_train = label_encoder.fit_transform(Y_train_O)

# ---- Test set ----
X_test = data[200000:225700, 0:20]   # test feature matrix
Y_test_O = data[200000:225700, 78]   # raw (un-encoded) test labels
# Encode the test labels with the encoder fitted on the training labels, so
# that Y_test uses the same integer codes as the model predictions and can be
# compared against them directly. transform() raises ValueError if the test
# set contains a label that never occurred in the training set.
Y_test = label_encoder.transform(Y_test_O)
print("大测试集X_test:",X_test.shape)
print("大测试集Y_test:",Y_test.shape)
# ---- Base (first-level) models used for the blend ----
# Two forest families, each with both split criteria, followed by one
# gradient-boosting model. The order of the list fixes the column order of
# the blended feature matrices below.
clfs = [
    forest_cls(n_estimators=5, n_jobs=-1, criterion=split_rule)
    for forest_cls in (RandomForestClassifier, ExtraTreesClassifier)
    for split_rule in ('gini', 'entropy')
]
clfs.append(
    GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5)
)

# Zero-filled holders for the second-level data sets: one row per sample and
# one column per base model.
n_base_models = len(clfs)
blend_train = np.zeros((len(X_train), n_base_models))
print("新训练集blend_train大小:",blend_train.shape)
blend_test = np.zeros((len(X_test), n_base_models))
print("新测试集blend_test大小:",blend_test.shape)
# ---- K-fold stacking ----
# For every base model j:
#   * each fold trains the model on the remaining folds and predicts the
#     held-out fold; those out-of-fold predictions fill column j of
#     blend_train (so every training row gets exactly one prediction);
#   * the same fitted model also predicts the full test set, giving one
#     column per fold in blend_test_j; the row-wise mean of those columns
#     becomes column j of blend_test.
n_folds = 5
skf = StratifiedKFold(n_folds,shuffle=True)
print(skf)
for j, clf in enumerate(clfs):
    print("模型",j, clf)
    # Per-fold predictions of model j on the full test set (b1..bK).
    blend_test_j = np.zeros((X_test.shape[0], n_folds))
    print("储存【b1,b2,b3,b4,b5】的blend_test_j大小:",blend_test_j.shape)
    # The fold index comes from enumerate, so it always stays within
    # [0, n_folds) and restarts at 0 for every model; no manual counter or
    # hard-coded modulus is needed when n_folds changes.
    for i, (train_index, test_index) in enumerate(skf.split(X_train, Y_train)):
        print("第",i,"折")
        print("小TRAIN行编号:", train_index, "小TEST行编号:", test_index)
        # Inner train / held-out split for this fold.
        x_smatrain, y_smatrain = X_train[train_index], Y_train[train_index]
        x_smatest = X_train[test_index]
        print("生成的小train大小:",x_smatrain.shape)
        print("生成的小test大小:",x_smatest.shape )
        clf.fit(x_smatrain, y_smatrain)
        # Predict the held-out fold once and reuse the result for both the
        # assignment and the diagnostics.
        fold_pred = clf.predict(x_smatest)
        blend_train[test_index, j] = fold_pred
        print("小test的预测结果矩阵a",i,"大小:",fold_pred.shape)
        print("a", i, fold_pred)
        print("模型",j,"对小test的预测结果矩阵blend_train 【A1,A2,A3,A4,A5】:", blend_train)
        # Predict the full test set once with this fold's model.
        test_pred = clf.predict(X_test)
        blend_test_j[:, i] = test_pred
        print("b",i,test_pred)
        print("模型",j,"对大test的预测结果矩阵blend_test_j [b1,b2,b3,b4,b5]:",blend_test_j)
    # Average the per-fold test predictions into the single column B_j.
    blend_test[:, j] = np.mean(blend_test_j, axis = 1)
    print("大test的预测结果矩阵[B1,B2,B3,B4,B5]:",blend_test)
# ---- Second-level (meta) learner ----
# Fit a logistic regression on the out-of-fold predictions of the base models
# (blend_train) against the encoded training labels. fit() returns the
# estimator itself, so bclf is the fitted model.
bclf = LogisticRegression().fit(blend_train, Y_train)

# Predict the test set from the averaged base-model outputs and score it.
# NOTE: accuracy_score compares labels element-wise, so Y_test has to be in
# the same encoding as the integer predictions.
Y_test_predict = bclf.predict(blend_test)
score = metrics.accuracy_score(Y_test, Y_test_predict)
print("Accuracy = %s" % score)
转载:https://blog.csdn.net/Longtermevolution/article/details/101029002
查看评论