赛题以预测用户贷款是否违约为任务,该数据来自某信贷平台的贷款记录,总数据量超过120w,包含47列变量信息,其中15列为匿名变量。从中抽取80万条作为训练集,20万条作为测试集A,20万条作为测试集B,同时会对employmentTitle、purpose、postCode和title等信息进行脱敏。
详细介绍请移步天池:贷款违约预测
详细思路:
查看缺失值情况
按照平均数填充数值型特征
按照众数填充类别型特征
通过对数函数映射到指数宽度分箱
高维类别特征需要进行转换
选择K个最好的特征,返回选择特征后的数据
选择XGB开始训练
调参经历:
XGB稍微慢,但是效果比LGB好点
分数记录:
特征工程和训练代码如下:
import datetime
import os
import warnings
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from utils.read_write import reduce_mem_usage
warnings.filterwarnings('ignore')
os.chdir(r'E:\项目文件\贷款违约')
data_test_a = pd.read_csv('testA.csv')
data = pd.read_csv('train.csv')
data_train = reduce_mem_usage(data)
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
category_fea = list(filter(lambda x: x not in numerical_fea, list(data_train.columns)))
label = 'isDefault'
numerical_fea.remove(label)
# 查看缺失值情况
data_train.isnull().sum()
# 按照平均数填充数值型特征
data_train[numerical_fea] = data_train[numerical_fea].fillna(data_train[numerical_fea].median())
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(data_train[numerical_fea].median())
# 按照众数填充类别型特征
data_train[category_fea] = data_train[category_fea].fillna(data_train[category_fea].mode())
data_test_a[category_fea] = data_test_a[category_fea].fillna(data_train[category_fea].mode())
data_train.isnull().sum()
# 转化成时间格式
for data in [data_train, data_test_a]:
data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
# 构造时间特征
data['issueDateDT'] = data['issueDate'].apply(lambda x: x - startdate).dt.days
data_train['employmentLength'].value_counts(dropna=False).sort_index()
def employmentLength_to_int(s):
if pd.isnull(s):
return s
else:
return np.int8(s.split()[0])
for data in [data_train, data_test_a]:
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
data['employmentLength'].value_counts(dropna=False).sort_index()
data_train['earliesCreditLine'].sample(5)
for data in [data_train, data_test_a]:
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
# 部分类别特征
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode',
'regionCode', 'applicationType', 'initialListStatus', 'title', 'policyCode']
for f in cate_features:
print(f, '类型数:', data[f].nunique())
for data in [data_train, data_test_a]:
data['grade'] = data['grade'].map({
'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7})
# 类型数在2之上,又不是高维稀疏的,且纯分类特征
for data in [data_train, data_test_a]:
data = pd.get_dummies(data, columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'],
drop_first=True)
# 通过除法映射到间隔均匀的分箱中,每个分箱的取值范围都是loanAmnt/1000
data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000)
## 通过对数函数映射到指数宽度分箱
data['loanAmnt_bin2'] = np.floor(np.log10(data['loanAmnt']))
data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)
for col in ['grade', 'subGrade']:
temp_dict = data_train.groupby([col])['isDefault'].agg(['mean']).reset_index().rename(
columns={
'mean': col + '_target_mean'})
temp_dict.index = temp_dict[col].values
temp_dict = temp_dict[col + '_target_mean'].to_dict()
data_train[col + '_target_mean'] = data_train[col].map(temp_dict)
data_test_a[col + '_target_mean'] = data_test_a[col].map(temp_dict)
# 其他衍生变量 mean 和 std
for df in [data_train, data_test_a]:
for item in ['n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']:
df['grade_to_mean_' + item] = df['grade'] / df.groupby([item])['grade'].transform('mean')
df['grade_to_std_' + item] = df['grade'] / df.groupby([item])['grade'].transform('std')
# label-encode:subGrade,postCode,title
# 高维类别特征需要进行转换
for col in tqdm(['employmentTitle', 'postCode', 'title', 'subGrade']):
le = LabelEncoder()
le.fit(list(data_train[col].astype(str).values) + list(data_test_a[col].astype(str).values))
data_train[col] = le.transform(list(data_train[col].astype(str).values))
data_test_a[col] = le.transform(list(data_test_a[col].astype(str).values))
print('Label Encoding 完成')
features = [f for f in data_train.columns if f not in ['id', 'issueDate', 'isDefault'] and '_outliers' not in f]
x_train = data_train[features]
x_test = data_test_a[features]
y_train = data_train['isDefault']
#
# 举例归一化过程
# 伪代码
for fea in ['employmentTitle', 'postCode', 'title']:
data[fea] = ((data[fea] - np.min(data[fea])) / (np.max(data[fea]) - np.min(data[fea])))
# 选择K个最好的特征,返回选择特征后的数据
# 第一个参数为计算评估特征是否好的函数,该函数输入特征矩阵和目标向量,
# 输出二元组(评分,P值)的数组,数组第i项为第i个特征的评分和P值。在此定义为计算相关系数
# 参数k为选择的特征个数
# x_train.to_csv('x_train.csv', index=False)
# x_test.to_csv('x_test.csv', index=False)
# y_train.to_csv('y_train.csv', index=False)
# x_train = pdReadCsv('x_train.csv', ',')
# x_test = pdReadCsv('x_test.csv', ',')
# y_train = pdReadCsv('y_train.csv', ',')
# x_train = SelectKBest(k=35).fit_transform(x_train, y_train)
# # 当然也可以直接看图
# data_numeric = data_train[numerical_fea]
# correlation = data_numeric.corr()
#
# f, ax = plt.subplots(figsize=(7, 7))
# plt.title('Correlation of Numeric Features with Price', y=1, size=16)
# sns.heatmap(correlation, square=True, vmax=0.8)
def cv_model(clf, train_x, train_y, test_x, clf_name):
folds = 5
seed = 2020
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
train = np.zeros(train_x.shape[0])
test = np.zeros(test_x.shape[0])
for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
print('************************************ {} ************************************'.format(str(i + 1)))
trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], \
train_y[valid_index]
if clf_name == "lgb":
train_matrix = clf.Dataset(trn_x, label=trn_y)
valid_matrix = clf.Dataset(val_x, label=val_y)
params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': 'auc',
'min_child_weight': 5,
'num_leaves': 25,
'lambda_l2': 10,
'feature_fraction': 0.8,
'bagging_fraction': 0.8,
'bagging_freq': 4,
'learning_rate': 0.1,
'seed': 2020,
'nthread': 28,
'verbose': -1,
}
model = clf.train(params, train_matrix, 5000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,
early_stopping_rounds=200)
val_pred = model.predict(val_x, num_iteration=model.best_iteration)
test_pred = model.predict(test_x, num_iteration=model.best_iteration)
if clf_name == "xgb":
train_matrix = clf.DMatrix(trn_x, label=trn_y)
valid_matrix = clf.DMatrix(val_x, label=val_y)
test_matrix = clf.DMatrix(test_x)
params = {
'booster': 'gbtree',
'objective': 'binary:logistic',
'eval_metric': 'auc',
'gamma': 1,
'min_child_weight': 1.5,
'max_depth': 5,
'lambda': 10,
'subsample': 0.7,
'colsample_bytree': 0.7,
'colsample_bylevel': 0.7,
'eta': 0.04,
'tree_method': 'exact',
'seed': 2020,
'nthread': 36,
}
watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
model = clf.train(params, train_matrix, num_boost_round=5000, evals=watchlist, verbose_eval=200,
early_stopping_rounds=200)
val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
if clf_name == "cat":
params = {
'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
model = clf(iterations=2000, **params)
model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
cat_features=[], use_best_model=True, verbose=500)
val_pred = model.predict(val_x)
test_pred = model.predict(test_x)
train[valid_index] = val_pred
test = test_pred / kf.n_splits
print(roc_auc_score(val_y, val_pred))
return train, test
def lgb_model(x_train, y_train, x_test):
lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
return lgb_train, lgb_test
def xgb_model(x_train, y_train, x_test):
xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
return xgb_train, xgb_test
def cat_model(x_train, y_train, x_test):
cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
return cat_train, cat_test
# lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)
# print(lgb_test)
# np.savetxt('X_test_pre_lgb.csv', lgb_test)
xgb_train, xgb_test = xgb_model(x_train, y_train, x_test)
print(xgb_test)
np.savetxt('X_test_pre_xgb.csv', xgb_test)
# cat_train, cat_test = cat_model(x_train, y_train, x_test)
# print(cat_test)
# np.savetxt('X_test_pre_cat.csv', cat_test)
# testA_result = pd.read_csv('../testA_result.csv')
#
# roc_auc_score(testA_result['isDefault'].values, lgb_test)
减少内存的方法如下:
# reduce_mem_usage 函数通过调整数据类型,帮助我们减少数据在内存中占用的空间
def reduce_mem_usage(df):
""" iterate through all the columns of a dataframe and modify the data type
to reduce memory usage.
"""
start_mem = df.memory_usage().sum()
print('Memory usage of dataframe is {:.2f} kB'.format(start_mem))
for col in df.columns:
col_type = df[col].dtype
if col_type != object:
c_min = df[col].min()
c_max = df[col].max()
if str(col_type)[:3] == 'int':
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
df[col] = df[col].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
df[col] = df[col].astype(np.int16)
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
df[col] = df[col].astype(np.int32)
elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
df[col] = df[col].astype(np.int64)
else:
if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
df[col] = df[col].astype(np.float16)
elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
df[col] = df[col].astype(np.float32)
else:
df[col] = df[col].astype(np.float64)
else:
df[col] = df[col].astype('category')
end_mem = df.memory_usage().sum()
print('Memory usage after optimization is: {:.2f} kB'.format(end_mem))
print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
return df
转载:https://blog.csdn.net/qq_30803353/article/details/109556415
查看评论