任务
通过某新区供水管网的历史压力数据、天气数据和供水管网互通图,预测未来某时间点的压力数据。
数据
主办方提供某新区供水管网数据,数据划分如下:
训练集:2018至2019年的30个压力监测点近两年的压力数据、2018年至2019年的天气数据,以及标明了30个压力监测点位置的供水管网互通图。
测试集:以下4段时间的每小时的压力数据、每天的天气数据,需要分别去预测对应日期每小时的压力数据。
具体数据字段描述如下:
(1)压力数据
(2)气象数据
总体思路如下:
- 把原本作为列名的小时(H0~H23)转换成 Hour 字段(宽表转长表),使每一行对应一个监测点在某天某小时的压力值
- 处理异常数据
- 获取日期和时间的特征
- 按月份划分训练集和测试集:每个待预测时段使用2020年中其前一个月的数据进行训练
- 最终使用监测点编号(MeasName)和时间特征(Day、Hour)去预测每个监测点的压力值
- 使用lgb进行预测
代码如下:
import gc
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
# Load the five competition CSV files in one pass. The files are read in the
# order listed and bound to the names on the left in that same order.
# `submit` is loaded here but is not referenced again in the visible script.
train2018, train2019, test2020, test, submit = (
    pd.read_csv(csv_name, engine='python')
    for csv_name in (
        'train_水压数据_2018.csv',
        'train_水压数据_2019.csv',
        'test_水压数据_2020.csv',
        'to_predict.csv',
        'submit.csv',
    )
)
'''
通过某新区供水管网的历史压力数据、天气数据和供水管网互通图,预测未来某时间点的压力数据。
训练集:2018至2019年的30个压力监测点近两年的压力数据、2018年至2019年的天气数据,以及标明了30个压力监测点位置的供水管网互通图。
测试集:以下4段时间的每小时的压力数据、每天的天气数据,需要分别去预测对应日期每小时的压力数据。
相邻的管道结合起来建模
'''
# Turn the hour column names (H0..H23) into an "Hour" field so that each row
# holds a single reading.
def reshape_data(df1):
    """Convert a wide hourly table into long format.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Must contain the columns ``Time``, ``MeasName`` and ``H0`` .. ``H23``.
        Any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``Time``, ``MeasName``, ``Hour`` (the
        original column label, e.g. ``'H7'``) and ``pressure``, sorted by
        ``Time`` then ``MeasName`` and carrying a fresh 0..n-1 index.
    """
    hour_columns = ['H' + str(h) for h in range(0, 24)]
    # melt() stacks the hour columns one after another (all H0 rows, then all
    # H1 rows, ...), which is the same order the hours are visited in.
    long_table = df1.melt(
        id_vars=['Time', 'MeasName'],
        value_vars=hour_columns,
        var_name='Hour',
        value_name='pressure',
    )
    long_table = long_table.sort_values(by=['Time', 'MeasName'])
    return long_table.reset_index(drop=True)
# Convert the three wide pressure tables to long format (one row per reading).
train2018, train2019, test2020 = (
    reshape_data(wide_table) for wide_table in (train2018, train2019, test2020)
)
# Add a parsed datetime column next to the textual "Time" column so the rows
# can later be filtered by date range.
for _frame in (train2018, train2019, test2020, test):
    _frame['Time_time'] = pd.to_datetime(_frame['Time'])
def abnormal(df):
    """Remove rows whose pressure reading is missing or out of range.

    A reading is kept only when it lies in the closed interval [0.1, 0.5].
    That single range check also covers the -99999 placeholder value that
    was previously special-cased, because it is below 0.1.

    The check is done with a positional boolean mask rather than by writing
    to individual index labels, so rows that happen to share an index label
    with an invalid row are not affected.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``pressure`` column. As before, the column is
        overwritten in place: rejected readings become NaN in ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        A new frame without the rejected rows. Note that ``dropna()`` also
        removes rows that have NaN in any other column.
    """
    pressure = df['pressure']
    # between() is inclusive on both ends and is False for NaN, so exactly
    # the values with 0.1 <= pressure <= 0.5 survive.
    in_range = pressure.between(0.1, 0.5)
    # to_numpy() makes the write-back positional (no index alignment).
    df['pressure'] = pressure.where(in_range).to_numpy()
    return df.dropna()
# Drop missing / out-of-range pressure readings from every table that
# carries a "pressure" column.
train2018, train2019, test2020 = (
    abnormal(long_table) for long_table in (train2018, train2019, test2020)
)
def feature1(df):
    """Derive integer features from the textual columns, in place.

    * ``Day``      - new column: the last ``-``-separated part of ``Time``
                     converted to int.
    * ``Hour``     - overwritten: the label with ``'H'`` removed, as int.
    * ``MeasName`` - overwritten: the label with ``'站点'`` removed, as int.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns ``Time``, ``Hour`` and ``MeasName``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after modification.
    """
    def day_of(date_text):
        return int(date_text.split('-')[-1])

    def hour_of(hour_label):
        return int(hour_label.replace('H', ''))

    def station_of(station_label):
        return int(station_label.replace('站点', ''))

    df['Day'] = df['Time'].map(day_of)
    df['Hour'] = df['Hour'].map(hour_of)
    df['MeasName'] = df['MeasName'].map(station_of)
    return df
# Derive the integer Day / Hour / MeasName features on every table,
# including the rows that have to be predicted.
train2018, train2019, test2020, test = (
    feature1(table) for table in (train2018, train2019, test2020, test)
)
# ---------------------------------------------------------------------------
# Segment-wise training and prediction.
#
# For each of the four prediction windows:
#   1. compute, on the 2019 data, how the mean pressure changed between two
#      equal-length windows (the month of the prediction and the month before),
#   2. train LightGBM on the 2020 month preceding the prediction window,
#   3. predict the window and add the 2019 shift from step 1.
#
# Previously the four segments were copy-pasted and segments 2-4 added the
# January->February shift instead of their own; each segment now uses the
# shift computed from its own pair of 2019 windows.
# ---------------------------------------------------------------------------
params = {
    'learning_rate': 0.05,
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',
    'metric': 'mae',
    'min_child_samples': 46,
    'min_child_weight': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'num_leaves': 26,
    'max_depth': 9,
    'seed': 2019,
    'verbosity': -1,
}
folds = 5
seeds = [2048, 1997]

# Columns that are never used as model inputs.
non_feature_columns = ['id', 'pressure', 'Time', 'Time_time']

# One entry per prediction segment:
# (earlier 2019 window, later 2019 window, 2020 training window, window to predict)
# The two 2019 windows of a segment deliberately keep the same end day as in
# the original script (e.g. 1-28 vs 1-28), so they cover the same day count.
segments = [
    (('2019-1-1', '2019-1-28'), ('2019-2-1', '2019-2-28'),
     ('2020-1-1', '2020-1-31'), ('2020-2-3', '2020-2-16')),
    (('2019-3-1', '2019-3-30'), ('2019-4-1', '2019-4-30'),
     ('2020-3-1', '2020-3-31'), ('2020-4-6', '2020-4-19')),
    (('2019-5-1', '2019-5-30'), ('2019-6-1', '2019-6-30'),
     ('2020-5-1', '2020-5-31'), ('2020-6-1', '2020-6-14')),
    (('2019-8-1', '2019-8-30'), ('2019-9-1', '2019-9-30'),
     ('2020-8-1', '2020-8-31'), ('2020-9-7', '2020-9-20')),
]


def _rows_between(df, start, end):
    """Return the rows of ``df`` whose ``Time_time`` lies in [start, end]."""
    in_window = (df['Time_time'] >= start) & (df['Time_time'] <= end)
    return df[in_window]


def _mean_pressure_shift(history, earlier_window, later_window):
    """Mean pressure in ``later_window`` minus mean pressure in ``earlier_window``.

    Both windows are ``(start, end)`` date strings applied to ``history``.
    The result is NaN when either window contains no rows.
    """
    later_mean = _rows_between(history, *later_window)['pressure'].mean()
    earlier_mean = _rows_between(history, *earlier_window)['pressure'].mean()
    return later_mean - earlier_mean


def _cross_validated_predictions(train_df, test_df):
    """Train LightGBM with repeated K-fold CV and predict ``test_df``.

    Features are all columns of ``train_df`` except ``non_feature_columns``;
    ``test_df`` must provide the same columns.

    Returns
    -------
    (oof_train, preds) : tuple of numpy.ndarray
        ``oof_train`` holds the out-of-fold predictions for ``train_df``
        averaged over the seeds; ``preds`` holds the predictions for
        ``test_df`` averaged over all folds and seeds.
    """
    used_feat = [f for f in train_df.columns if f not in non_feature_columns]
    print('feat nums ', len(used_feat), used_feat)
    train_x = train_df[used_feat]
    train_y = train_df['pressure']
    test_x = test_df[used_feat]
    print(train_x.shape, test_x.shape)

    oof_train = np.zeros(len(train_x))
    preds = np.zeros(len(test_x))
    for seed in seeds:
        kfold = KFold(n_splits=folds, shuffle=True, random_state=seed)
        for fold, (trn_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
            print('fold ', fold + 1)
            x_trn, y_trn = train_x.iloc[trn_idx], train_y.iloc[trn_idx]
            x_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]
            train_set = lgb.Dataset(x_trn, y_trn)
            val_set = lgb.Dataset(x_val, y_val)
            # NOTE(review): `early_stopping_rounds` / `verbose_eval` are the
            # keyword form this script was written against. Newer LightGBM
            # releases expect callbacks=[lgb.early_stopping(90),
            # lgb.log_evaluation(50)] instead - confirm against the installed
            # version before upgrading.
            model = lgb.train(params, train_set, num_boost_round=5000,
                              valid_sets=(train_set, val_set),
                              early_stopping_rounds=90,
                              verbose_eval=50)
            # Every row is in exactly one validation fold per seed, so
            # dividing by the number of seeds yields the seed average.
            oof_train[val_idx] += model.predict(x_val) / len(seeds)
            preds += model.predict(test_x) / folds / len(seeds)
            del x_trn, y_trn, x_val, y_val, model, train_set, val_set
            gc.collect()
    return oof_train, preds


predicted_segments = []
for earlier_2019, later_2019, train_window, predict_window in segments:
    shift = _mean_pressure_shift(train2019, earlier_2019, later_2019)
    segment_train = _rows_between(test2020, *train_window)
    # Work on an explicit copy so that writing the predictions below does not
    # target a filtered view of `test`.
    segment_test = _rows_between(test, *predict_window).copy()

    oof_train, preds = _cross_validated_predictions(segment_train, segment_test)
    mse = mean_squared_error(segment_train['pressure'], oof_train)
    print('-' * 120)
    print('mse ', round(mse, 5))

    # Assigning the array directly is positional, so no CSV round-trip is
    # needed to attach the predictions to the rows.
    segment_test['pressure'] = preds + shift
    predicted_segments.append(segment_test)

test = pd.concat(predicted_segments, axis=0)
test[['id', 'pressure']].to_csv('lgb_5000.csv', index=False)
转载:https://blog.csdn.net/qq_30803353/article/details/113741311
查看评论