Although I originally expected this article to focus on outlier treatment, reading through it the author's emphasis is actually on tuning XGB and LGB and on tuning the ensemble weights.
The outlier handling itself is covered with just a simple threshold. Even so, there is plenty worth learning here.
Below I go through the noteworthy techniques and operations in the order of the original kernel.
import numpy as np
import pandas as pd

prop = pd.read_csv('../input/properties_2016.csv')
# downcast float64 columns to float32 to roughly halve memory usage
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)
Personally I think the same thing could be done in one shot by selecting the float64 columns via their dtypes (note that prop.columns.dtype would not work here, since that is the dtype of the column index itself):
float_cols = prop.columns[prop.dtypes == np.float64]
prop[float_cols] = prop[float_cols].astype(np.float32)
# fill missing values with each column's median
df_train.fillna(df_train.median(), inplace=True)
import lightgbm as lgb
LightGBM's dataset structure
d_train = lgb.Dataset(x_train, label=y_train)
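The kernel trains on the full data with a fixed round count; if one wanted early stopping, a validation Dataset could be attached. A minimal sketch, assuming x_train and y_train from above (the split itself is not part of the original kernel):
from sklearn.model_selection import train_test_split

# hold out 20% for validation; reference= lets the validation set reuse the training bins
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)
d_tr = lgb.Dataset(x_tr, label=y_tr)
d_val = lgb.Dataset(x_val, label=y_val, reference=d_tr)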
Hyperparameters
params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.0021   # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'l1'            # or 'mae'
params['sub_feature'] = 0.5        # feature_fraction -- OK, back to .5, but maybe later increase this
params['bagging_fraction'] = 0.85  # sub_row
params['bagging_freq'] = 40
params['num_leaves'] = 512         # num_leaf
params['min_data'] = 500           # min_data_in_leaf
params['min_hessian'] = 0.05       # min_sum_hessian_in_leaf
params['verbose'] = 0
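If the alias names are confusing, the same configuration can be written with what I understand to be LightGBM's canonical parameter names (the keys above are aliases for these):
params = {
    'max_bin': 10,
    'learning_rate': 0.0021,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l1',
    'feature_fraction': 0.5,           # alias: sub_feature
    'bagging_fraction': 0.85,          # alias: sub_row
    'bagging_freq': 40,
    'num_leaves': 512,
    'min_data_in_leaf': 500,           # alias: min_data
    'min_sum_hessian_in_leaf': 0.05,   # alias: min_hessian
    'verbose': 0,
}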
Training
import gc

clf = lgb.train(params, d_train, 430)   # 430 boosting rounds
del d_train; gc.collect()
del x_train; gc.collect()
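The 430 rounds here are hard-coded; with the validation split sketched earlier, the round count could instead be chosen by early stopping. A sketch (not part of the original kernel; the callback API needs a reasonably recent LightGBM, older versions take early_stopping_rounds as a train() argument instead):
clf = lgb.train(
    params,
    d_tr,
    num_boost_round=2000,                                # generous upper bound; stops earlier
    valid_sets=[d_val],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],  # stop after 50 rounds without improvement
)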
Prediction
# num_threads > 1 makes prediction very slow in the Kaggle kernel
clf.reset_parameter({"num_threads":1})
p_test = clf.predict(x_test)
del x_test; gc.collect()
# drop outliers
train_df = train_df[train_df.logerror > -0.4]
train_df = train_df[train_df.logerror < 0.418]
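Since these hard-coded cutoffs (-0.4 / 0.418) are the entire outlier treatment, one could also derive them from the data. A quantile-based variant, as a sketch (the 0.5% / 99.5% percentiles are my assumption, not tuned values from the kernel):
lo, hi = train_df['logerror'].quantile([0.005, 0.995])
train_df = train_df[(train_df.logerror > lo) & (train_df.logerror < hi)]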
import xgboost as xgb
Set the parameters
# xgboost params
xgb_params = {
    'eta': 0.037,
    'max_depth': 5,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,
    'alpha': 0.4,
    'base_score': y_mean,
    'silent': 1
}
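One version note of my own, not from the kernel: newer XGBoost releases deprecate 'reg:linear'; on those versions the equivalent objective would be:
xgb_params['objective'] = 'reg:squarederror'   # replaces the deprecated 'reg:linear'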
XGBoost's data structure
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)
Training + prediction
# num_boost_rounds = 150
num_boost_rounds = 242
print("\nXGBoost tuned with CV in:")
print(" https://www.kaggle.com/aharless/xgboost-without-outliers-tweak ")
print("num_boost_rounds="+str(num_boost_rounds))
# train model
print( "\nTraining XGBoost ...")
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)
print( "\nPredicting with XGBoost ...")
xgb_pred = model.predict(dtest)
print( "\nXGBoost predictions:" )
print( pd.DataFrame(xgb_pred).head() )
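The 242 boosting rounds come from cross-validation in the linked kernel. A rough sketch of how such a CV run could be set up with xgb.cv (the fold count and round cap are my assumptions):
cv_result = xgb.cv(
    xgb_params,
    dtrain,
    num_boost_round=1000,        # generous upper bound
    nfold=5,
    early_stopping_rounds=50,    # stop when MAE stops improving
    verbose_eval=50,
)
num_boost_rounds = len(cv_result)   # rows returned = best round count found by CV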
XGB + LGB + naive baseline (0.0115) ensemble
# Parameters
XGB_WEIGHT = 0.6500
BASELINE_WEIGHT = 0.0056
BASELINE_PRED = 0.0115
lgb_weight = 1 - XGB_WEIGHT - BASELINE_WEIGHT
pred = XGB_WEIGHT*xgb_pred + BASELINE_WEIGHT*BASELINE_PRED + lgb_weight*p_test
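To round it off, a sketch of writing the blended prediction into a Zillow-style submission (the sample file path and the idea of filling every month column with the same prediction are my assumptions, not shown in the excerpt above):
sub = pd.read_csv('../input/sample_submission.csv')
for col in sub.columns[1:]:              # first column is assumed to be the parcel id
    sub[col] = pred                      # same blended prediction for every month column
sub.to_csv('xgb_lgb_blend.csv', index=False, float_format='%.4f')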
EDA intermediate - Kaggle study notes 5 - simple XGB + LGB ensemble
Original post: https://www.cnblogs.com/niemand-01/p/14379550.html