首页 > 其他 > 详细

sklearn之验证曲线

时间:2019-07-16 13:02:20      阅读:81      评论:0      收藏:0      [点我收藏+]
'''
    验证曲线:模型性能 = f(超参数)----通过优化模型参数,达到优化模型预测结果,使得模型预测更加精准
    验证曲线所需API:
            train_scores, test_scores = ms.validation_curve(
            model,        # 模型
            输入集, 输出集,
            'n_estimators',         #超参数名
            np.arange(50, 550, 50),    #超参数序列
            cv=5        #折叠数
            )

            train_scores的结构:
                超参数取值    第一次折叠    第二次折叠    第三次折叠    第四次折叠    第五次折叠
                        50    0.91823444    0.91968162    0.92619392    0.91244573    0.91040462
                        100    0.91968162    0.91823444    0.91244573    0.92619392    0.91244573
                        ...           ...            ...         ...            ...            ...
            test_scores的结构与train_scores的结构相同。

    案例:在小汽车评级案例中使用验证曲线选择较优参数。

'''

import numpy as np
import matplotlib.pyplot as mp
import sklearn.preprocessing as sp
import sklearn.ensemble as se
import sklearn.model_selection as ms
import sklearn.metrics as sm
import warnings

# Silence library warnings so they do not clutter the tutorial output.
warnings.filterwarnings('ignore')

# Load the samples: one comma-separated record per line of the text file.
# The path is relative to the current working directory.
with open('./ml_data/car.txt', 'r') as f:
    # Strip only the line terminator rather than slicing off the last
    # character, so a final line without a trailing newline stays intact,
    # and skip blank lines so every record has the same number of fields.
    data = [line.rstrip('\n').split(',') for line in f if line.strip()]
# 2-D array of strings: one row per sample, one column per field.
data = np.array(data)
# print(data.shape)

# Build one label encoder per column and assemble the training input/output.
# After the transpose each row of `data` holds every sample's value for a
# single field, so a whole field can be encoded in one call.
data = data.T
last_index = len(data) - 1
encoders = [sp.LabelEncoder() for _ in data]
train_x, train_y = [], []
for index, (column, codec) in enumerate(zip(data, encoders)):
    encoded = codec.fit_transform(column)
    if index == last_index:
        # The final field is the target and becomes the output set.
        train_y = encoded
    else:
        train_x.append(encoded)

# Transpose back so each row of train_x is one sample again.
train_x = np.array(train_x).T
# Random-forest classifier whose hyperparameters are explored below.
model = se.RandomForestClassifier(max_depth=6, n_estimators=150, random_state=7)

# Validation curve for n_estimators.
# param_name / param_range are passed by keyword because current
# scikit-learn releases accept them as keyword-only arguments.
# scoring is set explicitly so the curves report the same weighted-F1 metric
# that the cross-validation step and the plot's "f1 score" axis refer to.
# Each returned array has one row per candidate value and one column per fold.
train_scores, test_scores = ms.validation_curve(
    model, train_x, train_y,
    param_name='n_estimators',
    param_range=np.arange(50, 550, 50),
    cv=5,
    scoring='f1_weighted',
)

# Validation curve for max_depth, laid out the same way.
train_scores1, test_scores1 = ms.validation_curve(
    model, train_x, train_y,
    param_name='max_depth',
    param_range=np.arange(1, 7),
    cv=5,
    scoring='f1_weighted',
)

# Cross-validate before the final fit (4 folds, weighted F1); `cv` holds the
# per-fold scores and cv.mean() gives the overall estimate.
cv = ms.cross_val_score(model, train_x, train_y, cv=4, scoring='f1_weighted')
# Fit the model on the complete training set.
model.fit(train_x, train_y)

# Hand-written test samples for predicting the car grade; the last field of
# each row is the expected class label.
# Every field is a string literal, matching the text values that were split
# out of the data file and used to fit the label encoders.
data = [
    ['high', 'med', '5more', '4', 'big', 'low', 'unacc'],
    ['high', 'high', '4', '4', 'med', 'med', 'acc'],
    ['low', 'low', '2', '4', 'small', 'high', 'good'],
    ['low', 'med', '3', '4', 'med', 'high', 'vgood'],
]

# Transpose so each row holds one field across all samples, mirroring the
# layout used when the encoders were fitted.
data = np.array(data).T
test_x, test_y = [], []
for row, column in enumerate(data):
    encoder = encoders[row]  # encoder fitted on the matching training column
    if row < len(data) - 1:
        # The encoder is already fitted, so only transform (never re-fit).
        test_x.append(encoder.transform(column))
    else:
        test_y = encoder.transform(column)
test_x = np.array(test_x).T

pred_test_y = model.predict(test_x)
print(pred_test_y)
# Map the encoded classes back to their original text labels.
pred_test_y = encoders[-1].inverse_transform(pred_test_y)
test_y = encoders[-1].inverse_transform(test_y)

# Plot both validation curves: mean test score across folds for each
# candidate hyperparameter value.
mp.figure('Validation Curve', facecolor='lightgray')

# Upper panel: score as a function of n_estimators.
mp.subplot(211)
mp.title('N_estimators')
mp.xlabel('N_estimators')
mp.ylabel('f1 score')
mp.grid(linestyle=":")
mp.plot(np.arange(50, 550, 50), np.mean(test_scores, axis=1), label='N_estimators VC')
mp.legend()

# Lower panel: score as a function of max_depth.
mp.subplot(212)
mp.title('Max_depth')
mp.xlabel('Max_depth')
mp.ylabel('f1 score')
mp.grid(linestyle=":")
mp.plot(np.arange(1, 7), np.mean(test_scores1, axis=1), label='Max_depth VC')
mp.legend()

mp.show()



输出结果:

[2 0 0 3]
['unacc' 'acc' 'acc' 'vgood']
['unacc' 'acc' 'good' 'vgood']

  技术分享图片

sklearn之验证曲线

原文:https://www.cnblogs.com/yuxiangyang/p/11194151.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!