今天介绍一个机器学习包:sklearn。其功能模块有:regression(回归)、classification(分类)、clustering(聚类)、dimensionality reduction(降维)、data preprocessing(数据预处理)和 model selection(模型选择)。
对我来说,常用的主要有regression(SVR)和classification(SVC)两个部分。
首先介绍一下用sklearn来做回归(前两个例子使用sklearn.linear_model.LinearRegression,第三个例子使用sklearn.svm.SVR),如下:
1)多元线性回归
# Multivariate linear regression: fit y = 1.25*x1 + 2.5*x2 + 3*x3 + 10 + noise
# on synthetic data and check that the fitted coefficients recover the truth.
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(10)  # local, reproducible random generator

# 50 samples x 3 features, uniformly distributed in [0, 100).
x = 100 * rng.rand(50, 3)

# Slice (instead of assigning to .shape) so each feature stays a (50, 1) column.
x1 = x[:, 0:1]
x2 = x[:, 1:2]
x3 = x[:, 2:3]

# Known linear relation plus standard-normal noise, used to verify the fit.
y = 1.25 * x1 + 2.5 * x2 + 3 * x3 + 10 + rng.randn(50, 1)

model = LinearRegression(fit_intercept=True)
model.fit(x, y)

# Evaluation grid: 1000 evenly spaced values in [0, 50], reused for all features.
a = np.linspace(0, 50, 1000)
x1_fit = a[:, np.newaxis]  # turn the 1-D grid into a column vector
x2_fit = a[:, np.newaxis]
x3_fit = a[:, np.newaxis]
x_fit = np.hstack((x1_fit, x2_fit, x3_fit))  # stack the columns into (1000, 3)
y_fit = model.predict(x_fit)  # predictions on the evaluation grid

# y is 2-D, so coef_ has shape (1, 3); row 0 holds the three slopes.
print("Model slope: ", model.coef_[0])
print("Model intercept:", model.intercept_)
# Coefficient of determination (R^2) on the training data.
print('方程的判定系数(R^2): %.2f' % model.score(x, y))
2)多项式回归
# Polynomial regression: expand x into degree-2 polynomial features, then fit
# an ordinary linear model on the expanded features.
import random

import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Generate 30 points on y = -x^2 + 5x - 10 with uniform noise in [0, 20).
x_data, y_data = [], []
for x in range(-10, 20):
    y = -(x ** 2) + 5 * x - 10 + random.random() * 20
    x_data.append([x])
    y_data.append([y])

# Feature construction: columns [1, x, x^2].
poly_reg = PolynomialFeatures(degree=2)
x_poly = poly_reg.fit_transform(x_data)

# Linear model on the polynomial features.
linear_reg = LinearRegression()
linear_reg.fit(x_poly, y_data)

plt.plot(x_data, y_data, 'b.')
# Predict with the already-fitted transformer: use transform(), not
# fit_transform(), so prediction never re-fits the feature expansion.
plt.plot(x_data, linear_reg.predict(poly_reg.transform(x_data)), 'r')
plt.show()
3)非线性回归(一元为例)
# Non-linear regression (single feature) with SVR; hyperparameters are chosen
# by a cross-validated grid search.
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV  # cross-validated parameter search
from sklearn.tree import DecisionTreeRegressor  # decision tree
from sklearn.ensemble import RandomForestRegressor  # random forest
import numpy as np
import matplotlib.pyplot as plt

# Features must be 2-D: reshape(-1, 1) yields [[v1], [v2], ...].
x = np.array([68.67, 54.351, 92.991, 80.39, 64.46]).reshape(-1, 1)
# Targets stay 1-D; SVR expects shape (n_samples,) and warns on a column vector.
y = np.array([68.67, 54.351, 92.991, 80.39, 64.46])

# Model choice. Drop-in alternatives:
#   SVR(kernel='rbf'), DecisionTreeRegressor(), RandomForestRegressor()
# With only 5 samples the default 5-fold CV leaves one sample per test fold,
# where the default R^2 score is undefined; score with MSE so the search can
# actually rank the candidates.
model = GridSearchCV(
    SVR(),
    param_grid={
        "kernel": ("linear", 'rbf', 'sigmoid'),
        "C": np.logspace(-3, 3, 7),
        "gamma": np.logspace(-3, 3, 7),
    },
    scoring="neg_mean_squared_error",
)
model.fit(x, y)

xneed = np.array([[1.2], [3.6]])  # new inputs to predict
y_pre = model.predict(xneed)  # run the prediction

plt.scatter(x, y, c='k', label='data', zorder=1)
plt.plot(xneed, y_pre, c='r', label='SVR_fit')
plt.show()
print(model.best_params_)
补充:
1.如果要划分训练样本和测试样本数据集。
# Split the samples into training and test subsets.
# NOTE: x and y are not defined in this snippet; they must already exist.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3) # hold out 30% of the samples as the test set
2.为了消除各特征之间量纲和数值范围的差异(SVR等模型对特征尺度比较敏感),通常对数据进行预处理,如标准化。
# Standardize the feature matrix with StandardScaler.
# NOTE: x is not defined in this snippet; it must already exist.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_std = scaler.fit_transform(x) # fit the scaler on x and return the standardized data
3.可以用GridSearchCV通过交叉验证自动搜索最佳超参数组合,从而选出最佳模型
# Build a cross-validated grid search (cv=3 folds, n_jobs=-1 for parallel jobs).
# NOTE(review): svc and param_grid are not defined in this snippet; presumably
# an estimator (e.g. an SVC instance) and a dict of candidate parameter values.
# Define both before running this line.
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(svc, param_grid, cv=3, n_jobs=-1)
4.模型保存
# Save a fitted model to disk and load it back.
# sklearn.externals.joblib has been removed from scikit-learn; import the
# standalone joblib package (installed as a scikit-learn dependency) instead.
import joblib  # used to save and load the model as a .pkl file

joblib.dump(model, 'svr.pkl')  # save the model
# NOTE: joblib.load is pickle-based; only load files from a trusted source.
svr = joblib.load('svr.pkl')  # load the model
过两天补充一下sklearn.svm.SVC...
原文:https://www.cnblogs.com/ljwgis/p/11979021.html