
Machine Learning Algorithms: The Machine Makes Me Learn (2)


These notes cover the linear regression and logistic regression material. Apart from the opening part on least squares, I could not really follow the rest, so this is mostly a record of the code.

  Two-dimensional linear model: ordinary least squares:

    Model: y = a + b·x. Minimize the loss L(a, b) = ½ Σᵢ (a + b·xᵢ − yᵢ)², whose gradient is ∂L/∂a = Σᵢ (a + b·xᵢ − yᵢ) and ∂L/∂b = Σᵢ (a + b·xᵢ − yᵢ)·xᵢ.

from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import minimize

# For reproducibility
np.random.seed(1000)
# Number of samples
nb_samples = 200

def loss(v):
    # 0.5 * sum of squared errors for the line y = v[0] + v[1]*x
    e = 0.0
    for i in range(nb_samples):
        e += np.square(v[0] + v[1]*X[i] - Y[i])
    return 0.5 * e

def gradient(v):
    # Partial derivatives of the loss w.r.t. intercept v[0] and slope v[1]
    g = np.zeros(shape=2)
    for i in range(nb_samples):
        g[0] += (v[0] + v[1]*X[i] - Y[i])
        g[1] += ((v[0] + v[1]*X[i] - Y[i]) * X[i])
    return g

def show_dataset(X, Y):
    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    ax.scatter(X, Y)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.grid()
    plt.show()

if __name__ == '__main__':
    # Create dataset: a noisy line y = x + 2
    X = np.arange(-5, 5, 0.05)
    Y = X + 2
    Y += np.random.uniform(-0.5, 0.5, size=nb_samples)

    # Show the dataset
    show_dataset(X, Y)

    # Minimize the loss function
    result = minimize(fun=loss, x0=np.array([0.0, 0.0]), jac=gradient, method='L-BFGS-B')

    print('Interpolating line:')
    print('y = %.2fx + %.2f' % (result.x[1], result.x[0]))

    # Compute the absolute error
    err = 0.0
    for i in range(nb_samples):
        err += np.abs(Y[i] - (result.x[1]*X[i] + result.x[0]))
    print('Absolute error: %.2f' % err)

    [output: scatter plot of the generated dataset; fitted line and absolute error]
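    As a quick cross-check (my own addition, not from the original notes), the same slope and intercept can be recovered in closed form with numpy's least-squares polynomial fit, assuming X and Y are the arrays built in the script above:

import numpy as np
# Assumes X and Y as generated above (y = x + 2 plus uniform noise)
slope, intercept = np.polyfit(X, Y, deg=1)
print('Closed form: y = %.2fx + %.2f' % (slope, intercept))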

  Linear regression with scikit-learn, and higher-dimensional data

    Testing with k-fold cross-validation:

      [figure: k-fold cross-validation scheme]

    In cross-validation with scoring='r2', an R² close to 1 indicates a good fit, while a value close to 0 indicates a poor model. R² = 1 − SS_res / SS_tot, the residual sum of squares over the total sum of squares.
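    To make that definition concrete, here is a small hand-check (my own sketch, with made-up toy numbers) showing that 1 − SS_res/SS_tot matches scikit-learn's r2_score:

import numpy as np
from sklearn.metrics import r2_score

y_true = np.array([1.0, 2.0, 3.0, 4.0])   # toy ground truth
y_pred = np.array([1.1, 1.9, 3.2, 3.8])   # toy predictions
ss_res = np.sum((y_true - y_pred) ** 2)            # residual sum of squares
ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)   # total sum of squares
print(1.0 - ss_res / ss_tot)      # manual R^2
print(r2_score(y_true, y_pred))   # same value from scikit-learn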

from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score

# For reproducibility
np.random.seed(1000)

def show_dataset(data):
    fig, ax = plt.subplots(4, 3, figsize=(20, 15))
    for i in range(4):
        for j in range(3):
            ax[i, j].plot(data.data[:, i + (j + 1) * 3])
            ax[i, j].grid()
    plt.show()

if __name__ == '__main__':
    # Load dataset
    boston = load_boston()
    # Show dataset
    show_dataset(boston)
    # Create a linear regressor instance
    lr = LinearRegression(normalize=True)
    # Split dataset
    X_train, X_test, Y_train, Y_test = train_test_split(boston.data, boston.target, test_size=0.1)
    # Train the model
    lr.fit(X_train, Y_train)
    print('Score: %.3f' % lr.score(X_test, Y_test))
    # CV score: k-fold cross-validation with negative mean squared error
    scores = cross_val_score(lr, boston.data, boston.target, cv=7, scoring='neg_mean_squared_error')
    print('CV Negative mean squared errors mean: %.3f' % scores.mean())
    print('CV Negative mean squared errors std: %.3f' % scores.std())
    # CV R2 score: k-fold cross-validation with the coefficient of determination
    r2_scores = cross_val_score(lr, boston.data, boston.target, cv=10, scoring='r2')
    print('CV R2 score: %.3f' % r2_scores.mean())

    [output: per-feature plots and the printed scores]

     The resulting regression expression and predictions (an unsatisfactory result):

# Print the fitted regression expression
print('y = ' + str(lr.intercept_))
for i, c in enumerate(lr.coef_):
    print(str(c) + ' * x' + str(i))
# Predict on slightly perturbed samples
X = boston.data[0:10] + np.random.normal(0.0, 0.1)
print(lr.predict(X))

      Compare with the ground truth: boston.target[0:10]

       [output: predictions vs. boston.target[0:10]]

  Ridge, Lasso, and ElasticNet regression:

from __future__ import print_function
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import cross_val_score

# For reproducibility
np.random.seed(1000)

if __name__ == '__main__':
    diabetes = load_diabetes()
    # Create a linear regressor and compute CV score
    lr = LinearRegression(normalize=True)
    lr_scores = cross_val_score(lr, diabetes.data, diabetes.target, cv=10)
    print('Linear regression CV score: %.6f' % lr_scores.mean())
    # Create a Ridge regressor and compute CV score
    rg = Ridge(0.005, normalize=True)
    rg_scores = cross_val_score(rg, diabetes.data, diabetes.target, cv=10)
    print('Ridge regression CV score: %.6f' % rg_scores.mean())
    # Create a Lasso regressor and compute CV score
    ls = Lasso(0.01, normalize=True)
    ls_scores = cross_val_score(ls, diabetes.data, diabetes.target, cv=10)
    print('Lasso regression CV score: %.6f' % ls_scores.mean())
    # Create an ElasticNet regressor and compute CV score
    en = ElasticNet(alpha=0.001, l1_ratio=0.8, normalize=True)
    en_scores = cross_val_score(en, diabetes.data, diabetes.target, cv=10)
    print('ElasticNet regression CV score: %.6f' % en_scores.mean())

    # Find the optimal alpha value for Ridge regression
    rgcv = RidgeCV(alphas=(1.0, 0.1, 0.01, 0.005, 0.0025, 0.001, 0.00025), normalize=True)
    rgcv.fit(diabetes.data, diabetes.target)
    print('Ridge optimal alpha: %.3f' % rgcv.alpha_)
    # Find the optimal alpha value for Lasso regression
    lscv = LassoCV(alphas=(1.0, 0.1, 0.01, 0.005, 0.0025, 0.001, 0.00025), normalize=True)
    lscv.fit(diabetes.data, diabetes.target)
    print('Lasso optimal alpha: %.3f' % lscv.alpha_)
    # Find the optimal alpha and l1_ratio for ElasticNet
    encv = ElasticNetCV(alphas=(0.1, 0.01, 0.005, 0.0025, 0.001), l1_ratio=(0.1, 0.25, 0.5, 0.75, 0.8), normalize=True)
    encv.fit(diabetes.data, diabetes.target)
    print('ElasticNet optimal alpha: %.3f and L1 ratio: %.4f' % (encv.alpha_, encv.l1_ratio_))

[output: the CV scores and the optimal alpha values]
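One point worth remembering about Lasso is that the L1 penalty drives coefficients exactly to zero. A minimal sketch (my own, with alpha=0.1 chosen arbitrarily just to force visible sparsity) on the same diabetes data:

from sklearn.datasets import load_diabetes
from sklearn.linear_model import Lasso
import numpy as np

diabetes = load_diabetes()
ls = Lasso(alpha=0.1, normalize=True)  # alpha picked arbitrarily for illustration
ls.fit(diabetes.data, diabetes.target)
print('Non-zero coefficients: %d / %d' % (np.count_nonzero(ls.coef_), len(ls.coef_)))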

  Robust regression with random sample consensus (RANSAC)

from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, RANSACRegressor

# For reproducibility
np.random.seed(1000)
nb_samples = 200
nb_noise_samples = 150

def show_dataset(X, Y):
    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    ax.scatter(X, Y)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.grid()
    plt.show()

if __name__ == '__main__':
    # Create dataset: a noisy line plus strong outliers in the last 50 samples
    X = np.arange(-5, 5, 0.05)
    Y = X + 2
    Y += np.random.uniform(-0.5, 0.5, size=nb_samples)
    for i in range(nb_noise_samples, nb_samples):
        Y[i] += np.random.uniform(12, 15)
    # Show the dataset
    show_dataset(X, Y)
    # Create a linear regressor
    lr = LinearRegression(normalize=True)
    lr.fit(X.reshape(-1, 1), Y.reshape(-1, 1))
    print('Standard regressor: y = %.3fx + %.3f' % (lr.coef_, lr.intercept_))
    # Create a RANSAC regressor wrapping the same linear model
    rs = RANSACRegressor(lr)
    rs.fit(X.reshape(-1, 1), Y.reshape(-1, 1))
    print('RANSAC regressor: y = %.3fx + %.3f' % (rs.estimator_.coef_, rs.estimator_.intercept_))

[output: dataset with outliers; coefficients of the standard and RANSAC regressors]
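After fitting, the RANSAC regressor records which samples it treated as inliers in its inlier_mask_ attribute; a short sketch continuing from the script above:

# Boolean mask over the training samples: True = inlier, False = rejected outlier
inliers = rs.inlier_mask_
print('Inliers kept by RANSAC: %d / %d' % (inliers.sum(), len(inliers)))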

  Polynomial regression

from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

# For reproducibility
np.random.seed(1000)
nb_samples = 200

def show_dataset(X, Y):
    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    ax.scatter(X, Y)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.grid()
    plt.show()

if __name__ == '__main__':
    # Create dataset: a noisy parabola
    X = np.arange(-5, 5, 0.05)
    Y = X + 2
    Y += X**2 + np.random.uniform(-0.5, 0.5, size=nb_samples)
    # Show the dataset
    show_dataset(X, Y)
    # Split dataset
    X_train, X_test, Y_train, Y_test = train_test_split(X.reshape(-1, 1), Y.reshape(-1, 1), test_size=0.25)
    lr = LinearRegression(normalize=True)
    lr.fit(X_train, Y_train)
    print('Linear regression score: %.3f' % lr.score(X_train, Y_train))
    # Create polynomial features (adds a bias column and x^2)
    pf = PolynomialFeatures(degree=2)
    X_train = pf.fit_transform(X_train)
    X_test = pf.transform(X_test)
    lr.fit(X_train, Y_train)
    print('Second degree polynomial regression score: %.3f' % lr.score(X_train, Y_train))

[output: parabolic dataset and the two regression scores]
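Note that both scores above are computed on the training set. A fairer check (my addition) is to score the polynomial model on the held-out test set, which the script has already transformed:

# X_test was expanded with the same PolynomialFeatures instance above
print('Second degree polynomial regression test score: %.3f' % lr.score(X_test, Y_test))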

  Isotonic regression

from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from sklearn.isotonic import IsotonicRegression

# For reproducibility
np.random.seed(1000)
nb_samples = 100

def show_dataset(X, Y):
    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    ax.plot(X, Y, 'b.-')
    ax.grid()
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    plt.show()

def show_isotonic_regression_segments(X, Y, Yi, segments):
    lc = LineCollection(segments, zorder=0)
    lc.set_array(np.ones(len(Y)))
    lc.set_linewidths(0.5 * np.ones(nb_samples))
    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    ax.add_collection(lc)  # draw the connecting segments (missing in the original)
    ax.plot(X, Y, 'b.', markersize=8)
    ax.plot(X, Yi, 'g.-', markersize=8)
    ax.grid()
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    plt.show()

if __name__ == '__main__':
    # Create dataset
    X = np.arange(-5, 5, 0.1)
    Y = X + np.random.uniform(-0.5, 1, size=X.shape)
    # Show original dataset
    show_dataset(X, Y)
    # Create an isotonic regressor bounded in [-6, 10]
    ir = IsotonicRegression(-6, 10)
    Yi = ir.fit_transform(X, Y)
    # Create a segment list connecting each point to its isotonic value
    segments = [[[i, Y[i]], [i, Yi[i]]] for i in range(nb_samples)]
    # Show isotonic interpolation
    show_isotonic_regression_segments(X, Y, Yi, segments)

      [figure: isotonic regression fit with interpolation segments]
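The fitted isotonic regressor can also interpolate at new x values with predict; a tiny sketch (my own, continuing from the script above):

# Piecewise-linear interpolation at new points inside the training range
print(ir.predict([-2.0, 0.0, 3.0]))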

Logistic regression:

from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression

# For reproducibility
np.random.seed(1000)
nb_samples = 500

def show_dataset(X, Y):
    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    ax.grid()
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    for i in range(nb_samples):
        if Y[i] == 0:
            ax.scatter(X[i, 0], X[i, 1], marker='o', color='r')
        else:
            ax.scatter(X[i, 0], X[i, 1], marker='^', color='b')
    plt.show()

def show_classification_areas(X, Y, lr):
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
    Z = lr.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure(1, figsize=(10, 8))
    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel1)
    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=np.abs(Y - 1), edgecolors='k', cmap=plt.cm.coolwarm)
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())
    plt.show()

if __name__ == '__main__':
    # Create dataset
    X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0,
                               n_clusters_per_class=1)
    # Show dataset
    show_dataset(X, Y)
    # Split dataset
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)
    # Create logistic regressor
    lr = LogisticRegression()
    lr.fit(X_train, Y_train)
    print('Logistic regression score: %.3f' % lr.score(X_test, Y_test))
    # Compute CV score
    lr_scores = cross_val_score(lr, X, Y, scoring='accuracy', cv=10)
    print('Logistic regression CV average score: %.3f' % lr_scores.mean())
    # Show classification areas
    show_classification_areas(X, Y, lr)

[figure: dataset, classification regions, and printed scores]
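Besides hard predictions, LogisticRegression exposes class probabilities through predict_proba; a short sketch continuing from the script above:

# Each row is [P(class 0), P(class 1)] for one sample; rows sum to 1
probs = lr.predict_proba(X_test[0:5])
print(probs)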

  Stochastic gradient descent

from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

# For reproducibility
np.random.seed(1000)
nb_samples = 500

def show_dataset(X, Y):
    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    ax.grid()
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    for i in range(nb_samples):
        if Y[i] == 0:
            ax.scatter(X[i, 0], X[i, 1], marker='o', color='r')
        else:
            ax.scatter(X[i, 0], X[i, 1], marker='^', color='b')
    plt.show()

if __name__ == '__main__':
    # Create dataset
    X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0,
                               n_clusters_per_class=1)
    # Show dataset
    show_dataset(X, Y)
    # Create a perceptron as an SGD instance
    # The same result can be obtained using the class sklearn.linear_model.Perceptron directly
    sgd = SGDClassifier(loss='perceptron', learning_rate='optimal', max_iter=10)  # n_iter in older scikit-learn
    sgd_scores = cross_val_score(sgd, X, Y, scoring='accuracy', cv=10)
    print('Perceptron CV average score: %.3f' % sgd_scores.mean())

[output: dataset plot and the perceptron CV score]
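As the comment in the code notes, the same model is available as sklearn.linear_model.Perceptron; a minimal sketch (my own, reusing X and Y from above):

from sklearn.linear_model import Perceptron
from sklearn.model_selection import cross_val_score

p = Perceptron(max_iter=10)
p_scores = cross_val_score(p, X, Y, scoring='accuracy', cv=10)
print('Perceptron (direct class) CV average score: %.3f' % p_scores.mean())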

  Finding optimal hyperparameters with grid search

from __future__ import print_function
import numpy as np
import multiprocessing
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression

# For reproducibility
np.random.seed(1000)

if __name__ == '__main__':
    # Load dataset
    iris = load_iris()

    # Define a param grid
    param_grid = [
        {
            'penalty': ['l1', 'l2'],
            'C': [0.5, 1.0, 1.5, 1.8, 2.0, 2.5]
        }
    ]
    # Create and train a grid search
    gs = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid,
                      scoring='accuracy', cv=10, n_jobs=multiprocessing.cpu_count())
    gs.fit(iris.data, iris.target)
    # Best estimator
    print(gs.best_estimator_)
    gs_scores = cross_val_score(gs.best_estimator_, iris.data, iris.target, scoring='accuracy', cv=10)
    print('Best estimator CV average score: %.3f' % gs_scores.mean())

[output: the best estimator and its CV score]
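Besides best_estimator_, a fitted GridSearchCV also stores the winning parameter combination and the score it achieved during the search (continuing from the script above):

print(gs.best_params_)  # the winning combination from param_grid
print('Best CV score during search: %.3f' % gs.best_score_)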

    The same grid-search approach applied to an SGDClassifier, searching over the penalty, alpha, and l1_ratio:
from __future__ import print_function
import numpy as np
import multiprocessing
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import SGDClassifier

# For reproducibility
np.random.seed(1000)

if __name__ == '__main__':
    # Load dataset
    iris = load_iris()

    # Define a param grid
    param_grid = [
        {
            'penalty': ['l1', 'l2', 'elasticnet'],
            'alpha': [1e-5, 1e-4, 5e-4, 1e-3, 2.3e-3, 5e-3, 1e-2],
            'l1_ratio': [0.01, 0.05, 0.1, 0.15, 0.25, 0.35, 0.5, 0.75, 0.8]
        }
    ]
    # Create SGD classifier
    sgd = SGDClassifier(loss='perceptron', learning_rate='optimal')
    # Create and train a grid search
    gs = GridSearchCV(estimator=sgd, param_grid=param_grid, scoring='accuracy', cv=10,
                      n_jobs=multiprocessing.cpu_count())
    gs.fit(iris.data, iris.target)
    # Best estimator
    print(gs.best_estimator_)
    gs_scores = cross_val_score(gs.best_estimator_, iris.data, iris.target, scoring='accuracy', cv=10)
    print('Best estimator CV average score: %.3f' % gs_scores.mean())

[output: the best estimator and its CV score]

  Metrics for evaluating classification, including the confusion matrix

from __future__ import print_function
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, zero_one_loss, jaccard_similarity_score, \
    confusion_matrix, precision_score, recall_score, fbeta_score

# For reproducibility
np.random.seed(1000)
nb_samples = 500

if __name__ == '__main__':
    # Create dataset
    X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0,
                               n_clusters_per_class=1)

    # Split dataset
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)
    # Create and train logistic regressor
    lr = LogisticRegression()
    lr.fit(X_train, Y_train)
    print('Accuracy score: %.3f' % accuracy_score(Y_test, lr.predict(X_test)))
    print('Zero-one loss (normalized): %.3f' % zero_one_loss(Y_test, lr.predict(X_test)))
    print('Zero-one loss (unnormalized): %.3f' % zero_one_loss(Y_test, lr.predict(X_test), normalize=False))
    print('Jaccard similarity score: %.3f' % jaccard_similarity_score(Y_test, lr.predict(X_test)))
    # Compute confusion matrix
    cm = confusion_matrix(y_true=Y_test, y_pred=lr.predict(X_test))
    print('Confusion matrix:')
    print(cm)
    print('Precision score: %.3f' % precision_score(Y_test, lr.predict(X_test)))
    print('Recall score: %.3f' % recall_score(Y_test, lr.predict(X_test)))
    print('F-Beta score (1): %.3f' % fbeta_score(Y_test, lr.predict(X_test), beta=1))
    print('F-Beta score (0.75): %.3f' % fbeta_score(Y_test, lr.predict(X_test), beta=0.75))
    print('F-Beta score (1.25): %.3f' % fbeta_score(Y_test, lr.predict(X_test), beta=1.25))

[output: the printed metric values and the confusion matrix]
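To see how precision and recall fall out of the confusion matrix, here is a manual computation (my own sketch; for binary labels 0/1, scikit-learn lays cm out as [[TN, FP], [FN, TP]]):

# Unpack the binary confusion matrix computed above
tn, fp, fn, tp = cm.ravel()
print('Precision (manual): %.3f' % (float(tp) / (tp + fp)))
print('Recall (manual): %.3f' % (float(tp) / (tp + fn)))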

  ROC curve:

from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

# For reproducibility
np.random.seed(1000)
nb_samples = 500

if __name__ == '__main__':
    # Create dataset
    X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0,
                               n_clusters_per_class=1)

    # Split dataset
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)

    # Create and train logistic regressor
    lr = LogisticRegression()
    lr.fit(X_train, Y_train)

    # Compute ROC curve from the decision-function scores
    Y_score = lr.decision_function(X_test)
    fpr, tpr, thresholds = roc_curve(Y_test, Y_score)

    plt.figure(figsize=(10, 8))
    plt.plot(fpr, tpr, color='red', label='Logistic regression (AUC: %.2f)' % auc(fpr, tpr))
    plt.plot([0, 1], [0, 1], color='blue', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.01])
    plt.title('ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.show()

[figure: ROC curve with the random-guess diagonal]
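The AUC can also be computed in a single call with roc_auc_score, without building the curve first (continuing from the script above):

from sklearn.metrics import roc_auc_score
# Same value as auc(fpr, tpr) above
print('AUC: %.3f' % roc_auc_score(Y_test, Y_score))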

 


Original post: https://www.cnblogs.com/bai2018/p/10581410.html
