其实这篇没啥内容,就是在熟悉一下代码 (*^_^*)
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import datasets
digits = datasets.load_digits()
#从样本数据中选出2/3作为训练集,1/3个作为测试集,并打乱数据集
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(digits.data,digits.target,
test_size = 1/3)
from sklearn.tree import DecisionTreeClassifier
treeclf = DecisionTreeClassifier()
treeclf.fit(X_train,y_train)
treeclf.score(X_test,y_test)
# Cluster the digit images with K-Means.
from sklearn.cluster import KMeans
clt = KMeans(n_clusters=3)
# BUG FIX: the original called clt.fit(X), but no variable X is ever
# defined in this post — the feature matrix loaded above is digits.data.
# NOTE(review): digits has 10 classes; n_clusters=3 is kept from the
# original but is likely just a toy value — confirm the intent.
clt.fit(digits.data)
# 得到每种元素的分类 -> cluster label assigned to each sample
print(clt.predict(digits.data))
# 得到聚类的中心 -> coordinates of the learned cluster centers
print(clt.cluster_centers_)
PCA主成分分析(Principal Components Analysis)是最常使用的降维算法,其基本思想如下:将原先的n个特征用数目更少的m个特征取代,新特征是旧特征的线性组合,这些线性组合最大化样本方差,从而保留样本尽可能多的信息,并且m个特征互不相关。用几何观点来看,PCA主成分分析方法可以看成通过正交变换,对坐标系进行旋转和平移,并保留样本点投影坐标方差最大的前几个新的坐标。
通过PCA主成分分析,可以帮助去除样本中的噪声信息,便于进一步做回归分析。
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import datasets
boston = datasets.load_boston()
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(boston.data,boston.target,
test_size = 1/3,random_state = 0)
(len(X_train),len(X_test))
# 特征极差标准化
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler(feature_range = (0,1))
scaler.fit(X_train)
X_train,X_test = scaler.transform(X_train),scaler.transform(X_test)
这里 explained_variance_ 衡量的是各维度(各主成分方向)上的方差。
# Project the scaled features onto their first 6 principal components,
# then regress the target on the projected data.
from sklearn.decomposition import PCA
pca = PCA(n_components=6)
pca.fit(X_train)
# Variance captured along each retained component (notebook-style echo).
pca.explained_variance_
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
# Elastic-net linear regression with built-in cross-validated
# regularization selection, fitted on the reduced feature space.
from sklearn.linear_model import ElasticNetCV
netreg = ElasticNetCV()
netreg.fit(X_train_pca, y_train)
netreg.predict(X_test_pca)
# R^2 score on the held-out test set (notebook-style echo).
netreg.score(X_test_pca, y_test)
scikit基础与机器学习入门(8) sklearn主要解决的三类问题——分类,回归和聚类
原文:https://www.cnblogs.com/xiaoyunbowen/p/15312561.html