import pandas as pd from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import train_test_split from sklearn.model_selection import GridSearchCV from sklearn.model_selection import cross_val_score import matplotlib.pyplot as plt
data = pd.read_csv(r"C:\work\learnbetter\micro-class\week 1 DT\data\data.csv",index_col = 0) data.head() data.info()
#删除缺失值过多的列,和观察判断来说和预测的y没有关系的列 data.drop(["Cabin","Name","Ticket"],inplace=True,axis=1) #处理缺失值,对缺失值较多的列进行填补,有一些特征只确实一两个值,可以采取直接删除记录的方法 data["Age"] = data["Age"].fillna(data["Age"].mean()) data = data.dropna() #将分类变量转换为数值型变量 #将二分类变量转换为数值型变量 #astype能够将一个pandas对象转换为某种类型,和apply(int(x))不同,astype可以将文本类转换为数字,用这 个方式可以很便捷地将二分类特征转换为0~1 data["Sex"] = (data["Sex"]== "male").astype("int") #将三分类变量转换为数值型变量 labels = data["Embarked"].unique().tolist() data["Embarked"] = data["Embarked"].apply(lambda x: labels.index(x)) #查看处理后的数据集 data.head()
X = data.iloc[:,data.columns != "Survived"] y = data.iloc[:,data.columns == "Survived"] from sklearn.model_selection import train_test_split Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,y,test_size=0.3) #修正测试集和训练集的索引 for i in [Xtrain, Xtest, Ytrain, Ytest]: i.index = range(i.shape[0]) #查看分好的训练集和测试集 Xtrain.head()
clf = DecisionTreeClassifier(random_state=25) clf = clf.fit(Xtrain, Ytrain) score_ = clf.score(Xtest, Ytest) score_ score = cross_val_score(clf,X,y,cv=10).mean() score
tr = [] te = [] for i in range(10): clf = DecisionTreeClassifier(random_state=25 ,max_depth=i+1 ,criterion="entropy" ) clf = clf.fit(Xtrain, Ytrain) score_tr = clf.score(Xtrain,Ytrain) score_te = cross_val_score(clf,X,y,cv=10).mean() tr.append(score_tr) te.append(score_te) print(max(te)) plt.plot(range(1,11),tr,color="red",label="train") plt.plot(range(1,11),te,color="blue",label="test") plt.xticks(range(1,11)) plt.legend() plt.show() #这里为什么使用“entropy”?因为我们注意到,在最大深度=3的时候,模型拟合不足,在训练集和测试集上的表现接近,但却都不是非常理想,只能够达到83%左右,所以我们要使用entropy。
import numpy as np gini_thresholds = np.linspace(0,0.5,20) parameters = {‘splitter‘:(‘best‘,‘random‘) ,‘criterion‘:("gini","entropy") ,"max_depth":[*range(1,10)] ,‘min_samples_leaf‘:[*range(1,50,5)] ,‘min_impurity_decrease‘:[*np.linspace(0,0.5,20)] } clf = DecisionTreeClassifier(random_state=25) GS = GridSearchCV(clf, parameters, cv=10) GS.fit(Xtrain,Ytrain) GS.best_params_ GS.best_score_
机器学习sklearn(三十五):算法实例(四)分类(三)泰坦尼克号幸存者的预测
原文:https://www.cnblogs.com/qiu-hua/p/14920898.html