import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import pydot
from sklearn.externals.six import StringIO
from IPython.display import Image
import pydotplus
# Load the training set; "Age" is read as float64 so that missing entries
# are represented as NaN and can be imputed later.
train = pd.read_csv("train2.csv", dtype={"Age": np.float64})
# Preview the first ten rows. The call form of print is used because it
# behaves the same under Python 2 and Python 3 for a single argument.
print(train.head(10))
def harmonize_data(titanic):
    """Impute missing values and integer-encode categoricals, in place.

    Operates on the columns "Age", "Sex", "Embarked" and "Fare":

    * missing "Age" and "Fare" entries are filled with that column's median;
    * "Sex" is encoded as male -> 0, female -> 1;
    * missing "Embarked" entries are filled with "S", after which the
      column is encoded as S -> 0, C -> 1, Q -> 2.

    Labels that are not part of a mapping are left unchanged.

    Args:
        titanic: pandas DataFrame containing the four columns listed above.

    Returns:
        The same DataFrame object that was passed in, after mutation.
    """
    sex_codes = {"male": 0, "female": 1}
    port_codes = {"S": 0, "C": 1, "Q": 2}

    titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
    titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].median())

    # Translate labels through a lookup table in a single pass instead of
    # writing integers into a string column one mask at a time. Using
    # .get(value, value) keeps any unexpected label exactly as it was.
    titanic["Sex"] = titanic["Sex"].map(lambda value: sex_codes.get(value, value))
    titanic["Embarked"] = (
        titanic["Embarked"]
        .fillna("S")
        .map(lambda value: port_codes.get(value, value))
    )
    return titanic
# Clean and encode the training frame. harmonize_data mutates its argument,
# so the return value does not need to be captured.
harmonize_data(train)
print("ok")

# Feature columns fed to the classifier.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Accumulates (min_samples_leaf, n_estimators, accuracy) tuples.
results = []

# Candidate hyper-parameter grids. They are defined here but not consumed by
# the single fit below.
sample_leaf_options = list(range(1, 500, 3))
n_estimators_options = list(range(1, 1000, 5))

# Positional hold-out split: rows before `split_at` train the model and rows
# from `split_at` onward evaluate it. Both slices share the same boundary so
# that every row lands in exactly one partition (starting the evaluation
# slice at 601 would silently drop row 600).
split_at = 600
min_leaf = 50
n_trees = 5

groud_truth = train["Survived"][split_at:]
alg = RandomForestClassifier(
    min_samples_leaf=min_leaf,
    n_estimators=n_trees,
    random_state=50,
)
alg.fit(train[predictors][:split_at], train["Survived"][:split_at])
predict = alg.predict(train[predictors][split_at:])

# Accuracy is the fraction of held-out rows whose prediction matches the label.
# The hyper-parameters are recorded from the same variables used to build the
# model, so the logged values cannot drift from the fitted ones.
results.append((min_leaf, n_trees, (groud_truth == predict).mean()))
print(results)
# Render every decision tree of the fitted forest to its own PDF file.
Estimators = alg.estimators_
for index, model in enumerate(Estimators):
    filename = "iris_" + str(index) + ".pdf"
    # out_file=None makes export_graphviz return the DOT source as a string
    # instead of writing it to disk.
    dot_data = tree.export_graphviz(
        model,
        out_file=None,
        feature_names=predictors,
        class_names=["die", "live"],
        filled=True,
        rounded=True,
        special_characters=True,
    )
    graph = pydotplus.graph_from_dot_data(dot_data)
    # NOTE(review): the Image object is built and discarded, so nothing is
    # displayed from inside this loop; confirm whether an explicit display
    # call was intended before removing or changing this line.
    Image(graph.create_png())
    graph.write_pdf(filename)
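# Prerequisite: Graphviz must be installed, for example:
#   yum install graphviz
# The training set used here is described in the previous article.
# Source: http://blog.51cto.com/12597095/2160408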