Below, logistic regression is applied to the data to obtain the results.
import math
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score  # data splitting and cross-validation
from sklearn.linear_model import LogisticRegression
# import statsmodels.api as sm
# metrics for the confusion matrix, ROC/AUC and related scores
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# df_german = pd.read_excel("german_woe.xlsx")
df_german = pd.read_excel("dataset/german.xls")
# df_german = pd.read_excel("df_after_vif.xlsx")
y = df_german.iloc[:, -1]   # target label is the last column
x = df_german.iloc[:, :-1]  # all remaining columns are features
# x = df_german.loc[:, "Credit Amount":"Purpose"]
l1 = []
for i in range(1000):
    print('****' * 50)
    print('test run', i + 1)
    # X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, train_size=0.6, random_state=i + 1)
    x_test2, x_check, y_test2, y_check = train_test_split(x_test, y_test, train_size=0.25, random_state=i + 1)
    classifier = LogisticRegression(max_iter=1000)  # higher max_iter so the default solver converges
    classifier.fit(x_train, y_train)
    # predictions = classifier.predict(x_test)
    # evaluation
    print("accuracy on the training subset:{:.3f}".format(classifier.score(x_train, y_train)))
    print("accuracy on the check subset:{:.3f}".format(classifier.score(x_check, y_check)))
    l1.append(classifier.score(x_check, y_check))
print('max index', l1.index(max(l1)) + 1)
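The loop above only prints accuracies and records each run's check-set score, so the precision/recall/F1/AUC imports stay unused. The sketch below is not from the original post: it assumes the best run can be reproduced by reusing its index + 1 as random_state, and that the label convention is the one described in the commented section further down (0 = good customer, 1 = bad customer), then evaluates the refit model on the held-out check set.

# Illustrative sketch (assumptions noted above): refit on the best split and report check-set metrics.
best_state = l1.index(max(l1)) + 1  # random_state used by the best-scoring run
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, train_size=0.6, random_state=best_state)
x_test2, x_check, y_test2, y_check = train_test_split(x_test, y_test, train_size=0.25, random_state=best_state)
best_clf = LogisticRegression(max_iter=1000).fit(x_train, y_train)
y_pred = best_clf.predict(x_check)
y_score = best_clf.predict_proba(x_check)[:, 1]  # predicted probability of the "bad customer" class
print("accuracy :", accuracy_score(y_check, y_pred))
print("precision:", precision_score(y_check, y_pred))
print("recall   :", recall_score(y_check, y_pred))
print("f1       :", f1_score(y_check, y_pred))
print("AUC      :", roc_auc_score(y_check, y_score))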
# # score formula
# '''
# P0 = 50
# PDO = 10
# theta0 = 1.0/20
# B = PDO/np.log(2)
# A = P0 + B*np.log(theta0)
# '''
# def Score(probability):
#     # np.log is the natural logarithm (base e)
#     score = A - B*np.log(probability/(1-probability))
#     return score
# # compute scores for a list of probabilities
# def List_score(pos_probablity_list):
#     list_score = []
#     for probability in pos_probablity_list:
#         score = Score(probability)
#         list_score.append(score)
#     return list_score
# P0 = 50
# PDO = 10
# theta0 = 1.0/20
# B = PDO/np.log(2)
# A = P0 + B*np.log(theta0)
# print("A:",A)
# print("B:",B)
# list_coef = list(classifier.coef_[0])
# intercept= classifier.intercept_
# # predicted probabilities for every sample in x; class 0 = good customer, class 1 = bad customer
# probablity_list = classifier.predict_proba(x)
# # probability of being a bad customer for every sample
# pos_probablity_list = [i[1] for i in probablity_list]
# # scores for all customers
# list_score = List_score(pos_probablity_list)
# list_predict = classifier.predict(x)
# df_result = pd.DataFrame({"label":y, "predict":list_predict, "pos_probablity":pos_probablity_list, "score":list_score})
# # df_result.to_excel("score_proba.xlsx")
# # print the results
# # print(df_result)
# # list of variable names
# list_vNames = df_german.columns
# # drop the target name (the last column)
# list_vNames = list_vNames[0:-1]
# df_coef = pd.DataFrame({"variable_names":list_vNames, "coef":list_coef})
# # df_coef.to_excel("coef.xlsx")
# # print the coefficient of each variable
# # print(df_coef)
# y_true = y_test
# y_pred = classifier.predict(x_test)
# accuracyScore = accuracy_score(y_true, y_pred)
# print('model accuracy is:', accuracyScore)
# # precision = TP/(TP+FP), true positives / (true positives + false positives)
# precision = precision_score(y_true, y_pred)
# print('model precision is:', precision)
# # recall (sensitivity) = TP/(TP+FN)
# sensitivity = recall_score(y_true, y_pred)
# print('model sensitivity is:', sensitivity)
# # F1 = 2 * (precision * recall) / (precision + recall)
# # The F1 score combines precision and recall into a single number (their harmonic mean); its best value is 1 and its worst value is 0.
# f1Score = f1_score(y_true, y_pred)
# print("f1_score:", f1Score)
# def AUC(y_true, y_scores):
#     auc_value = 0
#     # a second way to get AUC: compute fpr and tpr, then call auc(fpr, tpr)
#     fpr, tpr, thresholds = metrics.roc_curve(y_true, y_scores, pos_label=1)
#     auc_value = auc(fpr, tpr)  # compute the AUC value
#     # print("fpr:", fpr)
#     # print("tpr:", tpr)
#     # print("thresholds:", thresholds)
#     if auc_value < 0.5:
#         auc_value = 1 - auc_value
#     return auc_value
# def Draw_roc(auc_value):
#     fpr, tpr, thresholds = metrics.roc_curve(y, list_score, pos_label=0)
#     # plot the diagonal reference line
#     plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Diagonal line')
#     plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc_value)
#     plt.title('ROC curve')
#     plt.legend(loc="lower right")
# # rate the AUC performance
# def AUC_performance(AUC):
#     if AUC >= 0.7:
#         print("good classifier")
#     if 0.7 > AUC > 0.6:
#         print("not very good classifier")
#     if 0.6 >= AUC > 0.5:
#         print("useless classifier")
#     if 0.5 >= AUC:
#         print("bad classifier, with ranking problems")
# # AUC check (here computed on the scores for the full dataset)
# auc_value = AUC(y, list_score)
# print("AUC:", auc_value)
# # rate the AUC performance
# AUC_performance(auc_value)
# # draw the ROC curve
# Draw_roc(auc_value)
# # print(predictions)
# # plt.show()
The scoring and evaluation part above is omitted for now.
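The commented-out section maps the predicted probability of being a bad customer to a score via score = A - B*ln(p/(1-p)), with B = PDO/ln(2) and A = P0 + B*ln(theta0). As a quick sanity check of that scaling (a minimal standalone sketch, not part of the original post), the snippet below confirms that odds of theta0 = 1/20 score exactly P0 = 50 points and that halving the odds adds PDO = 10 points.

# Standalone check of the scorecard scaling (same constants as the commented code above).
import numpy as np

P0, PDO, theta0 = 50, 10, 1.0 / 20
B = PDO / np.log(2)            # ~14.43 points per doubling/halving of the odds
A = P0 + B * np.log(theta0)    # offset so that odds of 1:20 score exactly P0

def score_from_odds(odds):
    # odds = p/(1-p), the ratio of bad to good
    return A - B * np.log(odds)

print(round(score_from_odds(theta0), 2))      # 50.0 -> base odds of 1:20 give the base score P0
print(round(score_from_odds(theta0 / 2), 2))  # 60.0 -> halving the odds adds PDO points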
Original post (in Chinese): https://www.cnblogs.com/LieDra/p/12018588.html