import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Public import path; the private ``sklearn.linear_model.logistic`` module
# was deprecated and is not importable on current scikit-learn releases.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import cross_val_score, train_test_split

# Load the SMS data set; the script reads its 'message' and 'label' columns.
df = pd.read_csv('./sms.csv')

# Split messages/labels into train and test parts. random_state is fixed so
# the split is reproducible between runs.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df['message'], df['label'], random_state=11
)

# Fit the TF-IDF vectorizer on the training messages only, then reuse the
# fitted vectorizer to transform the test messages.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

# Train a logistic-regression classifier on the TF-IDF features.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# 5-fold cross-validation on the training set with the default scorer.
# NOTE(review): the vectorizer was fitted on the full training set before
# cross-validation, so every fold shares the same vocabulary/IDF weights.
scores = cross_val_score(classifier, X_train, y_train, cv=5)
print('Accuracies: %s' % scores)
print('Mean accuracy: %s' % np.mean(scores))
Accuracies: [ 0.95221027 0.95454545 0.96172249 0.96052632 0.95209581] Mean accuracy: 0.956220068309
# Cross-validated precision, recall and F1 for the same classifier and
# training data, each averaged over 5 folds.
# NOTE(review): the 'precision'/'recall'/'f1' scorers presumably require the
# 'label' column to be binary with the positive class encoded as 1 - confirm
# against the CSV.
precisions = cross_val_score(classifier, X_train, y_train, cv=5, scoring='precision')
print('Precision: %s' % np.mean(precisions))

recalls = cross_val_score(classifier, X_train, y_train, cv=5, scoring='recall')
print('Recall: %s' % np.mean(recalls))

f1s = cross_val_score(classifier, X_train, y_train, cv=5, scoring='f1')
print('F1 score: %s' % np.mean(f1s))
Precision: 0.992542742398 Recall: 0.683605030275
F1 score: 0.809067846627
F1 是精确率(P)和召回率(R)的调和平均值:F1 = 2PR / (P + R)。如果精确率为 1、召回率为 0,那么 F1 为 0。此外还有 F0.5 和 F2 两种度量,分别偏重精确率和召回率。在一些场景下,召回率比精确率更重要。
分类的性能评估:准确率、精确率、Recall召回率、F1、F2
原文:https://www.cnblogs.com/starcrm/p/11718687.html