import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# LogisticRegression is imported from the public sklearn.linear_model package
# rather than the private sklearn.linear_model.logistic module path.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
# Load the SMS dataset; the script reads its 'message' and 'label' columns.
df = pd.read_csv('./sms.csv')

# Hold out a test split; the fixed seed keeps the split reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df['message'], df['label'], random_state=11
)

# Fit the TF-IDF vocabulary on the training messages only, then apply the
# same fitted transform to the held-out messages.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# 5-fold cross-validation on the training split. No `scoring` argument is
# passed, so the estimator's own default score is reported per fold.
scores = cross_val_score(classifier, X_train, y_train, cv=5)
print('Accuracies: %s' % scores)
print('Mean accuracy: %s' % np.mean(scores))
Accuracies: [ 0.95221027 0.95454545 0.96172249 0.96052632 0.95209581]
Mean accuracy: 0.956220068309
# Repeat the 5-fold cross-validation on the training split once per metric
# and print the mean of each across folds.
# NOTE(review): the 'precision' / 'recall' / 'f1' scorers presumably require
# binary labels with a positive class of 1 -- confirm how the 'label' column
# of sms.csv is encoded.
precisions = cross_val_score(
    classifier, X_train, y_train, cv=5, scoring='precision'
)
print('Precision: %s' % np.mean(precisions))

recalls = cross_val_score(
    classifier, X_train, y_train, cv=5, scoring='recall'
)
print('Recall: %s' % np.mean(recalls))

f1s = cross_val_score(classifier, X_train, y_train, cv=5, scoring='f1')
print('F1 score: %s' % np.mean(f1s))
Precision: 0.992542742398
Recall: 0.683605030275
F1 score: 0.809067846627

F1 是精确率和召回率的调和平均值。如果精确率为 1、召回率为 0,那么 F1 为 0。此外还有 F0.5 和 F2 两种度量,分别偏重精确率和召回率。在一些场景下,召回率比精确率更重要。

常用分类器的对比
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Synthetic classification data: 5000 samples with 100 features, of which
# 20 are informative. The fixed seeds keep the data and split reproducible.
X, y = make_classification(
    n_samples=5000,
    n_features=100,
    n_informative=20,
    n_clusters_per_class=2,
    random_state=11,
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)

# (printed heading, unfitted estimator) pairs, evaluated in this order.
candidates = [
    ('决策树', DecisionTreeClassifier(random_state=11)),
    ('随机森林', RandomForestClassifier(n_estimators=10, random_state=11)),
    ('逻辑回归', LogisticRegression()),
    ('AdaBoost', AdaBoostClassifier(n_estimators=50, random_state=11)),
    ('KNN近邻', KNeighborsClassifier(n_neighbors=3)),
    ('SVM支持向量机', SVC(kernel='rbf', C=100, gamma=0.1)),
]

for title, clf in candidates:
    print(title)
    # Every model, the SVM included, is fitted on the training split only.
    # Fitting on the full X, y would let the model see the test samples and
    # make the report below meaningless.
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(classification_report(y_test, predictions))
结果
决策树
              precision    recall  f1-score   support

           0       0.80      0.76      0.78       634
           1       0.76      0.80      0.78       616

    accuracy                           0.78      1250
   macro avg       0.78      0.78      0.78      1250
weighted avg       0.78      0.78      0.78      1250
随机森林
              precision    recall  f1-score   support

           0       0.79      0.86      0.82       634
           1       0.84      0.76      0.80       616

    accuracy                           0.81      1250
   macro avg       0.82      0.81      0.81      1250
weighted avg       0.82      0.81      0.81      1250
逻辑回归
              precision    recall  f1-score   support

           0       0.82      0.85      0.84       634
           1       0.84      0.81      0.83       616

    accuracy                           0.83      1250
   macro avg       0.83      0.83      0.83      1250
weighted avg       0.83      0.83      0.83      1250
AdaBoost
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       634
           1       0.84      0.82      0.83       616

    accuracy                           0.83      1250
   macro avg       0.83      0.83      0.83      1250
weighted avg       0.83      0.83      0.83      1250
KNN近邻
              precision    recall  f1-score   support

           0       0.93      0.93      0.93       634
           1       0.93      0.93      0.93       616

    accuracy                           0.93      1250
   macro avg       0.93      0.93      0.93      1250
weighted avg       0.93      0.93      0.93      1250
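SVM支持向量机
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       634
           1       1.00      1.00      1.00       616

    accuracy                           1.00      1250
   macro avg       1.00      1.00      1.00      1250
weighted avg       1.00      1.00      1.00      1250

注:各项指标均为 1.00 并不代表真实的泛化能力。这样的结果通常说明训练数据中包含了测试样本(例如在全部 X、y 上拟合),属于数据泄漏;应以仅用 X_train、y_train 拟合后的结果为准。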