import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_curve, auc, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english')),
    # liblinear supports both the 'l1' and 'l2' penalties searched below;
    # the default lbfgs solver would fail on 'l1'
    ('clf', LogisticRegression(solver='liblinear'))
])
parameters = {
    'vect__max_df': (0.25, 0.5, 0.75),
    'vect__stop_words': ('english', None),
    'vect__max_features': (2500, 5000, None),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__use_idf': (True, False),
    'clf__penalty': ('l1', 'l2'),
    'clf__C': (0.01, 0.1, 1, 10),
}
df = pd.read_csv('./sms.csv')
X = df['message']
y = df['label']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1,
                           scoring='accuracy', cv=3)
grid_search.fit(X_train, y_train)
print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))
predictions = grid_search.predict(X_test)
print('Accuracy: %s' % accuracy_score(y_test, predictions))
print('Precision: %s' % precision_score(y_test, predictions))
print('Recall: %s' % recall_score(y_test, predictions))
df = pd.read_csv('./sms.csv')
# Encode the string labels as 0/1 so the precision and recall scorers below
# have a numeric positive class ('spam' sorts after 'ham', so it encodes to 1)
y = LabelEncoder().fit_transform(df['label'])
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df['message'], y,
                                                            random_state=11)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
scores = cross_val_score(classifier, X_train, y_train, cv=5)
print('Accuracies: %s' % scores)
print('Mean accuracy: %s' % np.mean(scores))
precisions = cross_val_score(classifier, X_train, y_train, cv=5, scoring='precision')
print('Precision: %s' % np.mean(precisions))
recalls = cross_val_score(classifier, X_train, y_train, cv=5, scoring='recall')
print('Recall: %s' % np.mean(recalls))
f1s = cross_val_score(classifier, X_train, y_train, cv=5, scoring='f1')
print('F1 score: %s' % np.mean(f1s))
After tuning:
Best score: 0.983
Best parameters set:
	clf__C: 10
	clf__penalty: 'l2'
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_range: (1, 2)
	vect__stop_words: None
	vect__use_idf: True
Accuracy: 0.9863701578192252
Precision: 0.994535519125683
Recall: 0.91
Before tuning:
Accuracies: [0.95221027 0.95454545 0.96172249 0.96052632 0.95209581]
Mean accuracy: 0.9562200683094717
Precision: 0.992542742398164
Recall: 0.6836050302748021
F1 score: 0.8090678466269784
We can see that grid search dramatically improved recall (from roughly 0.68 to 0.91), which is a big improvement to the model. GridSearchCV is really just brute-force search: it trains and evaluates one model for every combination in the parameter grid. That works well on small datasets, but it does not scale to large ones.
On a large dataset it can easily run out of memory. Try the GridSearchCV + SVM code below and see whether it overflows.
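Before running that, here is a quick sketch (not in the original post) of just how brute-force the SMS search above is. scikit-learn's ParameterGrid can enumerate the grid; this assumes the parameters dict from the SMS example is still in scope.

from sklearn.model_selection import ParameterGrid

# 3 max_df values * 2 stop_words * 3 max_features * 2 ngram_ranges
# * 2 use_idf flags * 2 penalties * 4 C values = 576 candidates
print(len(ParameterGrid(parameters)))  # 576
# With cv=3, GridSearchCV trains 576 * 3 = 1728 models in total

Every one of those 1728 fits vectorizes and trains on the full training split, which is exactly why the cost explodes on bigger datasets.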
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.datasets import fetch_openml
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# fetch_mldata was removed from scikit-learn; fetch_openml replaces it.
# as_frame=False returns the pixels as a NumPy array so they can be reshaped below.
mnist = fetch_openml('mnist_784', as_frame=False)
# print(mnist.data.shape)  # (70000, 784)
counter = 1
for i in range(1, 4):
    for j in range(1, 6):
        plt.subplot(3, 5, counter)
        plt.imshow(mnist.data[(i - 1) * 8000 + j * 200].reshape((28, 28)),
                   cmap=cm.Greys_r)
        plt.axis('off')
        counter += 1
plt.show()
# Guard the parallel grid search so worker processes spawned by n_jobs=2
# do not re-execute the module-level code
if __name__ == '__main__':
    X, y = mnist.data, mnist.target
    # Scale pixel intensities from [0, 255] to [-1, 1]
    X = X / 255.0 * 2 - 1
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)
    pipeline = Pipeline([
        ('clf', SVC(kernel='rbf', gamma=0.01, C=100))
    ])
    parameters = {
        'clf__gamma': (0.01, 0.03),
        'clf__C': (0.1, 0.3),
    }
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, verbose=1,
                               scoring='accuracy')
    # Fit on a 10,000-sample subset to keep the search tractable
    grid_search.fit(X_train[:10000], y_train[:10000])
    print('Best score: %0.3f' % grid_search.best_score_)
    print('Best parameters set:')
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))
    predictions = grid_search.predict(X_test)
    print(classification_report(y_test, predictions))
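If the exhaustive search above does exhaust your memory, one common workaround (my suggestion, not part of the original post) is RandomizedSearchCV, which fits only a fixed number of randomly sampled parameter combinations. A minimal sketch, assuming the pipeline, parameters, X_train, and y_train from the MNIST example are in scope:

from sklearn.model_selection import RandomizedSearchCV

# n_iter caps the number of sampled candidates no matter how large the grid is;
# here 3 of the 4 possible combinations are tried
random_search = RandomizedSearchCV(pipeline, parameters, n_iter=3, n_jobs=2,
                                   verbose=1, scoring='accuracy', random_state=11)
random_search.fit(X_train[:10000], y_train[:10000])
print('Best score: %0.3f' % random_search.best_score_)

Because time and memory now scale with n_iter rather than with the size of the grid, this stays usable even when the grid (or the dataset) grows.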