My recent work has involved sklearn, mostly for classification, so here are a few notes.
Classification roughly breaks down into three topics:
I. Classifiers
II. Feature selection
III. Model selection
I. Classifiers (Classification)
Example 1: plot_classifier_comparison.py
# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

h = .02  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes",
         "Linear Discriminant Analysis", "Quadratic Discriminant Analysis"]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()]

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
            ]

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds in datasets:
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

figure.subplots_adjust(left=.02, right=.98)
plt.show()
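The plotting scaffolding above can hide the basic pattern. As a minimal sketch of just the fit/score loop (my addition, not part of the original example; it reuses two of the classifiers listed above on a single make_moons split):

# Minimal sketch: train and score two of the classifiers above on one dataset,
# without any plotting.
from sklearn.cross_validation import train_test_split
from sklearn.datasets import make_moons
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

X, y = make_moons(noise=0.3, random_state=0)
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

for name, clf in [("Nearest Neighbors", KNeighborsClassifier(3)),
                  ("RBF SVM", SVC(gamma=2, C=1))]:
    clf.fit(X_train, y_train)               # train on the training split
    print(name, clf.score(X_test, y_test))  # mean accuracy on the held-out split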
II. Feature Selection
This mainly involves one module: >>> sklearn.feature_selection
Example 1: feature_selection_pipeline.py
from sklearn.datasets import samples_generator
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn import svm
from sklearn.pipeline import make_pipeline

# generate data
X, y = samples_generator.make_classification(
    n_features=20, n_informative=3, n_redundant=0, n_classes=4,
    n_clusters_per_class=2)

# two steps:
# 1) ANOVA filter, keep the 3 best features
anova_filter = SelectKBest(f_regression, k=3)
# 2) SVM classification
clf = svm.SVC(kernel='linear')

# combine them into a single classifier
anova_svm = make_pipeline(anova_filter, clf)
anova_svm.fit(X, y)
anova_svm.predict(X)
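It can also be useful to check which of the 20 features the filter actually kept. A small follow-up sketch (my addition, not part of the original example), relying on the fact that make_pipeline names each step after its lowercased class name:

# Which features did SelectKBest keep?  get_support() returns a boolean mask;
# 'selectkbest' is the step name make_pipeline assigns automatically.
mask = anova_svm.named_steps['selectkbest'].get_support()
print("selected feature indices:", [i for i, kept in enumerate(mask) if kept])
print("training accuracy:", anova_svm.score(X, y))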
Example 2: plot_rbm_logistic_classification.py
# Authors: Yann N. Dauphin, Vlad Niculae, Gabriel Synnaeve
# License: BSD

import numpy as np
import matplotlib.pyplot as plt

from scipy.ndimage import convolve
from sklearn import linear_model, datasets, metrics
from sklearn.cross_validation import train_test_split
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline


###############################################################################
# Setting up

def nudge_dataset(X, Y):
    """
    This produces a dataset 5 times bigger than the original one,
    by moving the 8x8 images in X around by 1px to left, right, down, up
    """
    direction_vectors = [
        [[0, 1, 0],
         [0, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [1, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 1],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 0],
         [0, 1, 0]]]

    shift = lambda x, w: convolve(x.reshape((8, 8)), mode='constant',
                                  weights=w).ravel()
    X = np.concatenate([X] +
                       [np.apply_along_axis(shift, 1, X, vector)
                        for vector in direction_vectors])
    Y = np.concatenate([Y for _ in range(5)], axis=0)
    return X, Y

# Load Data
digits = datasets.load_digits()
X = np.asarray(digits.data, 'float32')
X, Y = nudge_dataset(X, digits.target)
X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001)  # 0-1 scaling

X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.2,
                                                    random_state=0)

# Models we will use
logistic = linear_model.LogisticRegression()
rbm = BernoulliRBM(random_state=0, verbose=True)

classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

###############################################################################
# Training

# Hyper-parameters. These were set by cross-validation,
# using a GridSearchCV. Here we are not performing cross-validation to
# save time.
rbm.learning_rate = 0.06
rbm.n_iter = 20
# More components tend to give better prediction performance, but larger
# fitting time
rbm.n_components = 100
logistic.C = 6000.0

# Training RBM-Logistic Pipeline
classifier.fit(X_train, Y_train)

# Training Logistic regression
logistic_classifier = linear_model.LogisticRegression(C=100.0)
logistic_classifier.fit(X_train, Y_train)

###############################################################################
# Evaluation

print()
print("Logistic regression using RBM features:\n%s\n" % (
    metrics.classification_report(
        Y_test,
        classifier.predict(X_test))))

print("Logistic regression using raw pixel features:\n%s\n" % (
    metrics.classification_report(
        Y_test,
        logistic_classifier.predict(X_test))))

###############################################################################
# Plotting

plt.figure(figsize=(4.2, 4))
for i, comp in enumerate(rbm.components_):
    plt.subplot(10, 10, i + 1)
    plt.imshow(comp.reshape((8, 8)), cmap=plt.cm.gray_r,
               interpolation='nearest')
    plt.xticks(())
    plt.yticks(())
plt.suptitle('100 components extracted by RBM', fontsize=16)
plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)

plt.show()
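The comment in the training section says the hyper-parameters were originally chosen with GridSearchCV. A rough sketch of what such a search over the pipeline could look like (my own illustration: the grid values are arbitrary, not the grid actually used by the authors, and the search is slow to run):

# Rough sketch: tuning the RBM-logistic pipeline with GridSearchCV.
# Parameter names follow Pipeline's '<step>__<param>' convention;
# the candidate values here are illustrative only.
from sklearn.grid_search import GridSearchCV

param_grid = {
    'rbm__learning_rate': [0.01, 0.06],
    'rbm__n_components': [50, 100],
    'logistic__C': [1000.0, 6000.0],
}
search = GridSearchCV(classifier, param_grid, verbose=1)
search.fit(X_train, Y_train)
print("best parameters:", search.best_params_)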
III. Model Selection
This mainly involves the following two modules:
>>> sklearn.grid_search
>>> sklearn.cross_validation
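The two examples below focus on sklearn.grid_search; for sklearn.cross_validation the simplest entry point is cross_val_score. A minimal sketch (my addition, not one of the examples), scoring a random forest on the digits data with 5-fold cross-validation:

# Minimal sketch of sklearn.cross_validation: 5-fold cross-validated accuracy.
from sklearn.cross_validation import cross_val_score
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

digits = load_digits()
scores = cross_val_score(RandomForestClassifier(n_estimators=20),
                         digits.data, digits.target, cv=5)
print("accuracy per fold:", scores)
print("mean accuracy: %.3f (+/- %.3f)" % (scores.mean(), scores.std()))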
Example 1: randomized_search.py
import numpy as np

from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# get some data
digits = load_digits()
X, y = digits.data, digits.target

# build a classifier
clf = RandomForestClassifier(n_estimators=20)


# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")


# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

# use a full grid over all parameters
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.grid_scores_)))
report(grid_search.grid_scores_)
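Besides the report() helper above, both fitted searchers also expose the best configuration directly. A short follow-up (my addition):

# Read the winning configuration straight off the fitted search objects.
print("RandomizedSearchCV best parameters:", random_search.best_params_)
print("GridSearchCV best parameters:", grid_search.best_params_)
print("GridSearchCV best score: %.3f" % grid_search.best_score_)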
Example 2: grid_search_text_feature_extraction.py
from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')


###############################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]
# Uncomment the following to do the analysis on all the categories
#categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

data = fetch_20newsgroups(subset='train', categories=categories)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

###############################################################################
# define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(data.data, data.target)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
Reposted from: https://www.cnblogs.com/hhh5460/p/5296847.html