import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

# Training features: [height in cm, binary gender flag]; target: weight in kg
X_train = np.array([
    [158, 1],
    [170, 1],
    [183, 1],
    [191, 1],
    [155, 0],
    [163, 0],
    [180, 0],
    [158, 0],
    [170, 0]
])

# Variant 1: standardized features (zero mean, unit variance per column)
ss = StandardScaler()
X_trainss = ss.fit_transform(X_train)

# Variant 2: log-transformed features (+1 keeps log finite for the 0 gender flag)
X_train_log = np.log(X_train + 1)

y_train = [64, 86, 84, 80, 49, 59, 67, 54, 67]

X_test = np.array([
    [160, 1],
    [196, 1],
    [168, 0],
    [177, 0]
])
X_testss = ss.transform(X_test)   # reuse the training statistics, no refit
X_test_log = np.log(X_test + 1)
y_test = [66, 87, 68, 74]

K = 5
clf = KNeighborsRegressor(n_neighbors=K)
clf.fit(X_train, y_train)         # KNN on the raw features

clf1 = KNeighborsRegressor(n_neighbors=K)
clf1.fit(X_trainss, y_train)      # KNN on the standardized features

clf2 = Ridge().fit(X_train_log, y_train)  # Ridge regression on the log features

predictions = clf.predict(X_test)
predictions1 = clf1.predict(X_testss)
predictions2 = clf2.predict(X_test_log)

print('Actual weights: %s' % y_test)
print('Predicted weights: %s' % predictions)
print('Predicted weights by StandardScaler: %s' % predictions1)
print('Predicted weights by Log: %s' % predictions2)
print('mean_squared_error: %s' % mean_squared_error(y_test, predictions))
print('mean_squared_error by StandardScaler: %s' % mean_squared_error(y_test, predictions1))
print('mean_squared_error by Log: %s' % mean_squared_error(y_test, predictions2))
print(r2_score(y_test, predictions))
print(r2_score(y_test, predictions1))
print(r2_score(y_test, predictions2))
The output is:

Actual weights: [66, 87, 68, 74]
Predicted weights: [62.4 76.8 66. 72.6]
Predicted weights by StandardScaler: [69.4 76.8 59.2 59.2]
Predicted weights by Log: [72.98731557 73.88528401 63.37281696 63.60369452]
mean_squared_error: 30.740000000000023
mean_squared_error by StandardScaler: 103.02
mean_squared_error by Log: 87.57808624078896
0.5424744186046508
-0.5333209302325581
-0.30348779521174274

Process finished with exit code 0
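A side note on why standardization changes the KNN predictions at all (this snippet is my own illustration, not part of the original code): StandardScaler rescales the height and gender columns to comparable magnitudes, which changes the Euclidean distances and therefore which 5 neighbors each test point gets. The chosen neighbor indices can be inspected directly, reusing the models fitted above:

# Neighbor indices for the first test point, raw vs. standardized features
print(clf.kneighbors(X_test[:1], return_distance=False))
print(clf1.kneighbors(X_testss[:1], return_distance=False))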
We can see that after the features are standardized or log-transformed, the prediction mean squared error actually gets larger (the last three numbers are the R² scores of the three models; both transformed variants even drop below zero). This example extends the last example in Chapter 3 of 《scikit-learn机器学习》, and that example in the book still feels flawed for the same reason: the sample is simply too small.
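With 9 training and 4 test points, a single train/test split says very little about which preprocessing is better. As a minimal sketch of a steadier comparison (my own addition, not from the book: it pools the train and test samples, and the names X_all / y_all are invented here), leave-one-out cross-validation predicts each of the 13 points from the other 12:

from sklearn.model_selection import LeaveOneOut, cross_val_predict
from sklearn.pipeline import make_pipeline

# Pool all 13 samples so each one is predicted from the remaining 12
X_all = np.vstack([X_train, X_test])
y_all = np.array(y_train + y_test)

for name, model in [
        ('raw', KNeighborsRegressor(n_neighbors=K)),
        ('scaled', make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=K)))]:
    pred = cross_val_predict(model, X_all, y_all, cv=LeaveOneOut())
    print(name, mean_squared_error(y_all, pred))

Wrapping the scaler in a pipeline re-fits it inside each fold, so the standardized variant is scored without leaking statistics from the held-out point.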