notebook来自: https://www.kaggle.com/neviadomski/how-to-get-to-top-25-with-simple-model-sklearn
1.导入数据,查看数据结构和缺失值情况。重点在于查看缺失值情况的写法:NAs = pd.concat([train.isnull().sum(), test.isnull().sum()], axis = 1, keys = ['train', 'test']) NAs[NAs.sum(axis=1) > 0] 2.数据预处理(删除无用特征,特征转化,缺失值填充,构造新特征,特征值标准化,转化为dummy)。Q:什么样的特征需要做转化?A:如某些整型数据只表示类别,其数值本身没有意义,则应转化为dummy。重点学习手动将特征转化为dummy的方法(这里情况稍微还要复杂一点,因为存在同一特征对应两列的情况,如Condition1,Condition2)。3.随机打乱数据,分离训练集和测试集。4.构建多个单一模型。5.模型融合。
1.如何判断一个特征是否是无用特征?
2.模型融合的方法?这里为什么是np.exp(GB_model.predict(test_features)) + np.exp(ENS_model.predict(test_features_std))?
3.为什么label分布偏斜需要做转化?
In [33]: #Kaggle: House Prices: Advanced Regression Techniques import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from sklearn import ensemble, linear_model, tree from sklearn.model_selection import train_test_split, cross_val_score from sklearn.metrics import mean_squared_error, r2_score from sklearn.utils import shuffle %matplotlib inline import warnings warnings.filterwarnings('ignore') train = pd.read_csv('downloads/train.csv') test = pd.read_csv('downloads/test.csv') In [8]: train.head() Out[8]: IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice0160RL65.08450PaveNaNRegLvlAllPub...0NaNNaNNaN022008WDNormal2085001220RL80.09600PaveNaNRegLvlAllPub...0NaNNaNNaN052007WDNormal1815002360RL68.011250PaveNaNIR1LvlAllPub...0NaNNaNNaN092008WDNormal2235003470RL60.09550PaveNaNIR1LvlAllPub...0NaNNaNNaN022006WDAbnorml1400004560RL84.014260PaveNaNIR1LvlAllPub...0NaNNaNNaN0122008WDNormal2500005 rows × 81 columns
In [9]: #检查缺失值 NAs = pd.concat([train.isnull().sum(), test.isnull().sum()], axis = 1, keys = ['train', 'test']) #sum()默认的axis=0,即跨行 NAs[NAs.sum(axis=1) > 0] #只显示有缺失值的特征 Out[9]: traintestAlley13691352.0BsmtCond3745.0BsmtExposure3844.0BsmtFinSF101.0BsmtFinSF201.0BsmtFinType13742.0BsmtFinType23842.0BsmtFullBath02.0BsmtHalfBath02.0BsmtQual3744.0BsmtUnfSF01.0Electrical10.0Exterior1st01.0Exterior2nd01.0Fence11791169.0FireplaceQu690730.0Functional02.0GarageArea01.0GarageCars01.0GarageCond8178.0GarageFinish8178.0GarageQual8178.0GarageType8176.0GarageYrBlt8178.0KitchenQual01.0LotFrontage259227.0MSZoning04.0MasVnrArea815.0MasVnrType816.0MiscFeature14061408.0PoolQC14531456.0SaleType01.0TotalBsmtSF01.0Utilities02.0 In [10]: #打印R2和RMSE得分 def print_score (prediction, labels): print('R2: {}'.format(r2_score(prediction, labels))) print('RMSE: {}'.format(np.sqrt(mean_squared_error(prediction, labels)))) #对给定的模型进行评估,分别打印训练集上的得分和测试集上的得分 def train_test_score(estimator, x_train, x_test, y_train, y_test): train_predictions = estimator.predict(x_train) print('------------train-----------') print_score(train_predictions, y_train) print('------------test------------') test_predictions = estimator.predict(x_test) print_score(test_predictions, y_test) In [11]: #将标签从训练集中分离出来 train_label = train.pop('SalePrice') #将训练集特征和测试集特征拼在一起,便于一起删除无用的特征 features = pd.concat([train, test], keys = ['train', 'test']) #删除无用特征(为什么说它们是无用特征并没有解释) features.drop(['Utilities', 'RoofMatl', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'Heating', 'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'Functional', 'GarageYrBlt', 'GarageArea', 'GarageCond', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal'], axis=1, inplace=True) print(features.shape) (2919, 56) In [12]: #将series数据转化为str #问题:什么样的数据需要转化为str #答:将原来的某些整型数据转化为str,这些整型数据数字大小本身并没有含义,而只是代表一个类,所以转化为str后,后续再转化为dummy features['MSSubClass'] = features['MSSubClass'].astype(str) 
#pandas调用特征的两种方法:.feature和['feature'],两者效果相同,下面就是.feature方法 features.OverallCond = features.OverallCond.astype(str) features['KitchenAbvGr'] = features['KitchenAbvGr'].astype(str) features['YrSold'] = features['YrSold'].astype(str) features['MoSold'] = features['MoSold'].astype(str) #用众数填充缺失值 features['MSZoning'] = features['MSZoning'].fillna(features['MSZoning'].mode()[0]) features['MasVnrType'] = features['MasVnrType'].fillna(features['MasVnrType'].mode()[0]) features['Electrical'] = features['Electrical'].fillna(features['Electrical'].mode()[0]) features['KitchenQual'] = features['KitchenQual'].fillna(features['KitchenQual'].mode()[0]) features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0]) #用某个特定值填充缺失值 features['LotFrontage'] = features['LotFrontage'].fillna(features['LotFrontage'].mean()) features['Alley'] = features['Alley'].fillna('NOACCESS') for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'): features[col] = features[col].fillna('NoBSMT') features['TotalBsmtSF'] = features['TotalBsmtSF'].fillna(0) features['FireplaceQu'] = features['FireplaceQu'].fillna('NoFP') for col in ('GarageType', 'GarageFinish', 'GarageQual'): features[col] = features[col].fillna('NoGRG') features['GarageCars'] = features['GarageCars'].fillna(0.0) #构造新特征 features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF'] features.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True) print(features.shape) (2919, 54) In [13]: #查看房价分布情况 ax = sns.distplot(train_label) In [14]: #发现图像整体向左倾斜,所以做log转变 train_label = np.log(train_label) ax = sns.distplot(train_label) In [15]: #对数字特征做标准化处理 num_features = features.loc[:,['LotFrontage', 'LotArea', 'GrLivArea', 'TotalSF']] num_features_standarized = (num_features - num_features.mean()) / num_features.std() num_features_standarized.head() Out[15]: 
LotFrontageLotAreaGrLivAreaTotalSFtrain0-0.202033-0.2178410.4134760.02299910.501785-0.072032-0.471810-0.0291672-0.0612690.1371730.5636590.1968863-0.436639-0.0783710.427309-0.09251140.6894690.5188141.3778060.988072 In [16]: ax = sns.pairplot(num_features_standarized) In [17]: #重点 #convert categorical data to dummies #将所有condition不重复的记录在一个set中 conditions = set([x for x in features['Condition1']] + [x for x in features['Condition2']]) #自定义dummy变量,行数为阳历数,列数为原condition数据转化为dummy后的维数 dummies = pd.DataFrame(data = np.zeros((len(features.index), len(conditions))), index = features.index, columns = conditions) #遍历所有样例,将原来的condition信息转化为对应的dummy信息 for i, cond in enumerate(zip(features['Condition1'], features['Condition2'])): #用ix找到位置,注意cond可能包含Condition1和Condition2两个位置的信息,对应dummies数组的两个点,所以需要用ix而不能简单的直接用dummies[i,cond] dummies.ix[i, cond] = 1 #将dummy后的特征数据拼接到原features后面,并给dummy特征的index增加前缀 features = pd.concat([features, dummies.add_prefix('Cond_')], axis = 1) #最后就可以删除原来的Condition特征 features.drop(['Condition1', 'Condition2'], axis = 1, inplace =True) print(features.shape) (2919, 61) In [18]: features.head() Out[18]: IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourLotConfig...TotalSFCond_PosACond_ArteryCond_PosNCond_RRAnCond_RRAeCond_FeedrCond_NormCond_RRNnCond_RRNetrain0160RL65.08450PaveNOACCESSRegLvlInside...2566.00.00.00.00.00.00.01.00.00.01220RL80.09600PaveNOACCESSRegLvlFR2...2524.00.00.00.00.00.01.01.00.00.02360RL68.011250PaveNOACCESSIR1LvlInside...2706.00.00.00.00.00.00.01.00.00.03470RL60.09550PaveNOACCESSIR1LvlCorner...2473.00.00.00.00.00.00.01.00.00.04560RL84.014260PaveNOACCESSIR1LvlFR2...3343.00.00.00.00.00.00.01.00.00.05 rows × 61 columns
In [19]: #convert Exterior to dummies Exterior = set([x for x in features['Exterior1st']] + [x for x in features['Exterior2nd']]) dummies = pd.DataFrame(data = np.zeros([len(features.index), len(Exterior)]), index = features.index, columns = Exterior) for i, ext in enumerate(zip(features['Exterior1st'], features['Exterior2nd'])): dummies.ix[i, ext] = 1 features = pd.concat([features, dummies.add_prefix('Ext_')], axis = 1) features.drop(['Exterior1st', 'Exterior2nd', 'Ext_nan'], axis = 1, inplace = True) print(features.shape) (2919, 78) In [20]: features.dtypes[features.dtypes == 'object'].index Out[20]: Index(['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'BldgType', 'HouseStyle', 'OverallCond', 'RoofStyle', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenAbvGr', 'KitchenQual', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'PavedDrive', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition'], dtype='object') In [21]: #遍历特定类型数据的方法:for col in features.dtypes[features.dtypes == 'object'].index #convert all other categorical vars to dummies for col in features.dtypes[features.dtypes == 'object'].index: for_dummy = features.pop(col) features = pd.concat([features, pd.get_dummies(for_dummy, prefix = col)], axis = 1) print(features.shape) (2919, 263) In [22]: #用之前几个标准化的数据更新features features_standardized = features.copy() features_standardized.update(num_features_standarized) In [23]: #重新分离训练集和测试集 #首先分离没有标准化的features train_features = features.loc['train'].drop(['Id'], axis=1).select_dtypes(include=[np.number]).values test_features = features.loc['test'].drop(['Id'], axis=1).select_dtypes(include=[np.number]).values #再分离标准化的数据 train_features_std = features_standardized.loc['train'].drop(['Id'], axis=1).select_dtypes(include=[np.number]).values test_features_std = 
features_standardized.loc['test'].drop(['Id'], axis=1).select_dtypes(include=[np.number]).values print(train_features.shape) print(train_features_std.shape) (1460, 262) (1460, 262) In [24]: #shuffle train dataset train_features_std, train_features, train_label = shuffle(train_features_std, train_features, train_label, random_state = 5) In [25]: #split train and test data x_train, x_test, y_train, y_test = train_test_split(train_features, train_label, test_size = 0.1, random_state = 200) x_train_std, x_test_std, y_train_std, y_test_std = train_test_split(train_features_std, train_label, test_size = 0.1, random_state = 200) In [26]: #构建第一个模型:ElasticNet ENSTest = linear_model.ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10], l1_ratio=[.01, .1, .5, .9, .99], max_iter=5000).fit(x_train_std, y_train_std) train_test_score(ENSTest, x_train_std, x_test_std, y_train_std, y_test_std) ------------train----------- R2: 0.9009283127352861 RMSE: 0.11921419084690392 ------------test------------ R2: 0.8967299522701895 RMSE: 0.11097042840114624 In [27]: #测试模型的交叉验证得分 score = cross_val_score(ENSTest, train_features_std, train_label, cv = 5) print('Accurary: %0.2f +/- %0.2f' % (score.mean(), score.std()*2)) Accurary: 0.88 +/- 0.10 In [28]: #构建第二个模型:GradientBoosting GB = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate = 0.05, max_depth = 3, max_features = 'sqrt', min_samples_leaf = 15, min_samples_split = 10, loss = 'huber').fit(x_train_std, y_train_std) train_test_score(GB, x_train_std, x_test_std, y_train_std, y_test_std) ------------train----------- R2: 0.9607778449577035 RMSE: 0.07698826081848897 ------------test------------ R2: 0.9002871760789876 RMSE: 0.10793269100940146 In [29]: #构建第二个模型:GradientBoosting GB = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate = 0.05, max_depth = 3, max_features = 'sqrt', min_samples_leaf = 15, min_samples_split = 10, loss = 'huber').fit(x_train_std, y_train_std) train_test_score(GB, x_train_std, 
x_test_std, y_train_std, y_test_std) Accurary: 0.90 +/- 0.04 In [30]: #模型融合 GB_model = GB.fit(train_features, train_label) ENS_model = ENSTest.fit(train_features_std, train_label) In [31]: #为什么模型融合公式是这样的? Final_score = (np.exp(GB_model.predict(test_features)) + np.exp(ENS_model.predict(test_features_std))) / 2 In [32]: #写入csv文件 pd.DataFrame({'Id':test.Id, 'SalePrice':Final_score}).to_csv('submit.csv', index=False)转载于:https://www.cnblogs.com/RB26DETT/p/11566650.html
相关资源:House Prices Advanced Regression Techniques