# Import packages and load the full dataset
%matplotlib inline
import warnings
import numpy as np
import pandas as pd
import seaborn as sea
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn import datasets, linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error
# Ignore the warnings
warnings.filterwarnings('ignore')
# Load the data
train = pd.read_csv('train.csv')
Check For Missing Data
# Check missingness:
missingData = train.isnull().mean(axis=0)
# Columns where more than 30% of values are missing;
# .index gives the column names
missingIndex = missingData[missingData>0.3].index
missingIndex
# Make a working copy of the data
workingDf = train.copy()
workingDf.isna().sum().loc[workingDf.isna().sum()>0].sort_values()
This will output every feature that still has missing data, along with its count:
Index(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')

Electrical         1
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
BsmtExposure      38
BsmtFinType2      38
GarageCond        81
GarageQual        81
GarageFinish      81
GarageType        81
GarageYrBlt       81
LotFrontage      259
FireplaceQu      690
Fence           1179
Alley           1369
MiscFeature     1406
PoolQC          1453
dtype: int64
Much of the missing data simply means the house lacks that feature (no pool, no fireplace, and so on), so we can replace those NULLs with 0 or a label such as "NoPool", as appropriate for the feature. Do this for each such feature with something like this:
# Replace NAs in PoolQC with a 'NoPool' label
workingDf.loc[workingDf.PoolQC.isna(), 'PoolQC'] = 'NoPool'
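For example, a compact way to apply the same idea to the other "absent feature" columns might look like the sketch below. The column list and the 'No<Feature>' labels are my own assumptions based on the missingness table above, not the exact choices from the original notebook:
# Fill categorical "absence" NAs with a 'No<Feature>' label (assumed column list)
noFeatureCols = ['Alley', 'Fence', 'MiscFeature', 'FireplaceQu',
                 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                 'MasVnrType']
for col in noFeatureCols:
    workingDf.loc[workingDf[col].isna(), col] = 'No' + col
# Numeric counterpart of an absent feature becomes 0 (no veneer -> 0 square feet)
workingDf['MasVnrArea'] = workingDf['MasVnrArea'].fillna(0)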
Some features may be highly correlated AND have missing values. Check the correlation against a feature you suspect is related, using something like a scatter plot. For example, LotFrontage is missing a lot of data, but my guess is that it is correlated with the total LotArea.
# Compare frontage to lot area!
lotFrontageByArea = workingDf[['LotFrontage', 'LotArea']]
plt.scatter(np.log(workingDf['LotArea']), np.log(workingDf['LotFrontage']))
Appears Highly Correlated
Now, to fill in missing data from a correlated feature, we can fit a model of the two, split the rows into missing and non-missing, predict the missing values, and recombine!
# Split into missing and non-missing rows
workingDfFrontageNas = workingDf[workingDf.LotFrontage.isna()].copy()
workingDfFrontageNoNas = workingDf[~workingDf.LotFrontage.isna()].copy()
# Fit a linear model of LotFrontage on LotArea (must pass a DataFrame as X)
lotByAreaModel = linear_model.LinearRegression()
lotByAreaModel.fit(workingDfFrontageNoNas[['LotArea']], workingDfFrontageNoNas.LotFrontage)
# Predict the missing frontage values
workingDfFrontageNas['LotFrontage'] = lotByAreaModel.predict(workingDfFrontageNas[['LotArea']])
# Must concat a list!!!
workingDfImputedFrontage = pd.concat([workingDfFrontageNas, workingDfFrontageNoNas], axis=0)
Next, you will need to dummify the categorical features (below, workingClean refers to the data frame after all of the missing-value fixes above). In some cases this can make our data set extremely "wide," but that is fine for most of the regression models we will be using.
# Now dummify the categorical features into workingDummies
workingDummies = workingClean.copy()
workingDummies = pd.get_dummies(workingDummies)
print(workingDummies.shape)
workingDummies.head()
print(workingDummies.isna().sum().loc[workingDummies.isna().sum()>0].sort_values(ascending=False))
Make sure you have no NA values.
# Replace NAs in the dummies set with 0, then confirm nothing is left
workingDummies = workingDummies.fillna(0)
print(workingDummies.isna().sum().loc[workingDummies.isna().sum()>0].sort_values(ascending=False))
Next, separate the features from the target (SalePrice); the train/test split itself is shown just after this block:
# Split features and SalePrice
salePriceClean = workingClean.SalePrice
homeFeaturesClean = workingClean.copy().drop("SalePrice",axis=1)
salePriceDummies = workingDummies.SalePrice
homeFeaturesDummies=workingDummies.copy().drop("SalePrice",axis=1)
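The model code later in this post uses featuresDummiesTrain, featuresDummiesTest, priceDummiesTrain, and priceDummiesTest. The split that produces them is not shown above; here is a minimal sketch, assuming an 80/20 split (the ratio and random_state are my assumptions):
# Hold out 20% of the rows as a test set
featuresDummiesTrain, featuresDummiesTest, priceDummiesTrain, priceDummiesTest = train_test_split(
    homeFeaturesDummies, salePriceDummies, test_size=0.2, random_state=0)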
Now, for EDA we will visualize the continuous and categorical
features separately. First, here is how to make a grid of
histograms for all numeric values.
# Split into continuous and categorical features
workingNumeric = workingClean[['GarageYrBlt', 'LotFrontage',
'LotArea', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
'MoSold', 'GarageArea', 'TotRmsAbvGrd', 'GrLivArea',
'BsmtUnfSF', 'MSSubClass', 'YrSold', 'MiscVal', 'BsmtFinSF1',
'BsmtFinSF2', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'BsmtFullBath','BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces',
'GarageCars', 'WoodDeckSF', 'OpenPorchSF','EnclosedPorch',
'3SsnPorch', 'ScreenPorch', 'PoolArea', 'OverallQual',
'MasVnrArea']]
workingNumeric['SalePrice'] = salePriceClean
Now, plot with plt.
fig = plt.figure(figsize=[20,10])
# get current axis = gca
ax = fig.gca()
# Draw a histogram for every numeric column on this figure
workingNumeric.hist(ax = ax)
plt.subplots_adjust(hspace=0.5)
Then we can look at a correlation heat map for these same
features:
# Compute the correlation matrix
corr = workingNumeric.corr()
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sea.diverging_palette(230, 20, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sea.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
Next, this is how we generate a grid of box plots in Python for all the categorical features. Unfortunately, unlike the histogram grid above, we need to generate each plot separately and place it on the grid, so I will just show the code for the first few here.
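The box-plot code below refers to workingCategorical, which is not constructed in the snippets above. A minimal sketch, assuming the categorical columns are simply the non-numeric columns of workingClean with SalePrice added back for the y-axis:
# Assumed construction of workingCategorical: object-typed columns plus SalePrice
workingCategorical = workingClean.select_dtypes(include='object').copy()
workingCategorical['SalePrice'] = salePriceClean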
fig, axes = plt.subplots(13, 3, figsize=(18, 55))
sea.boxplot(ax=axes[0, 0], data=workingCategorical,
x='MSZoning', y='SalePrice')
sea.boxplot(ax=axes[0, 1], data=workingCategorical,
x='SaleType', y='SalePrice')
sea.boxplot(ax=axes[0, 2], data=workingCategorical,
x='GarageType', y='SalePrice')
Finally, on to our model building. Here I will show how to fit a few regression models on the dummified data set, along with the results, which model was best, and how we use that information. First, the KNN model, using grid search to find the best parameter (n_neighbors):
#KNN
knn_model = KNeighborsRegressor()
param_grid = {'n_neighbors':np.arange(5, 200, 5)}
gsModelTrain = GridSearchCV(estimator = knn_model, param_grid =
param_grid, cv=2)
gsModelTrain.fit(featuresDummiesTrain, priceDummiesTrain)
knn_model.set_params(**gsModelTrain.best_params_)
#fit to train data
knn_model.fit(featuresDummiesTrain, priceDummiesTrain)
# Get scores comparing real house prices and predicted house prices from the test dataset
print("r2 Test score:", r2_score(priceDummiesTest,
knn_model.predict(featuresDummiesTest)))
print("r2 Train score:", r2_score(priceDummiesTrain,
knn_model.predict(featuresDummiesTrain)))
trainRMSE = np.sqrt(mean_squared_error(y_true=priceDummiesTrain,
y_pred=knn_model.predict(featuresDummiesTrain)))
testRMSE = np.sqrt(mean_squared_error(y_true=priceDummiesTest,
y_pred=knn_model.predict(featuresDummiesTest)))
print("Train RMSE:", trainRMSE)
print("Test RMSE:", testRMSE)
The Results!
r2 Test score: 0.6307760069201898
r2 Train score: 0.7662448448900541
Train RMSE: 37918.53459536937
Test RMSE: 48967.481488778656
Now, do this same exact thing for Ridge(), Lasso(), and RandomForestRegressor, but alter the parameters being optimized for each model, then compare the final scores across all of the models (a sketch of the pattern follows).
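Here is a minimal sketch of that loop. The parameter grids, the random_state, and the rfModel assignment (used for the feature-importance step below) are my own assumptions, not the exact values from the original notebook:
# Repeat the grid-search pattern for the remaining models (illustrative grids)
for name, model, grid in [
        ('Ridge', Ridge(), {'alpha': np.logspace(-2, 3, 20)}),
        ('Lasso', Lasso(), {'alpha': np.logspace(-2, 3, 20)}),
        ('RandomForest', RandomForestRegressor(random_state=0),
         {'n_estimators': [100, 300], 'max_depth': [None, 10, 20]})]:
    gsModel = GridSearchCV(estimator=model, param_grid=grid, cv=2)
    gsModel.fit(featuresDummiesTrain, priceDummiesTrain)
    best = gsModel.best_estimator_
    print(name, "best params:", gsModel.best_params_)
    print(name, "r2 Test score:", r2_score(priceDummiesTest, best.predict(featuresDummiesTest)))
    print(name, "r2 Train score:", r2_score(priceDummiesTrain, best.predict(featuresDummiesTrain)))
    if name == 'RandomForest':
        rfModel = best  # used below for feature importances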
We see the random forest or boosting model was best, so next we
grab the most important features from that model, like this:
rfModel.feature_importances_
feature_importances = pd.DataFrame(rfModel.feature_importances_,
index = featuresDummiesTrain.columns,
columns=['importance']).sort_values('importance',ascending=False)
pd.set_option("display.max_rows",None,"display.max_columns",None)
print(feature_importances)
feature_importances.index[:10]
And this is how we get a list of the most important features in
this model using Python!
                    importance
OverallQual        5.559609e-01
GrLivArea          9.929112e-02
TotalBsmtSF        4.050054e-02
1stFlrSF           3.604577e-02
TotRmsAbvGrd       2.772576e-02
FullBath           2.693519e-02
BsmtFinSF1         2.064975e-02
GarageCars         1.915783e-02
2ndFlrSF           1.753713e-02
GarageArea         1.748563e-02
LotArea            1.219073e-02
YearBuilt          1.075511e-02
LotFrontage        7.270100e-03
YearRemodAdd       7.038709e-03
BsmtQual_Ex        5.726935e-03
OpenPorchSF        4.677578e-03
BsmtUnfSF          4.245650e-03
MoSold             3.397142e-03
OverallCond        3.180477e-03
WoodDeckSF         2.865491e-03
KitchenQual_Gd     2.692117e-03
ExterQual_Ex       2.253200e-03
GarageType_Detchd  1.832978e-03
MSSubClass         1.808349e-03
BsmtFullBath       1.791505e-03
MSZoning_RM        1.781576e-03
ScreenPorch        1.679301e-03
YrSold             1.664580e-03
BsmtExposure_No    1.533721e-03
GarageFinish_Unf   1.514469e-03
MasVnrArea_1170.0  1.431316e-03
So now we know that the overall quality of a home, along with its size, is very important, but we can basically ignore things like a finished garage or whether it has a screened-in porch!
You can use this same system to evaluate any home data with any number of features. Our big takeaway is that the features at the bottom of this list can be safely ignored, so if you are selling or pricing a home there is usually no reason to even take them into account.
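As a hypothetical follow-up (topFeatures and simpleModel are illustrative names, not from the original notebook), you could keep only the top-ranked features and refit to confirm that the low-importance columns contribute little:
# Keep only the top-10 features and refit a simpler model
topFeatures = feature_importances.index[:10]
simpleModel = RandomForestRegressor(random_state=0)
simpleModel.fit(featuresDummiesTrain[topFeatures], priceDummiesTrain)
print("r2 Test score (top-10 features only):",
      r2_score(priceDummiesTest, simpleModel.predict(featuresDummiesTest[topFeatures])))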
Happy house hunting and thank you!