• Home
  • About
    • Muhammad Elnimr's Blog photo

      Muhammad Elnimr's Blog

      Blog.

    • Learn More
    • Twitter
    • LinkedIn
    • Github
  • Posts
    • All Posts
    • All Tags
  • Projects

Kings County house prices.

05 Nov 2017

Reading time ~4 minutes

Very basic housing project:

Goal:

  • Predict the KC county house prices

Importing the necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')


from scipy import stats
from sklearn import linear_model
from sklearn import neighbors
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from math import log

Load data set

data=pd.read_csv('kc_house_data.csv',parse_dates=['date'])
data.describe()
id price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
count 2.161300e+04 2.161300e+04 21613.000000 21613.000000 21613.000000 2.161300e+04 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000
mean 4.580302e+09 5.400881e+05 3.370842 2.114757 2079.899736 1.510697e+04 1.494309 0.007542 0.234303 3.409430 7.656873 1788.390691 291.509045 1971.005136 84.402258 98077.939805 47.560053 -122.213896 1986.552492 12768.455652
std 2.876566e+09 3.671272e+05 0.930062 0.770163 918.440897 4.142051e+04 0.539989 0.086517 0.766318 0.650743 1.175459 828.090978 442.575043 29.373411 401.679240 53.505026 0.138564 0.140828 685.391304 27304.179631
min 1.000102e+06 7.500000e+04 0.000000 0.000000 290.000000 5.200000e+02 1.000000 0.000000 0.000000 1.000000 1.000000 290.000000 0.000000 1900.000000 0.000000 98001.000000 47.155900 -122.519000 399.000000 651.000000
25% 2.123049e+09 3.219500e+05 3.000000 1.750000 1427.000000 5.040000e+03 1.000000 0.000000 0.000000 3.000000 7.000000 1190.000000 0.000000 1951.000000 0.000000 98033.000000 47.471000 -122.328000 1490.000000 5100.000000
50% 3.904930e+09 4.500000e+05 3.000000 2.250000 1910.000000 7.618000e+03 1.500000 0.000000 0.000000 3.000000 7.000000 1560.000000 0.000000 1975.000000 0.000000 98065.000000 47.571800 -122.230000 1840.000000 7620.000000
75% 7.308900e+09 6.450000e+05 4.000000 2.500000 2550.000000 1.068800e+04 2.000000 0.000000 0.000000 4.000000 8.000000 2210.000000 560.000000 1997.000000 0.000000 98118.000000 47.678000 -122.125000 2360.000000 10083.000000
max 9.900000e+09 7.700000e+06 33.000000 8.000000 13540.000000 1.651359e+06 3.500000 1.000000 4.000000 5.000000 13.000000 9410.000000 4820.000000 2015.000000 2015.000000 98199.000000 47.777600 -121.315000 6210.000000 871200.000000

Looking for outliers:

data.price.plot(kind='hist',log=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb5db910310>

png

data[data['price']>4e6] # 11 houses greated than 4M
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
1164 1247600105 2014-10-20 5110800.0 5 5.25 8010 45517 2.0 1 4 ... 12 5990 2020 1999 0 98033 47.6767 -122.211 3430 26788
1315 7558700030 2015-04-13 5300000.0 6 6.00 7390 24829 2.0 1 4 ... 12 5000 2390 1991 0 98040 47.5631 -122.210 4320 24619
1448 8907500070 2015-04-13 5350000.0 5 5.00 8000 23985 2.0 0 4 ... 12 6720 1280 2009 0 98004 47.6232 -122.220 4600 21750
2626 7738500731 2014-08-15 4500000.0 5 5.50 6640 40014 2.0 1 4 ... 12 6350 290 2004 0 98155 47.7493 -122.280 3030 23408
3914 9808700762 2014-06-11 7062500.0 5 4.50 10040 37325 2.0 1 2 ... 11 7680 2360 1940 2001 98004 47.6500 -122.214 3930 25449
4411 2470100110 2014-08-04 5570000.0 5 5.75 9200 35069 2.0 0 0 ... 13 6200 3000 2001 0 98039 47.6289 -122.233 3560 24345
7252 6762700020 2014-10-13 7700000.0 6 8.00 12050 27600 2.5 0 3 ... 13 8570 3480 1910 1987 98102 47.6298 -122.323 3940 8800
8092 1924059029 2014-06-17 4668000.0 5 6.75 9640 13068 1.0 1 4 ... 12 4820 4820 1983 2009 98040 47.5570 -122.210 3270 10454
8638 3835500195 2014-06-18 4489000.0 4 3.00 6430 27517 2.0 0 0 ... 12 6430 0 2001 0 98004 47.6208 -122.219 3720 14592
9254 9208900037 2014-09-19 6885000.0 6 7.75 9890 31374 2.0 0 4 ... 13 8860 1030 2001 0 98039 47.6305 -122.240 4540 42730
12370 6065300370 2015-05-06 4208000.0 5 6.00 7440 21540 2.0 0 0 ... 12 5550 1890 2003 0 98006 47.5692 -122.189 4740 19329

11 rows × 21 columns

plt.figure(figsize=(10,8))
data.bedrooms.plot(),data.bathrooms.plot(),data.waterfront.plot()
plt.legend()
<matplotlib.legend.Legend at 0x7fb5db4f7110>

png

data.sqft_living.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7fb5db7568d0>

png

BOX PLOT

fig=plt.figure(figsize=(6,10))

ax1=plt.subplot(331)
ax2=plt.subplot(332)
ax3=plt.subplot(333)
#ax4=plt.subplot(334)
#ax5=plt.subplot(335)
#ax6=plt.subplot(336)
#ax7=plt.subplot(337)


data.boxplot(column='price',ax=ax1)
data.boxplot(column='bedrooms',ax=ax2)
data.boxplot(column='bathrooms',ax=ax3)


plt.suptitle('')
plt.tight_layout()

png

data[data['bedrooms']>10]
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
8757 1773100755 2014-08-21 520000.0 11 3.00 3000 4960 2.0 0 0 ... 7 2400 600 1918 1999 98106 47.5560 -122.363 1420 4960
15870 2402100895 2014-06-25 640000.0 33 1.75 1620 6000 1.0 0 0 ... 7 1040 580 1947 0 98103 47.6878 -122.331 1330 4700

2 rows × 21 columns

Removing Outliers

outliers=data.quantile(0.90)
x=data[(data['price']<outliers['price'])]
x=x[(x['bedrooms']< outliers['bedrooms'])]
x=x[(x['bathrooms']< outliers['bathrooms'])]
x=x[(x['sqft_living']< outliers['sqft_living'])]
x.shape
(11712, 21)
data.shape
(21613, 21)

Creating Dummies

x_zipcode=pd.get_dummies(x['zipcode'],drop_first=True)
x=pd.concat([x,x_zipcode],axis=1)

Feature Engineering

x['built_ago']=2017-x['yr_built']
x['have_basement']=np.where(x['sqft_living']>0,1,0)
x['renovated']=np.where(x['yr_renovated']>0,1,0)
x['weighted_bath']=x['bathrooms'] **2
x['weighted_livingspace']=x['sqft_living']**2
x['diff_living']=x['sqft_living']-x['sqft_living15']
x['bed_bath_ratio']=(x['bedrooms']+1)/(x['bathrooms']+1)
y=x.price
x=x.drop(['id','date','zipcode','lat','long','price','yr_built','sqft_basement','bathrooms'],axis=1) # all of them id date zipcode lat long price yr_renovated yr_built sqft_basement bathrooms grade

Train Test Split

# Linear Regression
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.8,random_state=42)
x_train.shape,y_train.shape,x_test.shape,y_test.shape
((9369, 88), (9369,), (2343, 88), (2343,))

Use Linear Regression

reg=linear_model.LinearRegression()
regmodel=reg.fit(x_train,y_train)
y_predtest=reg.predict(x_test)
RMS=mean_squared_error(y_test,y_predtest) ** 0.5
RMS
71972.313409169568

Use Lasso Model:

from sklearn.linear_model import Lasso
ls=Lasso()
l=ls.fit(x_train,y_train)
y_ls_predtest=ls.predict(x_test)
ls_rmse=mean_squared_error(y_test,y_ls_predtest) ** 0.5
ls_rmse
71872.274572752882


markdownsyntaxsampletestjekyll Share Tweet +1