In the world of real estate, determining property prices involves numerous factors, from location and size to amenities and market trends. Simple linear regression, a foundational technique in machine learning, provides a practical way to predict housing prices based on key features like the number of rooms or square footage.
In this article, I delve into the process of applying simple linear regression to a housing dataset, from data preprocessing and feature selection to building a model that can offer valuable price insights. Whether you’re new to data science or seeking to deepen your understanding, this project serves as a hands-on exploration of how data-driven predictions can shape smarter real estate decisions.
First things first, you start by importing your libraries:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# Load the housing CSV from the directory where you stored it.
housing_csv = '/kaggle/input/california-housing-prices/housing.csv'
data = pd.read_csv(housing_csv)
data
# Inspect the dtypes and non-null counts to spot columns with missing values.
data.info()
# Drop every row that contains a missing value, then confirm that all
# columns now report the same non-null count.
data = data.dropna()
data.info()
# Separate the target column from the features, then hold out 20% of the
# rows as a test set.
from sklearn.model_selection import train_test_split
y = data['median_house_value']
X = data.drop(columns=['median_house_value'])
y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Put the training features and target back into one frame so they can be
# explored (and later correlated) together.
train_data = X_train.join(y_train, how='left')
train_data
# Histogram of every numeric column in the training frame.
train_data.hist(figsize=(15, 8))
# One-hot encode the non-numeric columns so they can take part in the
# correlation analysis (drop_first omits the first level of each one).
train_data_encoded = pd.get_dummies(train_data, drop_first=True)
# Compute the correlation matrix once and reuse it for the printout,
# the notebook display, and the heatmap.
correlation_matrix = train_data_encoded.corr()
print(correlation_matrix)
correlation_matrix
plt.figure(figsize=(15, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="inferno")
# Replace the four count columns with log(x + 1); the + 1 keeps a zero
# count from turning into -inf. Then re-plot the histograms.
for count_column in ('total_rooms', 'total_bedrooms', 'population', 'households'):
    train_data[count_column] = np.log(train_data[count_column] + 1)
train_data.hist(figsize=(15, 8))
# Count the rows in each ocean_proximity category before converting the
# column into binary indicators with one-hot encoding.
train_data['ocean_proximity'].value_counts()
ocean_proximity
<1H OCEAN 7267
INLAND 5183
NEAR OCEAN 2108
NEAR BAY 1783
ISLAND 5
Name: count, dtype: int64
# Build one binary (0 or 1) indicator column per ocean_proximity category.
proximity_dummies = pd.get_dummies(train_data['ocean_proximity'])
proximity_dummies
# Attach the indicator columns, then drop the original text column.
train_data = train_data.join(proximity_dummies).drop(columns=['ocean_proximity'])
train_data
# Re-check the correlations now that ocean_proximity has been replaced by
# its indicator columns.
plt.figure(figsize=(18, 8))
training_correlations = train_data.corr()
sns.heatmap(training_correlations, annot=True, cmap='twilight')
# Plot every training row by its coordinates, coloured by median house value.
plt.figure(figsize=(15, 8))
sns.scatterplot(
    data=train_data,
    x='latitude',
    y='longitude',
    hue='median_house_value',
    palette='Spectral',
)
# Feature engineering: derive two ratio features from the existing columns.
train_data['bedroom_ratio'] = train_data['total_bedrooms'].div(train_data['total_rooms'])
train_data['household_rooms'] = train_data['total_rooms'].div(train_data['households'])
# Show the correlations again, this time including the two new features.
plt.figure(figsize=(18, 8))
sns.heatmap(train_data.corr(), annot=True, cmap='ocean')
# Fit a linear regression on the preprocessed training frame: every column
# except median_house_value is a feature, median_house_value is the target.
from sklearn.linear_model import LinearRegression
y_train = train_data['median_house_value']
X_train = train_data.drop(columns=['median_house_value'])
reg = LinearRegression()
reg.fit(X_train, y_train)
# Rebuild the held-out rows into one frame so they can go through exactly
# the same preprocessing steps as the training data before any scoring.
test_data = X_test.join(y_test)

# Apply the same log(x + 1) transform that was applied to the training columns.
for column in ('total_rooms', 'total_bedrooms', 'population', 'households'):
    test_data[column] = np.log(test_data[column] + 1)

# One-hot encode ocean_proximity without a prefix, so the indicator columns
# get the same names as the ones created for the training data.
test_data = test_data.join(pd.get_dummies(test_data['ocean_proximity'])).drop(['ocean_proximity'], axis=1)

# Create the same engineered features as in training.
test_data['bedroom_ratio'] = test_data['total_bedrooms'] / test_data['total_rooms']
test_data['household_rooms'] = test_data['total_rooms'] / test_data['households']

X_test, y_test = test_data.drop(['median_house_value'], axis=1), test_data['median_house_value']

# A rare category (e.g. ISLAND) can be absent from one of the splits. Align
# the test columns to the training columns (same set, same order), filling
# any indicator column missing from the test split with 0.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Standardize the features. The scaler is fitted on the training data only
# and then reused on the test data, so no test-set statistics leak into it.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Refit the regression on the scaled features so that it is scored on
# inputs in the same representation it was trained on.
reg.fit(X_train_s, y_train)
reg.score(X_test_s, y_test)
0.5092972905670141
# Train a random forest with default hyperparameters and score it on the
# test set. X_train_s / X_test_s are expected to be the scaler-transformed
# feature matrices.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
forest = RandomForestRegressor()
forest.fit(X=X_train_s, y=y_train)
forest.score(X=X_test_s, y=y_test)
0.4447616558596853
# Tune the random forest with a 5-fold cross-validated grid search over the
# number of trees and the number of features considered at each split.
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [3, 10, 30],
    'max_features': [2, 4, 6, 8],
}
# Pass the grid under the name it was defined with (param_grid); the
# capitalised Param_grid does not exist and raises a NameError.
grid_search = GridSearchCV(forest, param_grid,
                           cv=5,
                           scoring="neg_mean_squared_error",
                           return_train_score=True)
grid_search.fit(X_train_s, y_train)
# Best parameter combination found, then its score on the held-out test set.
grid_search.best_estimator_
grid_search.best_estimator_.score(X_test_s, y_test)
0.5384474921332503
I would say that training a machine learning model is not the easiest of processes, but to keep improving the results above you can add more hyperparameters to the param_grid, such as min_samples_split, and in that way your best estimator's score can keep on improving.
If you made it this far, please like and share your comments below; your opinion really matters. Thank you!😊🥰❤️