There won't be much text here, mostly the solution itself. I took 2nd place among 60 classmates, and 67th place out of 487 on Kaggle.
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold, GridSearchCV
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
sns.set(style='whitegrid')
sns.set_context("paper", font_scale=1.5)
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_rows', 50)
def adjustedR2(r2, n, k):
    # Adjusted R^2: penalizes the raw R^2 for the number of features k given
    # n observations (equivalent to 1 - (1 - R^2) * (n - 1) / (n - k))
    return r2 - (k - 1) / (n - k) * (1 - r2)
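A quick sanity check with made-up numbers: for R^2 = 0.72 on n = 5000 observations with k = 24 features, the adjustment lowers the score only slightly.
print(adjustedR2(0.72, 5000, 24))  # ~0.7187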
TRAIN_DATASET_PATH = 'project/train.csv'
TEST_DATASET_PATH = 'project/test.csv'
1. Loading the data
Description of the dataset
- Id - apartment identification number
- DistrictId - district identification number
- Rooms - number of rooms
- Square - total area of the apartment
- LifeSquare - living area
- KitchenSquare - kitchen area
- Floor - the floor the apartment is on
- HouseFloor - number of floors in the building
- HouseYear - year the building was built
- Ecology_1, Ecology_2, Ecology_3 - environmental indicators of the area
- Social_1, Social_2, Social_3 - social indicators of the area
- Healthcare_1, Helthcare_2 - healthcare indicators of the area (Helthcare_2 is misspelled in the raw data, so the code keeps that spelling)
- Shops_1, Shops_2 - indicators of the availability of stores and shopping centers
- Price - apartment price, the target variable
train_df = pd.read_csv(TRAIN_DATASET_PATH, index_col='Id')
train_df.head()
test_df = pd.read_csv(TEST_DATASET_PATH, index_col='Id')
test_df.head()
train_df.shape, test_df.shape
I will combine the approach offered in the lesson with my own ideas. The first step is to get acquainted with the data: histograms for every feature, and boxplots for the features that take only a few distinct values.
train_df.columns
df1=train_df[['DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor',
'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2', 'Ecology_3',
'Social_1', 'Social_2', 'Social_3', 'Healthcare_1', 'Helthcare_2',
'Shops_1', 'Shops_2', 'Price']]
h = df1.hist(bins=25, figsize=(16, 16), xlabelsize=10, ylabelsize=10, xrot=-15)
sns.despine(left=True, bottom=True)
[x.title.set_size(12) for x in h.ravel()];
[x.yaxis.tick_left() for x in h.ravel()];
sns.set(style="whitegrid", font_scale=1)
f, axes = plt.subplots(1, 2,figsize=(15,5))
sns.boxplot(x=train_df['Rooms'],y=train_df['Price'], ax=axes[0])
sns.boxplot(x=train_df['Helthcare_2'],y=train_df['Price'], ax=axes[1])
sns.despine(left=True, bottom=True)
axes[0].set(xlabel='Rooms', ylabel='Price')
axes[0].yaxis.tick_left()
axes[1].yaxis.set_label_position("right")
axes[1].yaxis.tick_right()
axes[1].set(xlabel='Helthcare_2', ylabel='Price')
f, axe = plt.subplots(1, 1,figsize=(12.18,5))
sns.despine(left=True, bottom=True)
sns.boxplot(x=train_df['Floor'],y=train_df['Price'], ax=axe)
axe.yaxis.tick_left()
axe.set(xlabel='Floor', ylabel='Price');
Of course, one could keep playing with charts for a long time, but visual inspection alone is unlikely to reveal which features matter. Instead, let's look at the correlations between the features.
features = ['DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor',
'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2', 'Ecology_3',
'Social_1', 'Social_2', 'Social_3', 'Healthcare_1', 'Helthcare_2',
'Shops_1', 'Shops_2', 'Price']
mask = np.zeros_like(train_df[features].corr(), dtype=bool)
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(16, 12))
plt.title('Pearson Correlation Matrix',fontsize=25)
sns.heatmap(train_df[features].corr(),linewidths=0.25,vmax=0.7,square=True,cmap="BuGn", #"BuGn_r" to reverse
linecolor='w',annot=True,annot_kws={"size":8},mask=mask,cbar_kws={"shrink": .9});
For data preparation, I will follow the example from the lesson.
train_df.dtypes
train_df['DistrictId'] = train_df['DistrictId'].astype(str)
test_df['DistrictId'] = test_df['DistrictId'].astype(str)
train_df.describe()
feature_num_names = train_df.drop('Price', axis=1).select_dtypes(include=['float64', 'int64']).\
columns.tolist()
feature_num_names
feature_cat_names = train_df.select_dtypes(include='object').columns.tolist()
feature_cat_names
feature_bin_names = ['Ecology_2', 'Ecology_3', 'Shops_2']
train_df[feature_bin_names] = train_df[feature_bin_names].replace({'A':0, 'B':1})
test_df[feature_bin_names] = test_df[feature_bin_names].replace({'A':0, 'B':1})
train_df.info()
train_df.isna().sum()[train_df.isna().sum() != 0]
train_df.loc[train_df['LifeSquare'].isna(), 'LifeSquare'] = \
train_df['Square'] - train_df['KitchenSquare']
test_df.loc[test_df['LifeSquare'].isna(), 'LifeSquare'] = \
test_df['Square'] - test_df['KitchenSquare']
def hc1_stats(df, by):
    # Per-group statistics of Healthcare_1: mean, std, non-null count and NaN count
    grp = df.groupby(by)['Healthcare_1']
    return pd.concat([grp.apply(np.nanmean),
                      grp.apply(np.nanstd),
                      grp.count(),
                      grp.apply(lambda x: x.isna().sum())],
                     axis=1, keys=['mean', 'std', 'count', 'nans'])\
             .sort_values(by='nans', ascending=False)
hc1_stats(train_df, 'DistrictId').head(10)
hc1_stats(train_df, 'Helthcare_2')
hc1_stats(test_df, 'Helthcare_2')
# Fill missing Healthcare_1 with the mean per Helthcare_2 level,
# computed on train only so that train and test get the same statistics
hc1_means = train_df.groupby('Helthcare_2')['Healthcare_1'].mean()
train_df['Healthcare_1'] = train_df['Healthcare_1'].fillna(train_df['Helthcare_2'].map(hc1_means))
test_df['Healthcare_1'] = test_df['Healthcare_1'].fillna(test_df['Helthcare_2'].map(hc1_means))
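A quick check that nothing is left unfilled; a non-zero count for test would mean a Helthcare_2 level that never appears in train:
print(train_df['Healthcare_1'].isna().sum(), test_df['Healthcare_1'].isna().sum())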
train_df['Rooms'].value_counts()
train_df.loc[(train_df['Rooms'] > 5) | (train_df['Rooms'] == 0), 'Rooms'] = train_df['Rooms'].mode()[0]
test_df.loc[(test_df['Rooms'] > 5) | (test_df['Rooms'] == 0), 'Rooms'] = train_df['Rooms'].mode()[0]  # the mode is taken from train, not test
train_df.loc[(train_df['KitchenSquare'] > 150) | \
(train_df['KitchenSquare'] > train_df['Square']), :]
train_df.loc[(train_df['KitchenSquare'] > 150) | \
(train_df['KitchenSquare'] > train_df['Square']),
'KitchenSquare'] = train_df['KitchenSquare'].median()
test_df.loc[(test_df['KitchenSquare'] > 150) | \
(test_df['KitchenSquare'] > test_df['Square']),
'KitchenSquare'] = train_df['KitchenSquare'].median()
train_df.loc[(train_df['LifeSquare'] > 400), 'LifeSquare'] = \
train_df['LifeSquare'].median()
test_df.loc[(test_df['LifeSquare'] > 400), 'LifeSquare'] = \
train_df['LifeSquare'].median()
train_df.loc[(train_df['Square'] > 400) | (train_df['Square'] < 10), 'Square'] = \
train_df['Square'].median()
test_df.loc[(test_df['Square'] > 400) | (test_df['Square'] < 10), 'Square'] = \
train_df['Square'].median()
train_df[['Square', 'LifeSquare', 'KitchenSquare']].describe()
train_df.loc[train_df['HouseFloor'] == 0, 'HouseFloor'] = train_df['HouseFloor'].mode()[0]
test_df.loc[test_df['HouseFloor'] == 0, 'HouseFloor'] = train_df['HouseFloor'].mode()[0]
train_df.loc[train_df['HouseFloor'] > 98, 'HouseFloor'] = train_df['HouseFloor'].mode()[0]
test_df.loc[test_df['HouseFloor'] > 98, 'HouseFloor'] = train_df['HouseFloor'].mode()[0]
floor_outliers_train = train_df.loc[train_df['Floor'] > train_df['HouseFloor']].index
floor_outliers_test = test_df.loc[test_df['Floor'] > test_df['HouseFloor']].index
# HouseFloor is float and random.randint needs integer bounds, hence int(x)
random.seed(56)  # make the random floor assignment reproducible
train_df.loc[floor_outliers_train, 'Floor'] = \
    train_df.loc[floor_outliers_train, 'HouseFloor'].apply(lambda x: random.randint(1, int(x)))
test_df.loc[floor_outliers_test, 'Floor'] = \
    test_df.loc[floor_outliers_test, 'HouseFloor'].apply(lambda x: random.randint(1, int(x)))
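After the fix, no apartment should sit above the top floor of its building:
assert (train_df['Floor'] <= train_df['HouseFloor']).all()
assert (test_df['Floor'] <= test_df['HouseFloor']).all()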
train_df.loc[train_df['HouseYear'] > 2020, 'HouseYear'] = 2011  # fix an obviously erroneous construction year
print(train_df['DistrictId'].nunique(), 'unique values in train')
print(test_df['DistrictId'].nunique(), 'unique values in test')
district_size = train_df['DistrictId'].value_counts()\
    .rename_axis('DistrictId').reset_index(name='DistrictSize')
district_size
districts_popular = district_size.loc[district_size['DistrictSize'] > 100, 'DistrictId'].tolist()
district_size.loc[~district_size['DistrictId'].isin(districts_popular), 'DistrictId'] = '999'  # DistrictId is a string column
district_size = district_size.groupby('DistrictId')['DistrictSize'].agg(
DistrictSize='median')
district_size.reset_index(level='DistrictId', inplace=True)
district_size
train_df.loc[~train_df['DistrictId'].isin(districts_popular), 'DistrictId'] = '999'
test_df.loc[~test_df['DistrictId'].isin(districts_popular), 'DistrictId'] = '999'
train_df = train_df.merge(district_size, on='DistrictId', how='left').set_index(train_df.index)
test_df = test_df.merge(district_size, on='DistrictId', how='left').set_index(test_df.index)
train_df.head()
train_df['PriceOneRoom'] = train_df['Price'] / train_df['Rooms']
price_by_district = train_df.groupby(['DistrictId'], as_index=False)\
.agg({'PriceOneRoom':'median'})\
.rename(columns={'PriceOneRoom':'PriceOneRoomByDistrict'})
price_by_district
train_df = train_df.merge(price_by_district, on=['DistrictId'], how='left').set_index(train_df.index)
test_df = test_df.merge(price_by_district, on=['DistrictId'], how='left').set_index(test_df.index)
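Every test district is either one of the popular train districts or has been mapped to '999', so the merge should leave no gaps; still, it is worth a quick check:
print(test_df['PriceOneRoomByDistrict'].isna().sum())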
test_df.columns
train_df.columns
train_df = train_df.drop(['PriceOneRoom'], axis=1)
train_df['RoomSquare'] = train_df['Square'] / train_df['Rooms']
test_df['RoomSquare'] = test_df['Square'] / test_df['Rooms']
train_df.loc[train_df['Floor'] <= 5, 'Floor_cat'] = 1
train_df.loc[(train_df['Floor'] > 5) & (train_df['Floor'] <= 17), 'Floor_cat'] = 2
train_df.loc[train_df['Floor'] > 17, 'Floor_cat'] = 3
test_df.loc[test_df['Floor'] <= 5, 'Floor_cat'] = 1
test_df.loc[(test_df['Floor'] > 5) & (test_df['Floor'] <= 17), 'Floor_cat'] = 2
test_df.loc[test_df['Floor'] > 17, 'Floor_cat'] = 3
train_df['Floor_cat'].value_counts()
labels = [1, 2, 3]
train_df['Floor_cat_qcut'] = pd.qcut(train_df['Floor'], q=3, labels=labels)
test_df['Floor_cat_qcut'] = pd.qcut(test_df['Floor'], q=3, labels=labels)
train_df['Floor_cat_qcut'].value_counts()
train_df.loc[train_df['HouseFloor'] <= 5, 'HouseFloor_cat'] = 1
train_df.loc[(train_df['HouseFloor'] > 5) & (train_df['HouseFloor'] <= 17), 'HouseFloor_cat'] = 2
train_df.loc[train_df['HouseFloor'] > 17, 'HouseFloor_cat'] = 3
test_df.loc[test_df['HouseFloor'] <= 5, 'HouseFloor_cat'] = 1
test_df.loc[(test_df['HouseFloor'] > 5) & (test_df['HouseFloor'] <= 17), 'HouseFloor_cat'] = 2
test_df.loc[test_df['HouseFloor'] > 17, 'HouseFloor_cat'] = 3
train_df['HouseFloor_cat'].value_counts()
train_df[['Floor_cat', 'HouseFloor_cat']] = train_df[['Floor_cat', 'HouseFloor_cat']].astype(int)
test_df[['Floor_cat', 'HouseFloor_cat']] = test_df[['Floor_cat', 'HouseFloor_cat']].astype(int)
feature_names = train_df.columns
feature_names.tolist()
target_names = ['Price']
feature_names = ['DistrictId',
'Rooms',
'Square',
'LifeSquare',
'KitchenSquare',
'Floor',
'HouseFloor',
'HouseYear',
'Ecology_1',
'Ecology_2',
'Ecology_3',
'Social_1',
'Social_2',
'Social_3',
'Healthcare_1',
'Helthcare_2',
'Shops_1',
'Shops_2',
'DistrictSize',
'PriceOneRoomByDistrict',
'RoomSquare',
'Floor_cat',
'Floor_cat_qcut',
'HouseFloor_cat']
X = train_df[feature_names]
y = train_df[target_names[0]]  # a Series: a one-column frame triggers sklearn's column-vector warning
forest = RandomForestRegressor(random_state=56)
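GridSearchCV is imported above; here is a minimal sketch of the kind of search that can produce parameters like max_depth=15 and max_features=5 (the grid is illustrative, not the exact one used):
param_grid = {'max_depth': [10, 15, 20],
              'max_features': [3, 5, 7]}
grid = GridSearchCV(forest, param_grid, scoring='r2',
                    cv=KFold(n_splits=3, shuffle=True, random_state=56))
# grid.fit(X, y)      # the search takes a while, so it is left commented out
# grid.best_params_   # would report something like {'max_depth': 15, 'max_features': 5}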
forest_best = RandomForestRegressor(max_depth=15,
max_features=5,
random_state=56)
forest_best.fit(X, y)
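The hold-out quality is not shown above; a quick cross-validation gives a rough estimate (scores will vary from run to run):
cv = KFold(n_splits=5, shuffle=True, random_state=56)
cv_scores = cross_val_score(forest_best, X, y, cv=cv, scoring='r2')
print(cv_scores.mean(), cv_scores.std())
print(adjustedR2(cv_scores.mean(), len(X), X.shape[1]))  # adjust for the number of features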
y_pred = forest_best.predict(test_df[feature_names])
y_pred
preds = pd.DataFrame()
preds['Id'] = test_df.index
preds['Price'] = y_pred
preds.head()
preds.to_csv('Gladkikh_predictions_1.csv', index=False)