Importing all libraries
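The original import cell is not shown here; a minimal sketch of the libraries a notebook like this typically relies on (the exact set is an assumption) might be:

```python
# Core stack assumed throughout this notebook: numpy/pandas for data
# handling, matplotlib/seaborn for the plots referenced below.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
```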

Data exploration

Loading our dataset and getting familiar with it

Checking the operation types in the op_type column:
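A minimal sketch of that check, assuming the dataset has already been loaded into a dataframe named df:

```python
# Frequency of each operation type in the dataset.
df["op_type"].value_counts()
```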

As you can see, there are also other values like "Buying", "Renting", "Change" and "Other". Before continuing, let's do the following (a sketch of the filtering appears after the list):

  1. Drop entries with operation types "Change" and "Other" as irrelevant to our goal of price prediction
  2. Drop entries with operation types "Buying" and "Renting" as they are represented by only a few samples
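A sketch of the filtering, reusing the df and op_type names from above:

```python
# Drop the operation types listed above; everything else is kept.
df = df[~df["op_type"].isin(["Change", "Other", "Buying", "Renting"])]
df["op_type"].value_counts()
```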

Let's explore the unique districts in the district column:

Let's look at the unique values of the other columns as well:

Floor values look fine.

Someone not coming from Eastern Europe might be confused by the house_seria values, but believe us, they are fine. Despite Riga being the city with the highest concentration of Art Nouveau architecture anywhere in the world, it also has many standardized apartment blocks constructed in the Soviet period, so 602, 119, 103., 467. and 104. are just the quirky names of construction projects. We will treat them as ordinary categorical values.

Now let's check lat and lon columns:

The latitude of Rīga, Latvia is 56.946285, and the longitude is 24.105078. While some of the values seem to be within the correct range, there are broken values that make the plot look terribly zoomed out. Let's check how many samples have wrong coordinates. The previous plot suggests that all broken values deviate substantially from the real Riga coordinates, so we can use a rough comparison to filter them out.
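One way to sketch this rough comparison (the exact bounding box below is an assumption, chosen generously around the Riga coordinates quoted above):

```python
# Flag coordinates far outside a rough bounding box around Riga.
# Rows with missing lat/lon are left alone; they are handled later.
in_riga = df["lat"].between(56.8, 57.1) & df["lon"].between(23.9, 24.4)
broken = df["lat"].notna() & df["lon"].notna() & ~in_riga
print(broken.sum(), "rows with broken coordinates")
```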

Not that many to worry about; let's just drop them and see how the plot looks without the broken values:

Much better! All items are now concentrated within a single area matching Riga's coordinates. Let's see them overlaid on an actual map of Riga:

Handling missing values

Let's define a helper function to get missing values for a dataframe
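The authors' helper is not shown; one common shape for such a function is:

```python
def missing_report(frame: pd.DataFrame) -> pd.DataFrame:
    """Count and percentage of missing values per column, largest first."""
    count = frame.isna().sum()
    report = pd.DataFrame({
        "missing": count,
        "percent": (count / len(frame) * 100).round(2),
    })
    return report[report["missing"] > 0].sort_values("missing", ascending=False)

missing_report(df)
```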

Missing geo coordinates

It can be seen that most missing values come from the geo coordinate columns, lon and lat. However, we do not resolve these missing values, because the two columns turn out to be of little use and are eventually deleted. For the record, they were properly handled in the first trials of this project before we reached that conclusion.

Missing districts

Let's take a look at the entries with missing district value:

One can find the missing district names by looking at rows with the same street:

Great! There are multiple properties listed at the same address, Ogļu 32. Let's impute the missing value:
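A sketch of that imputation; whether the street column stores the full address string "Ogļu 32" is an assumption based on the text:

```python
# Copy the district from the other listings at the same address into
# the row where it is missing.
same_address = df["street"] == "Ogļu 32"
known = df.loc[same_address & df["district"].notna(), "district"].iloc[0]
df.loc[same_address & df["district"].isna(), "district"] = known
```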

Let's try doing the same for Pupuku iela 9:

No luck this time: this is the only property on Pupuku street in our dataset. We could use an alternative approach and search for the nearest points within some range using the lat and lon values, but that would be overkill for a single row. Let's impute the district manually by finding Pupuku iela 9 on Google Maps:

Once again, let's review what else is missing:

Invalid or missing Rooms

Let's check the unique room values:

It turns out this column is categorical due to the presence of the value "Citi". This is bad: room count is by nature numerical and might be an important input for correct price prediction in our model. So what does "Citi" really mean for rooms? "Citi" translates from Latvian as "Other". In our context the word might describe special architectural solutions where the room count can't be clearly defined.

For the sake of data integrity, let's treat "Citi" the same way as a missing value:
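A sketch of that replacement; the column name rooms is an assumption:

```python
# "Citi" carries no usable room count, so mark it as missing.
df["rooms"] = df["rooms"].replace("Citi", np.nan)
print(df["rooms"].isna().sum(), "rows with a missing room count")
```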

So we have 15 rows to fix instead of 1. To do this correctly, we can take advantage of other samples with a similar area. Let's build a helper function to approximate the room count.

The idea of the next few cells is to approximate each missing room count from listings with a similar area:
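A sketch of one such helper, assuming an area column and a ±10% similarity window (both assumptions); each missing room count is replaced by the median room count of listings with a similar area:

```python
def approx_rooms(frame: pd.DataFrame, area: float, tol: float = 0.1) -> float:
    """Median room count among listings whose area is within +/- tol of area."""
    similar = frame[
        frame["rooms"].notna()
        & frame["area"].between(area * (1 - tol), area * (1 + tol))
    ]
    return similar["rooms"].median()

# Make the column numeric, then fill the gaps from similar listings.
# This assumes every missing row finds at least one similar listing.
df["rooms"] = pd.to_numeric(df["rooms"], errors="coerce")
mask = df["rooms"].isna()
df.loc[mask, "rooms"] = df.loc[mask, "area"].apply(lambda a: approx_rooms(df, a))
df["rooms"] = df["rooms"].round().astype(int)
```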

We are ready!

Great! The rooms column is now numeric and contains no missing values.

Final check:

Feature Engineering

When we started working on label encoding, we noticed that the district column produced a large number of features, so we wanted to reduce them to improve the performance of the Linear Regression. We therefore decided to do some feature engineering on the district column: we grouped all of its values into 3 categories.

Our 3-category idea is based on a study about housing in Riga called "Residential satisfaction and mobility behaviour among the young: insights from the post-Soviet city of Riga".

Link to the paper
Link to the figure that shows the categories

---

Please note that the previous engineering method for the district column is no longer used in our project; a more valuable method has been adopted instead, namely ordinal encoding, as can be seen in the following few code blocks.
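A minimal sketch of ordinal encoding for the district column; note that sklearn's OrdinalEncoder assigns codes in lexicographic order by default, which may differ from the ordering the authors actually chose:

```python
from sklearn.preprocessing import OrdinalEncoder

# Replace district names with integer codes.
encoder = OrdinalEncoder()
df["district"] = encoder.fit_transform(df[["district"]]).ravel()
```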

Data Visualization

Let's see the relationship between the price and the area in a graph:

OBSERVATION:

It seems like I should split the data into two datasets, one for sale and the other for rent, because understanding the data would be easier and more beneficial.


Data Preprocessing

Task 1: Encoding categorical data

Checking where categorical data are found

Splitting the encoded dataset into two datasets: for sale and for rent.

Task 2: Removing features

We have removed these columns from the data frame: ['op_type', 'street', 'lat', 'lon', 'district', 'total floors'], because they are of little or no value to our model.
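A sketch of the removal:

```python
# Drop the low-value columns listed above; errors="ignore" tolerates
# columns that were already removed in an earlier step.
drop_cols = ["op_type", "street", "lat", "lon", "district", "total floors"]
df = df.drop(columns=drop_cols, errors="ignore")
```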

Normalising Price Values

As we anticipated from the visualization, our data won't perform well in linear regression because they are skewed. Normalizing the dependent variable is needed to achieve better prediction outcomes, so the log is used.
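A sketch of the transform, assuming the target column is named price; np.log1p is used here for safety near zero, though plain np.log works when all prices are strictly positive:

```python
# Log-transform the skewed target so it is closer to normal.
df["price"] = np.log1p(df["price"])
```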

Train Test Split
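A typical split might look like this; the exact ratio and random seed used by the authors are not shown, so the values below are assumptions:

```python
from sklearn.model_selection import train_test_split

X = df.drop(columns=["price"])
y = df["price"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
```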

Model building

Source: https://www.kaggle.com/sudhirnl7/linear-regression-tutorial?scriptVersionId=31415973&cellId=37

Step 1: add $x_0 = 1$ to the dataset
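A sketch of this step:

```python
# Prepend a column of ones so the intercept theta_0 is learned together
# with the other parameters.
X_train_b = np.c_[np.ones(len(X_train)), X_train]
X_test_b = np.c_[np.ones(len(X_test)), X_test]
```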

https://www.unite.ai/what-is-linear-regression/
https://towardsdatascience.com/introduction-to-machine-learning-algorithms-linear-regression-14c4e325882a

Step 2: build the model
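A sketch of the normal-equation fit and the sklearn cross-check described below, reusing the X_train_b matrix from step 1:

```python
from sklearn.linear_model import LinearRegression

# Normal equation: theta = (X^T X)^(-1) X^T y; pinv is preferred over a
# plain inverse for numerical stability.
theta = np.linalg.pinv(X_train_b.T @ X_train_b) @ X_train_b.T @ y_train

# Cross-check with sklearn; fit_intercept=False because the bias column
# x0 = 1 is already part of X_train_b.
lr = LinearRegression(fit_intercept=False).fit(X_train_b, y_train)
print(np.allclose(theta, lr.coef_, atol=1e-6))
```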

Step 3: the parameters of the linear regression model

The parameters obtained from both models are the same, so we have successfully built our model using the normal equation and verified it with sklearn's linear regression module. Let's move ahead; the next step is prediction and model evaluation.

Model Evaluation

We will predict values of the target variable by applying our model parameters to the test data set, then compare the predicted values with the actual values in the test set. We compute the Mean Squared Error using the formula $$\mathbf{ J(\theta) = \frac{1}{m} \sum_{i=1}^{m}(\hat{y}_i - y_i)^2}$$

$\mathbf{R^2}$ is a statistical measure of how close the data are to the fitted regression line. $\mathbf{R^2}$ is always between 0% and 100%. 0% indicates that the model explains none of the variability of the response data around its mean; 100% indicates that the model explains all of the variability of the response data around the mean.

$$\mathbf{R^2 = 1 - \frac{SSE}{SST}}$$

SSE = Sum of Squared Errors
SST = Total Sum of Squares
$$\mathbf{SSE = \sum_{i=1}^{m}(\hat{y}_i - y_i)^2}$$ $$\mathbf{SST = \sum_{i=1}^{m}(y_i - \bar{y})^2}$$ Here $\mathbf{\hat{y}}$ is the predicted value and $\mathbf{\bar{y}}$ is the mean value of $\mathbf{y}$.
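A sketch of both metrics, computed directly from the formulas above:

```python
# Predict on the test set using the normal-equation parameters.
y_pred = X_test_b @ theta

mse = np.mean((y_pred - y_test) ** 2)        # J(theta)
sse = np.sum((y_pred - y_test) ** 2)         # Sum of Squared Errors
sst = np.sum((y_test - y_test.mean()) ** 2)  # Total Sum of Squares
r2 = 1 - sse / sst
print(f"MSE: {mse:.4f}  R^2: {r2:.4f}")
```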

Model Validation

In order to validate the model we need to check a few assumptions of the linear regression model. The common assumptions for a Linear Regression model are the following:

  1. Linear relationship: linear regression assumes the relationship between the dependent and independent variables is linear. This can be checked with a scatter plot of actual vs. predicted values.
  2. The residual error plot should be normally distributed.
  3. The mean of the residual errors should be 0, or as close to 0 as possible.
  4. Linear regression requires all variables to be multivariate normal. This assumption is best checked with a Q-Q plot.
  5. Linear regression assumes that there is little or no multicollinearity in the data. Multicollinearity occurs when the independent variables are too highly correlated with each other. The variance inflation factor (VIF) identifies correlation between independent variables and the strength of that correlation: $\mathbf{VIF = \frac {1}{1-R^2}}$. If $1 < VIF < 5$ there is moderate correlation; $VIF > 5$ indicates a critical level of multicollinearity (a sketch of this check appears after this list).
  6. Homoscedasticity: the data are homoscedastic, meaning the residuals are equal across the regression line. We can check this with a residual vs. fitted value scatter plot; a heteroscedastic plot would exhibit a funnel-shaped pattern.
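A sketch of the VIF check from item 5, using statsmodels and assuming X_train is fully numeric at this point:

```python
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF for each feature column, largest first.
vif = pd.DataFrame({
    "feature": X_train.columns,
    "VIF": [
        variance_inflation_factor(X_train.values, i)
        for i in range(X_train.shape[1])
    ],
})
print(vif.sort_values("VIF", ascending=False))
```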

For our linear regression models, these assumptions hold as follows:

  1. In our model, the actual vs. predicted plot for the (for sale) dataset is linear, so the linearity assumption holds; however, it fails for (for rent), as the plot is not aligned linearly but rather scattered roughly along two non-parallel lines.
  2. The residual mean is zero and the residual error plot is normally distributed for the (for sale) dataset, while for (for rent) the residual error plot is left-skewed and the mean is greater than 0.
  3. The Q-Q plot for (for sale) shows that the data are slightly skewed but mostly normally distributed; the Q-Q plot for (for rent) shows that the data are heavily left-skewed.
  4. The plot exhibits homoscedasticity for the (for sale) dataset and heteroscedasticity for the (for rent) dataset; the error increases after a certain point for the latter.
  5. The variance inflation factor for (for sale) is a little above 5, which means a critical level of multicollinearity, while for (for rent) it is below 1, so there is no multicollinearity.