Final Assignment
a.)
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.formula.api as sm
import statsmodels.stats.api as sms

# Load the data
df = pd.read_excel("HousePrices.xls")

# Fit the linear model
model = sm.ols('sell ~ lot + bdms + fb + sty + drv + rec + ffin + ghw + ca + gar + reg', data=df).fit()

# Perform the Harvey-Collier test for linearity
# (a t-test that the recursive residuals have mean zero)
test_statistic, p_value = sms.linear_harvey_collier(model)

print("Harvey-Collier test statistic:", test_statistic)
print("p-value:", p_value)

# Interpret the results
if p_value < 0.05:
    print("Reject the null hypothesis of linearity.")
else:
    print("Fail to reject the null hypothesis of linearity.")
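For reference, the Harvey-Collier statistic is an ordinary t-test that the recursive residuals $w_t$ of the fitted model have mean zero; with $n$ observations and $k$ regressors,

$$ t \;=\; \frac{\bar{w}}{s_w/\sqrt{n-k}} \;\sim\; t_{n-k-1} \quad \text{under } H_0:\ \text{the linear specification is correct}, $$

so a small p-value indicates a misspecified functional form.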
b.) The test suggests that after taking the log of the sale price, the linear specification is adequate (the null of linearity is no longer rejected).
# Fit the linear model with the log of the sale price
model = sm.ols('np.log(sell) ~ lot + bdms + fb + sty + drv + rec + ffin + ghw + ca + gar + reg', data=df).fit()

# Perform the Harvey-Collier test for linearity
test_statistic, p_value = sms.linear_harvey_collier(model)

print("Harvey-Collier test statistic:", test_statistic)
print("p-value:", p_value)

# Interpret the results
if p_value < 0.05:
    print("Reject the null hypothesis of linearity.")
else:
    print("Fail to reject the null hypothesis of linearity.")
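As an optional visual check (not part of the original output), the residuals of the log model can be plotted against the fitted values; under a correct linear specification they should show no systematic pattern:

import matplotlib.pyplot as plt

# Residuals vs. fitted values for the log-price model
plt.scatter(model.fittedvalues, model.resid, s=10)
plt.axhline(0, color='red', linewidth=1)
plt.xlabel("Fitted values")
plt.ylabel("Residuals")
plt.title("Residuals vs. fitted, log(sell) model")
plt.show()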
c.)
# Add the log of the lot size as an additional regressor
model = sm.ols('np.log(sell) ~ lot + np.log(lot) + bdms + fb + sty + drv + rec + ffin + ghw + ca + gar + reg', data=df).fit()

# Perform the Harvey-Collier test for linearity
test_statistic, p_value = sms.linear_harvey_collier(model)

print("Harvey-Collier test statistic:", test_statistic)
print("p-value:", p_value)

# Interpret the results
if p_value < 0.05:
    print("Reject the null hypothesis of linearity.")
else:
    print("Fail to reject the null hypothesis of linearity.")
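Because both lot and np.log(lot) enter the model, the marginal effect of lot size on the log price combines two coefficients:

$$ \frac{\partial \log(\text{sell})}{\partial \text{lot}} \;=\; \beta_{\text{lot}} + \frac{\beta_{\log(\text{lot})}}{\text{lot}}, $$

so the price effect of an extra unit of lot area shrinks as the lot gets larger.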
d.)
# Create interaction terms of log(lot) with the other explanatory variables
interaction_terms = []
for var in ['bdms', 'fb', 'sty', 'drv', 'rec', 'ffin', 'ghw', 'ca', 'gar', 'reg']:
    df['lot_' + var] = np.log(df['lot']) * df[var]
    interaction_terms.append('lot_' + var)

# Fit the model with all interaction terms added
base = ('np.log(sell) ~ lot + np.log(lot) + bdms + fb + sty + drv + rec'
        ' + ffin + ghw + ca + gar + reg')
model_interaction = sm.ols(base + ' + ' + ' + '.join(interaction_terms), data=df).fit()

# Count the individually significant interaction terms (5% level)
significant_interactions = sum(model_interaction.pvalues[t] < 0.05 for t in interaction_terms)
print("Number of individually significant interaction terms:", significant_interactions)
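As a side note, the same products can be specified directly in the formula with patsy's `:` interaction operator, which avoids creating the extra columns by hand; a sketch of an equivalent fit (`model_alt` is just an illustrative name):

# Equivalent model using patsy's ':' operator, which multiplies numeric terms
formula = (base + ' + np.log(lot):(bdms + fb + sty + drv + rec + ffin'
           ' + ghw + ca + gar + reg)')
model_alt = sm.ols(formula, data=df).fit()
print(model_alt.params.filter(like=':'))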
e.)
# Fit the model without interaction terms (the restricted, null model)
null_model = sm.ols(base, data=df).fit()

# F-test for the joint significance of the interaction terms
dfn = int(null_model.df_resid - model_interaction.df_resid)  # number of restrictions
f_statistic = ((null_model.ssr - model_interaction.ssr) / dfn) / (model_interaction.ssr / model_interaction.df_resid)
p_value = 1 - stats.f.cdf(f_statistic, dfn=dfn, dfd=model_interaction.df_resid)

print("F-statistic:", f_statistic)
print("p-value:", p_value)

# Interpret the results
if p_value < 0.05:
    print("Reject the null hypothesis of no joint significance of interaction effects.")
else:
    print("Fail to reject the null hypothesis of no joint significance of interaction effects.")
F-statistic: 1.7161547474045966
p-value: 0.0666292461322634
Fail to reject the null hypothesis of no joint significance of interaction effects.
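The same joint test is available directly in statsmodels, which derives the number of restrictions from the two fitted models:

# Built-in equivalent of the manual F-test above
f_value, p_val, df_diff = model_interaction.compare_f_test(null_model)
print("F-statistic:", f_value, "p-value:", p_val, "restrictions:", df_diff)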
f.)
print(model_interaction.summary())
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.69e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
# Removing lot_ffin (this code cell was cut off in the printout; reconstructed
# from the surrounding steps)
model_interaction = sm.ols(base + ' + lot_bdms + lot_fb + lot_sty + lot_drv + lot_rec + lot_ghw + lot_ca + lot_gar + lot_reg', data=df).fit()
print(model_interaction.summary())

(upper rows of the summary table truncated in the printout)
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
reg            0.1679      0.463      0.362      0.717      -0.742       1.078
lot_bdms       0.0171      0.039      0.442      0.659      -0.059       0.093
lot_fb         0.0736      0.050      1.468      0.143      -0.025       0.172
lot_sty       -0.0677      0.036     -1.859      0.064      -0.139       0.004
lot_drv        0.2631      0.093      2.840      0.005       0.081       0.445
lot_rec       -0.1914      0.073     -2.606      0.009      -0.336      -0.047
lot_ghw        0.0919      0.106      0.865      0.387      -0.117       0.301
lot_ca         0.0684      0.057      1.191      0.234      -0.044       0.181
lot_gar       -0.0199      0.031     -0.630      0.529      -0.082       0.042
lot_reg       -0.0037      0.054     -0.069      0.945      -0.110       0.102
==============================================================================
Omnibus:                        7.366   Durbin-Watson:                   1.517
Prob(Omnibus):                  0.025   Jarque-Bera (JB):                8.495
Skew:                          -0.177   Prob(JB):                       0.0143
Kurtosis:                       3.499   Cond. No.                     7.61e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.61e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
# Removing lot_reg, the largest p-value (code cell cut off in the printout; reconstructed)
model_interaction = sm.ols(base + ' + lot_bdms + lot_fb + lot_sty + lot_drv + lot_rec + lot_ghw + lot_ca + lot_gar', data=df).fit()
print(model_interaction.summary())
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.6e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
# Removing lot_bdms (code cell cut off in the printout; reconstructed)
model_interaction = sm.ols(base + ' + lot_fb + lot_sty + lot_drv + lot_rec + lot_ghw + lot_ca + lot_gar', data=df).fit()
print(model_interaction.summary())
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 6.62e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
Hence the interaction terms with high p-values are removed one at a time; the model left after these eliminations, keeping only the lot_drv and lot_rec interactions, is the final model.
# Removing lot_gar
model_interaction = sm.ols(base + ' + lot_fb + lot_sty + lot_drv + lot_rec + lot_ghw + lot_ca', data=df).fit()
print(model_interaction.summary())
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 6.51e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
# Removing lot_ghw
model_interaction = sm.ols(base + ' + lot_fb + lot_sty + lot_drv + lot_rec + lot_ca', data=df).fit()
print(model_interaction.summary())
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 6.44e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
# Removing lot_ca
model_interaction = sm.ols(base + ' + lot_fb + lot_sty + lot_drv + lot_rec', data=df).fit()
print(model_interaction.summary())
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 6.44e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
# Removing lot_sty
model_interaction = sm.ols(base + ' + lot_fb + lot_drv + lot_rec', data=df).fit()
print(model_interaction.summary())
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 6.12e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
# Removing lot_fb
model_interaction = sm.ols(base + ' + lot_drv + lot_rec', data=df).fit()
print(model_interaction.summary())
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.74e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
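The same backward elimination can be written as a loop (a sketch under the same 5% rule, reusing base and interaction_terms from part d; the result depends on the data at each refit):

# Backward elimination: repeatedly drop the least significant interaction term
terms = list(interaction_terms)
while terms:
    fit = sm.ols(base + ' + ' + ' + '.join(terms), data=df).fit()
    pvals = fit.pvalues[terms]
    if pvals.max() < 0.05:           # all remaining terms significant: stop
        break
    terms.remove(pvals.idxmax())     # drop the weakest term and refit
print("Retained interaction terms:", terms)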
g.) Omitting a relevant variable can lead to biased estimates of the coefficients of the included variables. The bias arises because the omitted variable may be correlated with both the dependent variable and one or more of the included explanatory variables; in that case the predicted sale price will likely be overestimated.
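In the two-regressor case this is the classic omitted-variable-bias formula: if the true model is $y = \beta_1 x_1 + \beta_2 x_2 + \varepsilon$ and $x_2$ is omitted, then

$$ E[\hat{\beta}_1] \;=\; \beta_1 + \beta_2\,\frac{\operatorname{Cov}(x_1, x_2)}{\operatorname{Var}(x_1)}, $$

so the sign of the bias depends on the sign of $\beta_2$ and on the correlation between the included and omitted regressors.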
h.)
# Fit the model on the first 400 observations
train_df = df.iloc[:400]
model = sm.ols('np.log(sell) ~ lot + np.log(lot) + bdms + fb + sty + drv + rec + ffin + ghw + ca + gar + reg', data=train_df).fit()

# Make predictions for the remaining 146 observations
test_df = df.iloc[400:]
predictions = model.predict(test_df)

# Calculate the mean squared prediction error
mse = np.mean((np.log(test_df['sell']) - predictions) ** 2)
print("Mean squared prediction error:", mse)
print(np.log(df["sell"]))
0 10.645425
1 10.558414
2 10.809728
3 11.010399
4 11.018629
...
541 11.424094
542 11.451050
543 11.542484
544 11.561716
545 11.561716
Name: sell, Length: 546, dtype: float64
# The error is pretty high considering the distribution and variance of the log sale prices.
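One way to make this concrete (an illustrative check, not part of the original output) is to compare the prediction MSE with the variance of the log prices in the hold-out sample; a ratio close to 1 would mean the model predicts no better than the hold-out mean:

# Benchmark: variance of log(sell) over the 146 hold-out observations
test_var = np.log(test_df['sell']).var()
print("Hold-out variance of log(sell):", test_var)
print("MSE / variance:", mse / test_var)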