Code Summary
You've explored the key concepts interactively — now reproduce them in code. These self-contained blocks cover everything you practiced above. Pick your language, copy the code, and run it.
# =============================================================================
# CHAPTER 10 CHEAT SHEET: Data Summary for Multiple Regression
# =============================================================================
# --- Libraries ---
import numpy as np # numerical operations
import pandas as pd # data loading and manipulation
import matplotlib.pyplot as plt # creating plots and visualizations
import seaborn as sns # statistical visualization (heatmaps, pairplots)
import pyfixest as pf # OLS regression with R-style formulas
# !pip install pyfixest # uncomment if running in Google Colab
from statsmodels.stats.outliers_influence import variance_inflation_factor # multicollinearity detection
# =============================================================================
# STEP 1: Load data and explore
# =============================================================================
# pd.read_stata() reads Stata .dta files directly from a URL
url = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA"
data_house = pd.read_stata(url)
print(f"Dataset: {data_house.shape[0]} observations, {data_house.shape[1]} variables")
print(data_house[['price', 'size', 'bedrooms', 'bathrooms', 'lotsize', 'age']].describe().round(2))
# =============================================================================
# STEP 2: Partial effects vs. total effects — why controls matter
# =============================================================================
# Bivariate regression captures TOTAL effect (direct + indirect through size)
fit_bivariate = pf.feols('price ~ bedrooms', data=data_house)
# Multiple regression isolates the PARTIAL effect (holding size constant)
fit_partial = pf.feols('price ~ bedrooms + size', data=data_house)
print(f"Bedrooms coefficient (bivariate): ${fit_bivariate.coef()['bedrooms']:,.2f}")
print(f"Bedrooms coefficient (multiple): ${fit_partial.coef()['bedrooms']:,.2f}")
print(f"Change: ${fit_partial.coef()['bedrooms'] - fit_bivariate.coef()['bedrooms']:,.2f}")
# The coefficient drops from ~$23,667 to ~$1,553 once we control for size
# =============================================================================
# STEP 3: Correlation matrix — check pairwise associations before modeling
# =============================================================================
# High correlations between regressors signal potential multicollinearity
corr_vars = ['price', 'size', 'bedrooms', 'bathrooms', 'lotsize', 'age']
corr_matrix = data_house[corr_vars].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.3f', cmap='coolwarm', center=0,
square=True, linewidths=1)
ax.set_title('Correlation Matrix Heatmap')
plt.tight_layout()
plt.show()
print(f"Price-Size correlation: {corr_matrix.loc['price', 'size']:.3f}")
print(f"Size-Bedrooms correlation: {corr_matrix.loc['size', 'bedrooms']:.3f}")
# =============================================================================
# STEP 4: Full multiple regression — estimate partial effects
# =============================================================================
# Each coefficient measures the change in price for a one-unit change in that
# variable, holding ALL other regressors constant
fit_full = pf.feols('price ~ size + bedrooms + bathrooms + lotsize + age + monthsold',
data=data_house)
size_coef = fit_full.coef()['size']
print(f"Size effect: each additional sq ft is associated with ${size_coef:,.2f} higher price")
print(f"R-squared: {fit_full._r2:.4f} ({fit_full._r2*100:.1f}% of variation explained)")
print(f"Adjusted R-squared: {fit_full._adj_r2:.4f}")
# Full regression table (coefficients, std errors, t-stats, p-values)
fit_full.summary()
# =============================================================================
# STEP 5: FWL theorem — how "holding constant" actually works
# =============================================================================
# Step A: Regress size on all other regressors, keep residuals
fit_size_on_others = pf.feols('size ~ bedrooms + bathrooms + lotsize + age + monthsold',
data=data_house)
resid_size = fit_size_on_others._u_hat
# Step B: Regress price on those residuals — the slope matches the full model
data_house['resid_size'] = resid_size
fit_fwl = pf.feols('price ~ resid_size', data=data_house)
print(f"Size coef from FULL regression: {fit_full.coef()['size']:.10f}")
print(f"Coef from FWL residual regression: {fit_fwl.coef()['resid_size']:.10f}")
# These are identical — the FWL theorem in action
# =============================================================================
# STEP 6: Model comparison — parsimony vs. complexity
# =============================================================================
# Compare simple (size only) vs. full model using fit statistics
fit_simple = pf.feols('price ~ size', data=data_house)
# AIC/BIC computed manually: AIC = n*ln(RSS/n) + 2k, BIC = n*ln(RSS/n) + k*ln(n)
def aic_bic(fit):
n = fit._N
k = len(fit.coef())
rss = np.sum(fit._u_hat**2)
aic = n * np.log(rss / n) + 2 * k
bic = n * np.log(rss / n) + k * np.log(n)
return aic, bic
comparison = pd.DataFrame({
'Model': ['Size only', 'Full (all variables)'],
'R\u00b2': [fit_simple._r2, fit_full._r2],
'Adj R\u00b2': [fit_simple._adj_r2, fit_full._adj_r2],
'AIC': [aic_bic(fit_simple)[0], aic_bic(fit_full)[0]],
'BIC': [aic_bic(fit_simple)[1], aic_bic(fit_full)[1]],
})
print(comparison.to_string(index=False))
# Adj R\u00b2 DECREASES when adding 5 weak predictors — parsimony wins
# =============================================================================
# STEP 7: VIF — detect multicollinearity
# =============================================================================
# VIF_j = 1 / (1 - R\u00b2_j), where R\u00b2_j is from regressing x_j on all other x's
# VIF > 5: moderate concern; VIF > 10: severe multicollinearity
X = data_house[['size', 'bedrooms', 'bathrooms', 'lotsize', 'age', 'monthsold']]
vif_data = pd.DataFrame({
'Variable': X.columns,
'VIF': [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
})
print(vif_data.to_string(index=False))
# =============================================================================
# STEP 8: Diagnostics — actual vs. predicted and residual plots
# =============================================================================
# Good fit: points hug the 45\u00b0 line; residuals scatter randomly around zero
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Actual vs. predicted
axes[0].scatter(data_house['price'], fit_full.predict(), alpha=0.7, s=50)
axes[0].plot([data_house['price'].min(), data_house['price'].max()],
[data_house['price'].min(), data_house['price'].max()],
'r--', linewidth=2, label='Perfect prediction (45\u00b0 line)')
axes[0].set_xlabel('Actual Price ($)')
axes[0].set_ylabel('Predicted Price ($)')
axes[0].set_title('Actual vs Predicted')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# Residual plot
axes[1].scatter(fit_full.predict(), fit_full._u_hat, alpha=0.7, s=50)
axes[1].axhline(y=0, color='red', linestyle='--', linewidth=2)
axes[1].set_xlabel('Fitted Values ($)')
axes[1].set_ylabel('Residuals ($)')
axes[1].set_title('Residual Plot')
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
* =============================================================================
* CHAPTER 10 CHEAT SHEET: Data Summary for Multiple Regression
* =============================================================================
* --- Setup ---
clear all // start with a clean workspace
set more off // do not pause output for long results
* =============================================================================
* STEP 1: Load data and explore
* =============================================================================
* use loads a Stata .dta file; "clear" drops any data already in memory
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA", clear
describe // list all variables, types, and labels
display "Observations: " _N // _N is Stata's built-in observation count
summarize price size bedrooms bathrooms lotsize age
* =============================================================================
* STEP 2: Partial effects vs. total effects — why controls matter
* =============================================================================
* Bivariate regression captures TOTAL effect (direct + indirect through size)
regress price bedrooms
estimates store bivariate // save results for comparison
* Multiple regression isolates the PARTIAL effect (holding size constant)
regress price bedrooms size
estimates store partial // save results for comparison
* Compare: the bedrooms coefficient drops from ~$23,667 to ~$1,553
estimates table bivariate partial, b(%9.2f) se(%9.2f) stats(r2 r2_a N)
* =============================================================================
* STEP 3: Correlation matrix — check pairwise associations before modeling
* =============================================================================
* High correlations between regressors signal potential multicollinearity
* correlate shows the full pairwise correlation matrix
correlate price size bedrooms bathrooms lotsize age
* pwcorr adds significance stars and p-values
pwcorr price size bedrooms bathrooms lotsize age, star(0.05) sig
* =============================================================================
* STEP 4: Full multiple regression — estimate partial effects
* =============================================================================
* Each coefficient measures the change in price for a one-unit change in that
* variable, holding ALL other regressors constant
regress price size bedrooms bathrooms lotsize age monthsold
estimates store full // save full model results
display "Size effect: each additional sq ft is associated with $" _b[size] " higher price"
display "R-squared: " e(r2)
display "Adjusted R-squared: " e(r2_a)
* =============================================================================
* STEP 5: FWL theorem — how "holding constant" actually works
* =============================================================================
* Step A: Regress size on all other regressors, keep residuals
regress size bedrooms bathrooms lotsize age monthsold
predict resid_size, residuals // residuals = variation in size unexplained by controls
* Step B: Regress price on those residuals — the slope matches the full model
regress price resid_size
* Compare: this coefficient matches the size coefficient from the full model
display "FWL slope: " _b[resid_size]
estimates restore full
display "Full regression coef: " _b[size]
// These are identical — the FWL theorem in action
* =============================================================================
* STEP 6: Model comparison — parsimony vs. complexity
* =============================================================================
* Compare simple (size only) vs. full model using fit statistics
* Simple model
quietly regress price size
estimates store simple
estat ic // AIC and BIC
* Full model
quietly regress price size bedrooms bathrooms lotsize age monthsold
estimates store full
estat ic // AIC and BIC
* Side-by-side comparison: R², adj R², coefficients
estimates table simple full, b(%9.2f) se(%9.2f) stats(r2 r2_a N)
// Adj R² DECREASES when adding 5 weak predictors — parsimony wins
* =============================================================================
* STEP 7: VIF — detect multicollinearity
* =============================================================================
* VIF_j = 1 / (1 - R²_j), where R²_j is from regressing x_j on all other x's
* VIF > 5: moderate concern; VIF > 10: severe multicollinearity
quietly regress price size bedrooms bathrooms lotsize age monthsold
estat vif // VIF for each regressor after the last regression
* =============================================================================
* STEP 8: Diagnostics — actual vs. predicted and residual plots
* =============================================================================
* Good fit: points hug the 45° line; residuals scatter randomly around zero
quietly regress price size bedrooms bathrooms lotsize age monthsold
predict yhat, xb // predicted values
predict resid, residuals // residuals
* Actual vs. predicted plot
twoway (scatter price yhat, mcolor(navy) msymbol(circle)) ///
(function y=x, range(yhat) lcolor(red) lpattern(dash) lwidth(medthick)), ///
xtitle("Predicted Price ($)") ytitle("Actual Price ($)") ///
title("Actual vs Predicted") ///
legend(order(1 "Observations" 2 "45° line"))
* Residual plot
twoway (scatter resid yhat, mcolor(navy) msymbol(circle)) ///
(function y=0, range(yhat) lcolor(red) lpattern(dash) lwidth(medthick)), ///
xtitle("Fitted Values ($)") ytitle("Residuals ($)") ///
title("Residual Plot") ///
legend(order(1 "Residuals" 2 "Zero line"))
Paste into your Stata do-file editor
# =============================================================================
# CHAPTER 10 CHEAT SHEET: Data Summary for Multiple Regression
# =============================================================================
# --- Libraries ---
library(haven) # read Stata .dta files
library(fixest) # fast OLS estimation with feols()
library(dplyr) # data manipulation
library(ggplot2) # grammar of graphics
library(car) # vif() for multicollinearity detection
# =============================================================================
# STEP 1: Load data and explore
# =============================================================================
url <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA"
data_house <- read_dta(url)
cat("Dataset:", nrow(data_house), "observations,", ncol(data_house), "variables\n")
summary(data_house[, c("price", "size", "bedrooms", "bathrooms", "lotsize", "age")])
# =============================================================================
# STEP 2: Partial effects vs. total effects — why controls matter
# =============================================================================
# Bivariate regression captures TOTAL effect (direct + indirect through size)
model_bivariate <- feols(price ~ bedrooms, data = data_house)
# Multiple regression isolates the PARTIAL effect (holding size constant)
model_partial <- feols(price ~ bedrooms + size, data = data_house)
cat("Bedrooms coefficient (bivariate): $",
round(coef(model_bivariate)["bedrooms"], 2), "\n")
cat("Bedrooms coefficient (multiple): $",
round(coef(model_partial)["bedrooms"], 2), "\n")
# =============================================================================
# STEP 3: Correlation matrix — check pairwise associations before modeling
# =============================================================================
corr_vars <- c("price", "size", "bedrooms", "bathrooms", "lotsize", "age")
corr_matrix <- cor(data_house[, corr_vars])
round(corr_matrix, 3)
# =============================================================================
# STEP 4: Full multiple regression — estimate partial effects
# =============================================================================
model_full <- feols(price ~ size + bedrooms + bathrooms + lotsize + age + monthsold,
data = data_house)
summary(model_full)
cat("Size effect: each additional sq ft → $",
round(coef(model_full)["size"], 2), "higher price\n")
cat("R-squared:", round(r2(model_full), 4), "\n")
cat("Adjusted R-squared:", round(r2(model_full, "ar2"), 4), "\n")
# =============================================================================
# STEP 5: FWL theorem — how "holding constant" actually works
# =============================================================================
# Step A: Regress size on all other regressors, keep residuals
model_size_on_others <- feols(size ~ bedrooms + bathrooms + lotsize + age + monthsold,
data = data_house)
resid_size <- residuals(model_size_on_others)
# Step B: Regress price on those residuals — the slope matches the full model
model_fwl <- feols(price ~ resid_size, data = data.frame(
price = data_house$price, resid_size = resid_size))
cat("Size coef from FULL regression: ", coef(model_full)["size"], "\n")
cat("Coef from FWL residual regression: ", coef(model_fwl)["resid_size"], "\n")
# These are identical — the FWL theorem in action
# =============================================================================
# STEP 6: Model comparison — parsimony vs. complexity
# =============================================================================
model_simple <- feols(price ~ size, data = data_house)
# etable() from fixest compares models side by side
etable(model_simple, model_full,
headers = c("Size only", "Full model"),
fitstat = c("r2", "ar2", "n"))
# =============================================================================
# STEP 7: VIF — detect multicollinearity
# =============================================================================
# VIF_j = 1 / (1 - R²_j); VIF > 5: moderate concern; VIF > 10: severe
# car::vif() requires an lm object
model_lm <- lm(price ~ size + bedrooms + bathrooms + lotsize + age + monthsold,
data = data_house)
vif(model_lm)
# =============================================================================
# STEP 8: Diagnostics — actual vs. predicted and residual plots
# =============================================================================
data_house$yhat <- fitted(model_full)
data_house$resid <- residuals(model_full)
library(patchwork)
p1 <- ggplot(data_house, aes(x = yhat, y = price)) +
geom_point(alpha = 0.7) +
geom_abline(color = "red", linetype = "dashed", linewidth = 1.2) +
labs(x = "Predicted Price ($)", y = "Actual Price ($)",
title = "Actual vs Predicted") +
theme_minimal()
p2 <- ggplot(data_house, aes(x = yhat, y = resid)) +
geom_point(alpha = 0.7) +
geom_hline(yintercept = 0, color = "red", linetype = "dashed", linewidth = 1.2) +
labs(x = "Fitted Values ($)", y = "Residuals ($)",
title = "Residual Plot") +
theme_minimal()
p1 + p2
Paste into your R console or RStudio