Code Summary
You've explored the key concepts interactively — now reproduce them in code. These self-contained blocks cover everything you practiced above. Pick your language, copy the code, and run it.
# =============================================================================
# CHAPTER 12 CHEAT SHEET: Further Topics in Multiple Regression
# =============================================================================
# --- Libraries ---
import pandas as pd # data loading and manipulation
import numpy as np # numerical operations
import matplotlib.pyplot as plt # creating plots and visualizations
import pyfixest as pf # OLS regression with R-style formulas
# !pip install pyfixest # uncomment if running in Google Colab
import statsmodels.api as sm # prediction tools (get_prediction)
from scipy import stats # statistical distributions for CIs and power
from statsmodels.graphics.tsaplots import plot_acf # autocorrelation plots
# =============================================================================
# STEP 1: Load house price data from a URL
# =============================================================================
# pd.read_stata() reads Stata .dta files directly from the web
url_house = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA"
data_house = pd.read_stata(url_house)
print(f"Dataset: {data_house.shape[0]} observations, {data_house.shape[1]} variables")
print(data_house[['price', 'size', 'bedrooms', 'bathrooms', 'lotsize', 'age']].describe().round(2))
# =============================================================================
# STEP 2: OLS regression with default standard errors
# =============================================================================
# Default SEs assume homoskedasticity (constant error variance)
fit_default = pf.feols('price ~ size + bedrooms + bathrooms + lotsize + age + monthsold',
data=data_house)
print(f"\nR-squared: {fit_default._r2:.4f}")
print(f"Size effect (default): ${fit_default.coef()['size']:,.2f} per sq ft")
fit_default.summary()
# =============================================================================
# STEP 3: Robust (HC1) standard errors — compare with default
# =============================================================================
# HC1 SEs correct for heteroskedasticity without changing coefficient estimates
# Only the SEs, t-statistics, and confidence intervals change
fit_robust = pf.feols('price ~ size + bedrooms + bathrooms + lotsize + age + monthsold',
data=data_house, vcov='HC1')
# SE ratio: how much heteroskedasticity affects each coefficient's uncertainty
print(f"{'Variable':<14} {'Coef':>10} {'Default SE':>12} {'Robust SE':>12} {'Ratio':>8}")
print("-" * 58)
for var in fit_default.coef().index:
coef = fit_default.coef()[var]
se_d = fit_default.se()[var]
se_r = fit_robust.se()[var]
ratio = se_r / se_d # ratio > 1 → default was too optimistic
print(f"{var:<14} {coef:>10.2f} {se_d:>12.2f} {se_r:>12.2f} {ratio:>8.3f}")
# =============================================================================
# STEP 4: Prediction — conditional mean CI vs. individual forecast PI
# =============================================================================
# Predicting E[y|x*] is more precise than predicting an individual y|x*
fit_simple = pf.feols('price ~ size', data=data_house)
# Predict for a 2000 sq ft house — use statsmodels get_prediction for CI/PI
# pyfixest provides point predictions; statsmodels computes interval bands
sm_simple = sm.OLS.from_formula('price ~ size', data=data_house).fit()
new_house = pd.DataFrame({'size': [2000]})
pred = sm_simple.get_prediction(new_house)
pred_frame = pred.summary_frame(alpha=0.05)
s_e = np.sqrt(np.sum(fit_simple._u_hat**2) / fit_simple._N) # RMSE
print(f"\nPredicted price (2000 sq ft): ${pred_frame['mean'].values[0]:,.0f}")
print(f"95% CI for E[Y|X=2000]: [${pred_frame['mean_ci_lower'].values[0]:,.0f}, "
f"${pred_frame['mean_ci_upper'].values[0]:,.0f}]")
print(f"95% PI for individual Y: [${pred_frame['obs_ci_lower'].values[0]:,.0f}, "
f"${pred_frame['obs_ci_upper'].values[0]:,.0f}]")
print(f"\nRMSE (s_e): ${s_e:,.0f} — PI can never be narrower than ±1.96 × s_e")
# Visualize: CI (narrow) vs. PI (wide)
sizes_sorted = data_house[['size']].sort_values('size')
pred_all = sm_simple.get_prediction(sizes_sorted)
pf_pred = pred_all.summary_frame(alpha=0.05)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data_house['size'], data_house['price'], s=50, alpha=0.6, label='Actual')
ax.plot(sizes_sorted['size'], pf_pred['mean'], color='red', linewidth=2, label='Fitted line')
ax.fill_between(sizes_sorted['size'], pf_pred['mean_ci_lower'], pf_pred['mean_ci_upper'],
color='red', alpha=0.2, label='95% CI (conditional mean)')
ax.fill_between(sizes_sorted['size'], pf_pred['obs_ci_lower'], pf_pred['obs_ci_upper'],
color='blue', alpha=0.1, label='95% PI (individual)')
ax.set_xlabel('House Size (sq ft)')
ax.set_ylabel('House Price ($)')
ax.set_title('Confidence Interval vs. Prediction Interval')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# =============================================================================
# STEP 5: HAC (Newey-West) standard errors for time series
# =============================================================================
# Time series errors are often autocorrelated — HAC SEs account for this
url_gdp = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_REALGDPPC.DTA"
data_gdp = pd.read_stata(url_gdp)
mean_growth = data_gdp['growth'].mean()
n_gdp = len(data_gdp)
lag_length = int(0.75 * n_gdp**(1/3)) # rule of thumb: m = 0.75 × T^(1/3)
# Compare default SE vs. HAC SE for the mean
se_default = data_gdp['growth'].std() / np.sqrt(n_gdp)
# pyfixest NW requires a time_id column — create sequential index
data_gdp['_time'] = range(len(data_gdp))
data_gdp['_one'] = 1 # intercept-only: regress growth on constant
fit_hac = pf.feols('growth ~ 1', data=data_gdp,
vcov='NW', vcov_kwargs={'time_id': '_time', 'lag': lag_length})
print(f"\nGDP Growth: mean = {mean_growth:.4f}")
print(f"Default SE (no autocorrelation): {se_default:.6f}")
print(f"HAC SE (lag = {lag_length}): {fit_hac.se()['Intercept']:.6f}")
print(f"Ratio HAC/Default: {fit_hac.se()['Intercept'] / se_default:.3f}")
# Correlogram — visualize autocorrelation structure
fig, ax = plt.subplots(figsize=(10, 6))
plot_acf(data_gdp['growth'], lags=10, ax=ax, alpha=0.05)
ax.set_xlabel('Lag')
ax.set_ylabel('Autocorrelation')
ax.set_title('Correlogram of GDP Growth')
plt.tight_layout()
plt.show()
# =============================================================================
# STEP 6: Power curve — Type I vs. Type II error tradeoff
# =============================================================================
# Power = P(reject H₀ | H₀ is false) — depends on true effect size
alpha = 0.05
se_test = 15 # hypothetical standard error
n_test = 30
t_crit = stats.t.ppf(1 - alpha / 2, df=n_test - 1)
# Power as a function of the true coefficient
beta_range = np.linspace(-60, 60, 500)
power = []
for beta_true in beta_range:
ncp = beta_true / se_test # non-centrality parameter
# Two-sided test: reject if |t| > t_crit
power_val = 1 - (stats.t.cdf(t_crit - ncp, df=n_test - 1)
- stats.t.cdf(-t_crit - ncp, df=n_test - 1))
power.append(power_val)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(beta_range, power, linewidth=2)
ax.axhline(0.05, color='gray', linestyle='--', alpha=0.5, label='α = 0.05 (Type I)')
ax.axhline(0.80, color='green', linestyle='--', alpha=0.5, label='Power = 0.80 target')
ax.axvline(0, color='gray', linestyle='-', alpha=0.3, label='H₀: β = 0')
ax.set_xlabel('True Coefficient Value (β)')
ax.set_ylabel('Power = P(Reject H₀)')
ax.set_title('Power Curve: Larger Effects Are Easier to Detect')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# =============================================================================
# STEP 7: Bootstrap confidence intervals — no distributional assumptions
# =============================================================================
# Resample data with replacement to estimate the sampling distribution of β
np.random.seed(42)
n_boot = 1000
boot_slopes = []
for _ in range(n_boot):
boot_sample = data_house.sample(n=len(data_house), replace=True)
boot_fit = pf.feols('price ~ size', data=boot_sample)
boot_slopes.append(boot_fit.coef()['size'])
# Percentile method: 95% CI = [2.5th, 97.5th percentile]
ci_lower = np.percentile(boot_slopes, 2.5)
ci_upper = np.percentile(boot_slopes, 97.5)
ols_slope = fit_simple.coef()['size']
print(f"\nOLS slope (size): {ols_slope:.4f}")
print(f"Bootstrap 95% CI: [{ci_lower:.4f}, {ci_upper:.4f}]")
print(f"Analytical 95% CI: [{fit_simple.confint().loc['size'][0]:.4f}, "
f"{fit_simple.confint().loc['size'][1]:.4f}]")
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(boot_slopes, bins=40, edgecolor='black', alpha=0.7)
ax.axvline(ols_slope, color='red', linewidth=2, label=f'OLS estimate: {ols_slope:.4f}')
ax.axvline(ci_lower, color='green', linewidth=2, linestyle='--', label=f'2.5th pctl: {ci_lower:.4f}')
ax.axvline(ci_upper, color='green', linewidth=2, linestyle='--', label=f'97.5th pctl: {ci_upper:.4f}')
ax.set_xlabel('Bootstrap Slope Estimates')
ax.set_ylabel('Frequency')
ax.set_title('Bootstrap Distribution of Size Coefficient (1000 replications)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
* =============================================================================
* CHAPTER 12 CHEAT SHEET: Further Topics in Multiple Regression
* =============================================================================
* --- Setup ---
clear all // start with a clean workspace
set more off // do not pause output for long results
* =============================================================================
* STEP 1: Load house price data from a URL
* =============================================================================
* use loads a Stata .dta file; "clear" drops any data already in memory
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA", clear
describe // list all variables, types, and labels
display "Observations: " _N // _N is Stata's built-in observation count
summarize price size bedrooms bathrooms lotsize age
* =============================================================================
* STEP 2: OLS regression with default standard errors
* =============================================================================
* Default SEs assume homoskedasticity (constant error variance)
regress price size bedrooms bathrooms lotsize age monthsold
// After regress, Stata stores results you can reference:
display "R-squared: " e(r2)
display "Size effect: " _b[size] // _b[varname] retrieves coefficients
* =============================================================================
* STEP 3: Robust (HC1) standard errors — compare with default
* =============================================================================
* Adding , robust to regress requests HC1 heteroskedasticity-robust SEs
* Coefficient estimates stay identical; only SEs, t-stats, CIs change
regress price size bedrooms bathrooms lotsize age monthsold, robust
// estimates store saves results so you can compare models later
estimates store robust_model
// Re-run default for side-by-side comparison
quietly regress price size bedrooms bathrooms lotsize age monthsold
estimates store default_model
// estimates table displays both sets of results side by side
estimates table default_model robust_model, se stats(r2 N)
* =============================================================================
* STEP 4: Prediction — conditional mean CI vs. individual forecast PI
* =============================================================================
* Predicting E[y|x*] is more precise than predicting an individual y|x*
regress price size
// predict generates fitted values for every observation
predict yhat, xb // xb = linear prediction (fitted value)
// For a specific prediction at size = 2000:
// margins computes predicted values at specified covariate values
margins, at(size = 2000) // predicted E[Y|X=2000] with 95% CI
// For the prediction interval (individual), we need RMSE
display "RMSE (s_e): " e(rmse) // irreducible individual uncertainty
// Visualize fitted line with CI bands
// lfitci plots the linear fit with confidence interval
twoway (scatter price size, msize(small) mcolor(gs10)) ///
(lfitci price size, level(95) fcolor(red%20) lcolor(red)), ///
xtitle("House Size (sq ft)") ytitle("House Price ($)") ///
title("Confidence Interval for the Conditional Mean") ///
legend(order(1 "Actual" 3 "Fitted line" 2 "95% CI"))
* =============================================================================
* STEP 5: HAC (Newey-West) standard errors for time series
* =============================================================================
* Time series errors are often autocorrelated — HAC SEs account for this
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_REALGDPPC.DTA", clear
summarize growth
display "Mean GDP growth: " r(mean)
// tsset declares the time-series structure
tsset daten
// newey estimates regression with Newey-West HAC standard errors
// lag(4) sets the maximum lag for the autocorrelation correction
newey growth, lag(4)
display "HAC SE: " _se[_cons]
// Compare with default OLS SE
regress growth
display "Default SE: " _se[_cons]
// Correlogram — visualize autocorrelation structure
// ac draws the autocorrelation function plot
ac growth, lags(10) level(95) ///
ytitle("Autocorrelation") ///
title("Correlogram of GDP Growth")
* =============================================================================
* STEP 6: Power curve — Type I vs. Type II error tradeoff
* =============================================================================
* Power = P(reject H0 | H0 is false) — depends on true effect size
* Stata's power command computes power for common tests
// power twomeans: power for testing whether a mean differs from a value
// power onettest: power for a one-sample t-test scenario
// Here we compute power manually for a range of effect sizes:
clear
set obs 500
gen beta_true = -60 + (_n - 1) * 240 / 499 // range -60 to 60
local se = 15 // hypothetical standard error
local n = 30 // sample size
local df = `n' - 1
local t_crit = invttail(`df', 0.025) // two-sided critical value at alpha=0.05
// Non-centrality parameter and power
gen ncp = beta_true / `se'
gen power = 1 - (t(`df', `t_crit' - ncp) - t(`df', -`t_crit' - ncp))
twoway (line power beta_true, lwidth(medthick) lcolor(blue)) ///
(function y = 0.05, range(-60 60) lpattern(dash) lcolor(gs8)) ///
(function y = 0.80, range(-60 60) lpattern(dash) lcolor(green)), ///
xtitle("True Coefficient Value ({β)}") ///
ytitle("Power = P(Reject H{sub:0})") ///
title("Power Curve: Larger Effects Are Easier to Detect") ///
legend(order(1 "Power" 2 "{α} = 0.05" 3 "Target = 0.80"))
* =============================================================================
* STEP 7: Bootstrap confidence intervals — no distributional assumptions
* =============================================================================
* Resample data with replacement to estimate the sampling distribution of beta
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA", clear
// bootstrap prefix resamples data and re-estimates the model each time
// reps(1000) = 1000 bootstrap replications; seed(42) for reproducibility
bootstrap _b[size], reps(1000) seed(42): regress price size
// estat bootstrap displays percentile-method CIs alongside normal CIs
estat bootstrap, percentile
// Compare with analytical 95% CI from OLS
regress price size
display "OLS slope: " _b[size]
display "Analytical 95% CI: [" _b[size] - invttail(e(df_r), 0.025)*_se[size] ///
", " _b[size] + invttail(e(df_r), 0.025)*_se[size] "]"
Paste into your Stata do-file editor
# =============================================================================
# CHAPTER 12 CHEAT SHEET: Further Topics in Multiple Regression
# =============================================================================
# --- Libraries ---
library(haven) # read Stata .dta files
library(fixest) # fast OLS estimation with feols()
library(dplyr) # data manipulation
library(ggplot2) # grammar of graphics
# =============================================================================
# STEP 1: Load house price data from a URL
# =============================================================================
url_house <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA"
data_house <- read_dta(url_house)
cat("Dataset:", nrow(data_house), "observations,", ncol(data_house), "variables\n")
# =============================================================================
# STEP 2: OLS regression with default standard errors
# =============================================================================
model_default <- feols(price ~ size + bedrooms + bathrooms + lotsize + age + monthsold,
data = data_house)
summary(model_default)
# =============================================================================
# STEP 3: Robust (HC1) standard errors — compare with default
# =============================================================================
model_robust <- feols(price ~ size + bedrooms + bathrooms + lotsize + age + monthsold,
data = data_house, vcov = "HC1")
etable(model_default, model_robust,
headers = c("Default", "Robust (HC1)"))
# =============================================================================
# STEP 4: Prediction — conditional mean CI vs. individual forecast PI
# =============================================================================
model_simple <- feols(price ~ size, data = data_house)
# Use base R's lm() for predict() with interval support
model_lm <- lm(price ~ size, data = data_house)
new_house <- data.frame(size = 2000)
ci_pred <- predict(model_lm, new_house, interval = "confidence", level = 0.95)
pi_pred <- predict(model_lm, new_house, interval = "prediction", level = 0.95)
cat("\nPredicted price (2000 sq ft): $", round(ci_pred[1], 0), "\n")
cat("95% CI for E[Y|X=2000]: [$", round(ci_pred[2], 0), ", $", round(ci_pred[3], 0), "]\n")
cat("95% PI for individual Y: [$", round(pi_pred[2], 0), ", $", round(pi_pred[3], 0), "]\n")
cat("RMSE:", round(sigma(model_lm), 0), "\n")
# =============================================================================
# STEP 5: HAC (Newey-West) standard errors for time series
# =============================================================================
url_gdp <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_REALGDPPC.DTA"
data_gdp <- read_dta(url_gdp)
# fixest supports Newey-West via vcov = NW(L) ~ time_var
model_hac <- feols(growth ~ 1, data = data_gdp, vcov = NW(4) ~ daten)
model_ols <- feols(growth ~ 1, data = data_gdp)
cat("\nGDP Growth: mean =", round(mean(data_gdp$growth), 4), "\n")
cat("Default SE:", round(se(model_ols)["(Intercept)"], 6), "\n")
cat("HAC SE: ", round(se(model_hac)["(Intercept)"], 6), "\n")
# Correlogram — visualize autocorrelation structure
acf(data_gdp$growth, lag.max = 10, main = "Correlogram of GDP Growth")
# =============================================================================
# STEP 6: Power curve — Type I vs. Type II error tradeoff
# =============================================================================
alpha <- 0.05
se_test <- 15
n_test <- 30
t_crit <- qt(1 - alpha/2, df = n_test - 1)
beta_range <- seq(-60, 60, length.out = 500)
ncp <- beta_range / se_test
power <- 1 - (pt(t_crit - ncp, df = n_test - 1) -
pt(-t_crit - ncp, df = n_test - 1))
ggplot(data.frame(beta = beta_range, power = power), aes(x = beta, y = power)) +
geom_line(linewidth = 1.2, color = "blue") +
geom_hline(yintercept = 0.05, linetype = "dashed", color = "gray") +
geom_hline(yintercept = 0.80, linetype = "dashed", color = "green") +
labs(x = "True Coefficient Value", y = "Power = P(Reject H0)",
title = "Power Curve: Larger Effects Are Easier to Detect") +
theme_minimal()
# =============================================================================
# STEP 7: Bootstrap confidence intervals — no distributional assumptions
# =============================================================================
set.seed(42)
n_boot <- 1000
boot_slopes <- replicate(n_boot, {
idx <- sample(nrow(data_house), replace = TRUE)
coef(feols(price ~ size, data = data_house[idx, ]))["size"]
})
# Percentile method: 95% CI = [2.5th, 97.5th percentile]
ci_boot <- quantile(boot_slopes, probs = c(0.025, 0.975))
cat("\nOLS slope (size):", round(coef(model_simple)["size"], 4), "\n")
cat("Bootstrap 95% CI: [", round(ci_boot[1], 4), ",", round(ci_boot[2], 4), "]\n")
cat("Analytical 95% CI: [", round(confint(model_simple)["size", 1], 4), ",",
round(confint(model_simple)["size", 2], 4), "]\n")
hist(boot_slopes, breaks = 40, col = "steelblue", border = "white",
main = "Bootstrap Distribution of Size Coefficient",
xlab = "Bootstrap Slope Estimates")
abline(v = coef(model_simple)["size"], col = "red", lwd = 2)
abline(v = ci_boot, col = "green", lwd = 2, lty = 2)
Paste into your R console or RStudio