Code Summary
You've explored the key concepts interactively — now reproduce them in code. These self-contained blocks cover everything you practiced above. Pick your language, copy the code, and run it.
# =============================================================================
# CHAPTER 11 CHEAT SHEET: Statistical Inference for Multiple Regression
# =============================================================================
# --- Libraries ---
import numpy as np # numerical operations
import pandas as pd # data loading and manipulation
import matplotlib.pyplot as plt # creating plots and visualizations
import pyfixest as pf # OLS regression with R-style formulas
# !pip install pyfixest # uncomment if running in Google Colab
from scipy import stats # t and F distributions for inference
# =============================================================================
# STEP 1: Load data directly from a URL
# =============================================================================
# pd.read_stata() reads Stata .dta files; the dataset has 29 houses from Davis, CA
url = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA"
data_house = pd.read_stata(url)
print(f"Dataset: {data_house.shape[0]} observations, {data_house.shape[1]} variables")
print(data_house[['price', 'size', 'bedrooms', 'bathrooms', 'lotsize', 'age']].describe().round(2))
# =============================================================================
# STEP 2: Estimate the full multiple regression model
# =============================================================================
# Formula syntax: 'y ~ x1 + x2 + ...' includes an intercept automatically
# pf.feols() estimates and returns the fitted model in one call
fit_full = pf.feols('price ~ size + bedrooms + bathrooms + lotsize + age + monthsold',
data=data_house)
n = int(fit_full._N)
k = len(fit_full.coef())
df = n - k # degrees of freedom for t and F tests
print(f"\nSize effect: ${fit_full.coef()['size']:,.2f} per sq ft (p = {fit_full.pvalue()['size']:.4f})")
print(f"R-squared: {fit_full._r2:.4f} ({fit_full._r2*100:.1f}% of variation explained)")
print(f"Degrees of freedom: n-k = {n}-{k} = {df}")
# Full regression table (coefficients, std errors, t-stats, p-values, R\u00b2)
fit_full.summary()
# =============================================================================
# STEP 3: Confidence intervals — does the CI exclude zero?
# =============================================================================
# 95% CI: b_j \u00b1 t_critical \u00d7 se(b_j)
# If the interval excludes zero, the coefficient is statistically significant at 5%
conf_int = fit_full.confint()
print("\n95% Confidence Intervals:")
print(conf_int.round(2))
# Manual calculation for the size coefficient
coef_size = fit_full.coef()['size']
se_size = fit_full.se()['size']
t_crit = stats.t.ppf(0.975, df) # 0.975 = upper tail for two-sided 95% CI
ci_lower = coef_size - t_crit * se_size
ci_upper = coef_size + t_crit * se_size
print(f"\nSize 95% CI: [${ci_lower:.2f}, ${ci_upper:.2f}] (excludes zero \u2192 significant)")
# =============================================================================
# STEP 4: Hypothesis test on a single coefficient
# =============================================================================
# Test H\u2080: \u03b2_size = 50 vs H\u2081: \u03b2_size \u2260 50
# t = (b_j - hypothesized value) / se(b_j)
null_value = 50
t_stat = (coef_size - null_value) / se_size
p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df))
print(f"\nTest H\u2080: \u03b2_size = {null_value}")
print(f" t-statistic: {t_stat:.4f}")
print(f" p-value: {p_value:.4f}")
print(f" Decision: {'Reject' if p_value < 0.05 else 'Fail to reject'} H\u2080 at 5% level")
# Test of statistical significance: H\u2080: \u03b2_size = 0
t_stat_zero = coef_size / se_size
print(f"\nTest H\u2080: \u03b2_size = 0")
print(f" t-statistic: {t_stat_zero:.4f}, p-value: {fit_full.pvalue()['size']:.6f}")
print(f" Size IS statistically significant at 5% level")
# =============================================================================
# STEP 5: Joint F-test — are groups of coefficients jointly significant?
# =============================================================================
# Overall F-test: H\u2080: all slope coefficients = 0
overall_F = fit_full.wald_test()
print(f"\nOverall F-test: F = {overall_F.stat.values[0]:.4f}, p = {overall_F.pval.values[0]:.6e}")
print(f" At least one variable matters \u2192 model is jointly significant")
# Subset F-test: are variables other than size jointly significant?
# H\u2080: \u03b2_bedrooms = \u03b2_bathrooms = \u03b2_lotsize = \u03b2_age = \u03b2_monthsold = 0
# Use R matrix: each row zeroes out one coefficient
vars_to_test = ['bedrooms', 'bathrooms', 'lotsize', 'age', 'monthsold']
all_vars = list(fit_full.coef().index)
R = np.zeros((len(vars_to_test), len(all_vars)))
for i, v in enumerate(vars_to_test):
R[i, all_vars.index(v)] = 1
subset_F = fit_full.wald_test(R=R)
print(f"\nSubset F-test (drop 5 vars, keep size):")
print(f" F = {subset_F.stat.values[0]:.4f}, p = {subset_F.pval.values[0]:.4f}")
print(f" Extra variables are NOT jointly significant \u2192 simpler model preferred")
# =============================================================================
# STEP 6: Model comparison — restricted vs unrestricted
# =============================================================================
# Restricted model: only size as predictor
fit_restricted = pf.feols('price ~ size', data=data_house)
# Manual F-test: restricted vs unrestricted
rss_r = np.sum(fit_restricted._u_hat**2)
rss_u = np.sum(fit_full._u_hat**2)
q = len(fit_full.coef()) - len(fit_restricted.coef()) # number of restrictions
F_manual = ((rss_r - rss_u) / q) / (rss_u / (n - len(fit_full.coef())))
p_manual = 1 - stats.f.cdf(F_manual, q, n - len(fit_full.coef()))
print(f"\nF-test: Restricted (size only) vs Full model:")
print(f" F = {F_manual:.4f}, p = {p_manual:.4f}")
# Compare fit statistics
print(f"\n{'Model':<25} {'R\u00b2':>8} {'Adj. R\u00b2':>9}")
print("-" * 44)
for name, m in [('Size only', fit_restricted), ('Full model', fit_full)]:
print(f"{name:<25} {m._r2:>8.4f} {m._adj_r2:>9.4f}")
# =============================================================================
# STEP 7: Robust standard errors — valid under heteroskedasticity
# =============================================================================
# HC1 (White's correction) provides valid inference without constant variance
fit_robust = pf.feols('price ~ size + bedrooms + bathrooms + lotsize + age + monthsold',
data=data_house, vcov='HC1')
print("\nStandard vs Robust SEs:")
print(f"{'Variable':<14} {'Coef':>10} {'Std SE':>10} {'Robust SE':>10} {'p (robust)':>10}")
print("-" * 56)
for var in fit_full.coef().index:
print(f"{var:<14} {fit_full.coef()[var]:>10.2f} "
f"{fit_full.se()[var]:>10.2f} {fit_robust.se()[var]:>10.2f} "
f"{fit_robust.pvalue()[var]:>10.4f}")
# =============================================================================
# STEP 8: Coefficient plot — visual summary of significance
# =============================================================================
# Error bars crossing zero = not significant at 5%
params_plot = fit_full.coef()[1:] # exclude intercept
ci_plot = conf_int.iloc[1:, :]
fig, ax = plt.subplots(figsize=(10, 6))
y_pos = np.arange(len(params_plot))
ax.errorbar(params_plot.values, y_pos,
xerr=[params_plot.values - ci_plot.iloc[:, 0].values,
ci_plot.iloc[:, 1].values - params_plot.values],
fmt='o', markersize=8, capsize=5, capthick=2, linewidth=2)
ax.axvline(x=0, color='red', linestyle='--', linewidth=1.5, alpha=0.7, label='H\u2080: \u03b2 = 0')
ax.set_yticks(y_pos)
ax.set_yticklabels(params_plot.index)
ax.set_xlabel('Coefficient Value')
ax.set_title('Coefficient Estimates with 95% Confidence Intervals')
ax.legend()
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()
* =============================================================================
* CHAPTER 11 CHEAT SHEET: Statistical Inference for Multiple Regression
* =============================================================================
* --- Setup ---
clear all // start with a clean workspace
set more off // do not pause output for long results
* =============================================================================
* STEP 1: Load data directly from a URL
* =============================================================================
* use loads a Stata .dta file; "clear" drops any data already in memory
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA", clear
describe // list all variables, types, and labels
display "Observations: " _N // _N is Stata's built-in observation count
summarize price size bedrooms bathrooms lotsize age
* =============================================================================
* STEP 2: Estimate the full multiple regression model
* =============================================================================
* regress fits OLS: first variable is y, the rest are x's
* Stata includes an intercept automatically (use ", noconstant" to suppress)
regress price size bedrooms bathrooms lotsize age monthsold
// After regress, Stata stores results you can reference:
display "Size effect: " _b[size] // coefficient on size
display "R-squared: " e(r2) // R-squared
display "Obs: " e(N) // number of observations
display "df_residual: " e(df_r) // degrees of freedom (n - k)
* =============================================================================
* STEP 3: Confidence intervals — does the CI exclude zero?
* =============================================================================
* By default, regress output shows 95% CIs in the last two columns
* Use ", level(XX)" to change the confidence level (e.g., 90% or 99%)
regress price size bedrooms bathrooms lotsize age monthsold, level(95)
// Manual calculation for the size coefficient
display "Size 95% CI lower: " _b[size] - invttail(e(df_r), 0.025) * _se[size]
display "Size 95% CI upper: " _b[size] + invttail(e(df_r), 0.025) * _se[size]
// invttail(df, p) returns the critical value; 0.025 for two-sided 95% CI
// Change to 99% CI to see the interval widen
regress price size bedrooms bathrooms lotsize age monthsold, level(99)
* =============================================================================
* STEP 4: Hypothesis test on a single coefficient
* =============================================================================
* After regress, Stata's output already includes t-stats and p-values for
* the standard test H0: beta = 0.
regress price size bedrooms bathrooms lotsize age monthsold
// Test a non-zero null: H0: beta_size = 50 vs H1: beta_size != 50
// test performs a Wald test on the most recent estimation
test size = 50
// Stata reports F-statistic and p-value; for a single restriction, F = t^2
// lincom computes the estimate, SE, t-stat, and CI for a linear combination
lincom size - 50 // same test via linear combination
* =============================================================================
* STEP 5: Joint F-test — are groups of coefficients jointly significant?
* =============================================================================
* Overall F-test is shown automatically in regress output (top-right corner)
* It tests H0: all slope coefficients = 0
// Subset F-test: are the 5 extra variables jointly significant?
// H0: beta_bedrooms = beta_bathrooms = beta_lotsize = beta_age = beta_monthsold = 0
test bedrooms bathrooms lotsize age monthsold
// Stata reports F(5, 22) and p-value
// If p > 0.05 -> fail to reject -> the extras are not jointly significant
* =============================================================================
* STEP 6: Model comparison — restricted vs unrestricted
* =============================================================================
* Estimate the restricted model: only size as predictor
regress price size
estimates store model_restricted // save results under a name
* Estimate the full (unrestricted) model
regress price size bedrooms bathrooms lotsize age monthsold
estimates store model_full // save results under a name
// Compare models side by side with esttab (requires estout package)
// ssc install estout // run once to install
esttab model_restricted model_full, ///
se r2 ar2 stats(N r2 r2_a F, labels("N" "R²" "Adj. R²" "F")) ///
title("Model Comparison: Restricted vs Full")
// Formal F-test: does the full model fit significantly better?
// lrtest compares two nested models via likelihood ratio
lrtest model_restricted model_full
* =============================================================================
* STEP 7: Robust standard errors — valid under heteroskedasticity
* =============================================================================
* Add ", robust" (or ", vce(robust)") to get HC1 robust standard errors
* Coefficients stay the same; only SEs, t-stats, and p-values change
regress price size bedrooms bathrooms lotsize age monthsold
estimates store model_standard
regress price size bedrooms bathrooms lotsize age monthsold, vce(robust)
estimates store model_robust
// Compare standard vs robust SEs side by side
esttab model_standard model_robust, ///
se title("Standard vs Robust (HC1) Standard Errors") ///
mtitles("Standard" "Robust")
* =============================================================================
* STEP 8: Coefficient plot — visual summary of significance
* =============================================================================
* coefplot draws coefficient estimates with confidence intervals
* Error bars crossing zero = not significant at 5%
regress price size bedrooms bathrooms lotsize age monthsold
coefplot, drop(_cons) xline(0, lcolor(red) lpattern(dash)) ///
title("Coefficient Estimates with 95% CIs") ///
xtitle("Coefficient Value") ///
levels(95)
Paste into your Stata do-file editor
# =============================================================================
# CHAPTER 11 CHEAT SHEET: Statistical Inference for Multiple Regression
# =============================================================================
# --- Libraries ---
library(haven) # read Stata .dta files
library(fixest) # fast OLS estimation with feols()
library(dplyr) # data manipulation
library(ggplot2) # grammar of graphics
# =============================================================================
# STEP 1: Load data directly from a URL
# =============================================================================
url <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA"
data_house <- read_dta(url)
cat("Dataset:", nrow(data_house), "observations,", ncol(data_house), "variables\n")
# =============================================================================
# STEP 2: Estimate the full multiple regression model
# =============================================================================
model_full <- feols(price ~ size + bedrooms + bathrooms + lotsize + age + monthsold,
data = data_house)
summary(model_full)
cat("Size effect: $", round(coef(model_full)["size"], 2), "per sq ft\n")
cat("R-squared:", round(r2(model_full), 4), "\n")
# =============================================================================
# STEP 3: Confidence intervals — does the CI exclude zero?
# =============================================================================
# confint() computes 95% CIs; if the interval excludes zero, significant at 5%
confint(model_full, level = 0.95)
# =============================================================================
# STEP 4: Hypothesis test on a single coefficient
# =============================================================================
# Test H0: beta_size = 50 vs H1: beta_size != 50
# wald() from fixest tests linear hypotheses
wald(model_full, "size = 50")
# =============================================================================
# STEP 5: Joint F-test — are groups of coefficients jointly significant?
# =============================================================================
# Overall F-test is shown in summary output
# Subset F-test: are the 5 extra variables jointly significant?
wald(model_full, c("bedrooms", "bathrooms", "lotsize", "age", "monthsold"))
# =============================================================================
# STEP 6: Model comparison — restricted vs unrestricted
# =============================================================================
model_restricted <- feols(price ~ size, data = data_house)
etable(model_restricted, model_full,
headers = c("Size only", "Full model"),
fitstat = c("r2", "ar2", "n"))
# =============================================================================
# STEP 7: Robust standard errors — valid under heteroskedasticity
# =============================================================================
# vcov = "HC1" provides HC1 (White's correction) robust SEs
model_robust <- feols(price ~ size + bedrooms + bathrooms + lotsize + age + monthsold,
data = data_house, vcov = "HC1")
etable(model_full, model_robust,
headers = c("Standard", "Robust (HC1)"))
# =============================================================================
# STEP 8: Coefficient plot — visual summary of significance
# =============================================================================
# coefplot() from fixest draws coefficient estimates with CIs
# Error bars crossing zero = not significant at 5%
coefplot(model_full, drop = "(Intercept)")
Paste into your R console or RStudio