Code Summary
You've explored the key concepts interactively — now reproduce them in code. These self-contained blocks cover everything you practiced above. Pick your language, copy the code, and run it.
# =============================================================================
# CHAPTER 7 CHEAT SHEET: Statistical Inference for Bivariate Regression
# =============================================================================
# --- Libraries ---
import pandas as pd # data loading and manipulation
import matplotlib.pyplot as plt # creating plots and visualizations
import pyfixest as pf # fast OLS estimation with feols()
# !pip install pyfixest # uncomment if running in Google Colab
from scipy import stats # t-distribution and critical values
# =============================================================================
# STEP 1: Load data directly from a URL
# =============================================================================
# pd.read_stata() reads Stata .dta files — the house price dataset has 29 houses
url = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA"
data_house = pd.read_stata(url)
print(f"Dataset: {data_house.shape[0]} observations, {data_house.shape[1]} variables")
# =============================================================================
# STEP 2: Estimate the regression and extract key statistics
# =============================================================================
# The t-statistic measures how many standard errors the estimate is from zero
fit = pf.feols('price ~ size', data=data_house)
slope = fit.coef()['size'] # marginal effect: $/sq ft
intercept = fit.coef()['Intercept']
se_slope = fit.se()['size'] # standard error of the slope
t_stat = fit.tstat()['size'] # t = b2 / se(b2)
p_value = fit.pvalue()['size'] # two-sided p-value for H0: b2 = 0
print(f"Estimated equation: price = {intercept:,.0f} + {slope:.2f} x size")
print(f"Standard error of slope: {se_slope:.2f}")
print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_value:.6f}")
# Full regression table (coefficients, std errors, t-stats, p-values, R2)
fit.summary()
# =============================================================================
# STEP 3: Confidence interval — a range of plausible values for b2
# =============================================================================
# CI = b2 +/- t_crit x se(b2), using T(n-2) distribution
n = len(data_house)
df = n - 2 # degrees of freedom
t_crit = stats.t.ppf(0.975, df) # critical value for 95% CI
ci_lower = slope - t_crit * se_slope
ci_upper = slope + t_crit * se_slope
print(f"Degrees of freedom: {df}")
print(f"Critical t-value (alpha=0.05, two-sided): {t_crit:.4f}")
print(f"95% CI for slope: [{ci_lower:.2f}, {ci_upper:.2f}]")
print(f"Interpretation: each sq ft adds between ${ci_lower:.0f} and ${ci_upper:.0f} to price")
# =============================================================================
# STEP 4: Hypothesis tests — does size matter? Is the effect $90/sq ft?
# =============================================================================
# Test 1: Statistical significance (H0: b2 = 0)
print(f"Test H0: b2 = 0 -> t = {t_stat:.2f}, p = {p_value:.6f} -> Reject H0")
# Test 2: Two-sided test for a specific value (H0: b2 = 90)
null_value = 90
t_90 = (slope - null_value) / se_slope
p_90 = 2 * (1 - stats.t.cdf(abs(t_90), df))
print(f"Test H0: b2 = 90 -> t = {t_90:.4f}, p = {p_90:.4f} -> Fail to reject H0")
print(f" (90 is inside the 95% CI [{ci_lower:.2f}, {ci_upper:.2f}])")
# =============================================================================
# STEP 5: One-sided test — does size increase price by less than $90/sq ft?
# =============================================================================
# H0: b2 >= 90 vs Ha: b2 < 90 (lower-tailed test)
p_lower = stats.t.cdf(t_90, df) # one-sided p-value (left tail)
print(f"One-sided test H0: b2 >= 90 vs Ha: b2 < 90")
print(f" t = {t_90:.4f}, one-sided p = {p_lower:.4f}")
print(f" Fail to reject at 5% (p = {p_lower:.3f} > 0.05)")
print(f" Would reject at 10% (p = {p_lower:.3f} < 0.10)")
# =============================================================================
# STEP 6: Robust standard errors — valid with or without heteroskedasticity
# =============================================================================
# HC1 robust SEs protect against non-constant variance in the errors
fit_robust = pf.feols('price ~ size', data=data_house, vcov='HC1')
print(f"{'':20s} {'Standard':>12s} {'Robust (HC1)':>12s}")
print("-" * 46)
print(f"{'SE(size)':<20s} {se_slope:>12.2f} {fit_robust.se()['size']:>12.2f}")
print(f"{'t-statistic':<20s} {t_stat:>12.2f} {fit_robust.tstat()['size']:>12.2f}")
print(f"{'p-value':<20s} {p_value:>12.6f} {fit_robust.pvalue()['size']:>12.6f}")
* =============================================================================
* CHAPTER 7 CHEAT SHEET: Statistical Inference for Bivariate Regression
* =============================================================================
* --- Setup ---
clear all // start with a clean workspace
set more off // do not pause output for long results
* =============================================================================
* STEP 1: Load data directly from a URL
* =============================================================================
* use loads a Stata .dta file; "clear" drops any data already in memory
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA", clear
describe // list all variables, types, and labels
display "Observations: " _N // _N is Stata's built-in observation count
* =============================================================================
* STEP 2: Estimate the regression and extract key statistics
* =============================================================================
* regress fits OLS; the t-statistic measures how many SEs the estimate is from 0
regress price size
// After regress, Stata stores results you can reference:
display "Slope (b2): " _b[size] // estimated coefficient
display "Intercept (b1): " _b[_cons]
display "SE of slope: " _se[size] // standard error of the slope
display "t-statistic: " _b[size] / _se[size]
* =============================================================================
* STEP 3: Confidence interval — a range of plausible values for b2
* =============================================================================
* The confidence interval is shown in the regress output (last two columns)
* CI = b2 +/- t_crit x se(b2), using T(n-2) distribution
local n = e(N) // number of observations
local df = `n' - 2 // degrees of freedom
display "Degrees of freedom: " `df'
// Critical t-value for 95% CI (two-sided, alpha/2 = 0.025)
display "Critical t-value (alpha=0.05): " invttail(`df', 0.025)
// Display confidence interval bounds
local slope = _b[size]
local se = _se[size]
local tcrit = invttail(`df', 0.025)
display "95% CI for slope: [" `slope' - `tcrit' * `se' ", " `slope' + `tcrit' * `se' "]"
* =============================================================================
* STEP 4: Hypothesis tests — does size matter? Is the effect $90/sq ft?
* =============================================================================
* Test 1: Statistical significance (H0: b2 = 0)
* This is the default test shown in the regress output
test size // F-test equivalent of t-test for H0: b2 = 0
* Test 2: Two-sided test for a specific value (H0: b2 = 90)
* lincom computes a linear combination and tests if it equals zero
* So lincom size - 90 tests H0: b2 - 90 = 0, i.e., H0: b2 = 90
lincom size - 90
// Alternative: use test with an explicit null value
test size = 90
* =============================================================================
* STEP 5: One-sided test — does size increase price by less than $90/sq ft?
* =============================================================================
* H0: b2 >= 90 vs Ha: b2 < 90 (lower-tailed test)
* Stata's lincom gives a two-sided p-value; divide by 2 for one-sided
// Compute t-statistic for H0: b2 = 90
local t_90 = (_b[size] - 90) / _se[size]
display "t-statistic for H0: b2 = 90: " `t_90'
// One-sided p-value (left tail): Pr(T < t_90)
display "One-sided p-value (Ha: b2 < 90): " ttail(`df', -`t_90')
* =============================================================================
* STEP 6: Robust standard errors — valid with or without heteroskedasticity
* =============================================================================
* vce(robust) computes HC1 robust SEs, protecting against heteroskedasticity
// Standard OLS (for comparison)
quietly regress price size
local se_std = _se[size]
local t_std = _b[size] / _se[size]
// Robust HC1 standard errors
regress price size, vce(robust)
display ""
display "Comparison of Standard vs. Robust (HC1) Standard Errors:"
display "SE(size): Standard = " `se_std' " Robust = " _se[size]
display "t-statistic: Standard = " `t_std' " Robust = " _b[size] / _se[size]
Paste into your Stata do-file editor
# =============================================================================
# CHAPTER 7 CHEAT SHEET: Statistical Inference for Bivariate Regression
# =============================================================================
# --- Libraries ---
library(haven) # read Stata .dta files
library(fixest) # fast OLS estimation with feols()
library(ggplot2) # grammar of graphics
# =============================================================================
# STEP 1: Load data directly from a URL
# =============================================================================
url <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA"
data_house <- read_dta(url)
cat("Dataset:", nrow(data_house), "observations,", ncol(data_house), "variables\n")
# =============================================================================
# STEP 2: Estimate the regression and extract key statistics
# =============================================================================
# feols() estimates OLS; the t-statistic measures how many SEs from zero
model <- feols(price ~ size, data = data_house)
summary(model)
slope <- coef(model)["size"]
se_slope <- se(model)["size"]
t_stat <- tstat(model)["size"]
p_value <- pvalue(model)["size"]
cat("Slope:", round(slope, 2), " SE:", round(se_slope, 2),
" t:", round(t_stat, 4), " p:", round(p_value, 6), "\n")
# =============================================================================
# STEP 3: Confidence interval — a range of plausible values for b2
# =============================================================================
# CI = b2 +/- t_crit x se(b2), using T(n-2) distribution
n <- nrow(data_house)
df <- n - 2
t_crit <- qt(0.975, df)
ci <- confint(model) # 95% CI for all coefficients
cat("95% CI for slope: [", round(ci["size", 1], 2), ",",
round(ci["size", 2], 2), "]\n")
# =============================================================================
# STEP 4: Hypothesis tests — does size matter? Is the effect $90/sq ft?
# =============================================================================
# Test 1: H0: b2 = 0 (shown in summary output)
cat("Test H0: b2 = 0 -> t =", round(t_stat, 2),
", p =", round(p_value, 6), "-> Reject H0\n")
# Test 2: H0: b2 = 90 (manual t-test)
null_value <- 90
t_90 <- (slope - null_value) / se_slope
p_90 <- 2 * pt(-abs(t_90), df = df)
cat("Test H0: b2 = 90 -> t =", round(t_90, 4),
", p =", round(p_90, 4), "-> Fail to reject H0\n")
# =============================================================================
# STEP 5: One-sided test — does size increase price by less than $90/sq ft?
# =============================================================================
# H0: b2 >= 90 vs Ha: b2 < 90 (lower-tailed test)
p_lower <- pt(t_90, df = df) # one-sided p-value (left tail)
cat("One-sided test H0: b2 >= 90 vs Ha: b2 < 90\n")
cat(" t =", round(t_90, 4), ", one-sided p =", round(p_lower, 4), "\n")
# =============================================================================
# STEP 6: Robust standard errors — valid with or without heteroskedasticity
# =============================================================================
# vcov = "HC1" computes HC1 robust SEs (same as Stata's , robust)
model_robust <- feols(price ~ size, data = data_house, vcov = "HC1")
cat("\nComparison of Standard vs. Robust (HC1) Standard Errors:\n")
cat("SE(size): Standard =", round(se_slope, 2),
" Robust =", round(se(model_robust)["size"], 2), "\n")
cat("t-statistic: Standard =", round(t_stat, 2),
" Robust =", round(tstat(model_robust)["size"], 2), "\n")
Paste into your R console or RStudio