Code Summary
You've explored the key concepts interactively — now reproduce them in code. These self-contained blocks cover everything you practiced above. Pick your language, copy the code, and run it.
# =============================================================================
# CHAPTER 13 CHEAT SHEET: Case Studies for Multiple Regression
# =============================================================================
# --- Libraries ---
import numpy as np # numerical operations and log transforms
import pandas as pd # data loading and manipulation
import matplotlib.pyplot as plt # creating plots and visualizations
import pyfixest as pf # fast OLS, IV, and robust inference
# !pip install pyfixest # uncomment if running in Google Colab
# --- Data source ---
URL = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/"
# =============================================================================
# STEP 1: Cobb-Douglas Production Function — log-linearize and estimate
# =============================================================================
# Taking logs converts Q = A*K^alpha*L^beta into a linear model estimable by OLS
data_cobb = pd.read_stata(URL + "AED_COBBDOUGLAS.DTA")
data_cobb['lnq'] = np.log(data_cobb['q'])
data_cobb['lnk'] = np.log(data_cobb['k'])
data_cobb['lnl'] = np.log(data_cobb['l'])
# OLS with Newey-West (HAC) standard errors (time series data)
data_cobb['_time'] = range(len(data_cobb)) # pyfixest NW needs a time index
fit_cobb = pf.feols('lnq ~ lnk + lnl', data=data_cobb,
vcov='NW', vcov_kwargs={'time_id': '_time', 'lag': 3})
alpha = fit_cobb.coef()['lnk'] # capital elasticity
beta = fit_cobb.coef()['lnl'] # labor elasticity
print(f"Capital elasticity α = {alpha:.3f}, Labor elasticity β = {beta:.3f}")
print(f"Sum α + β = {alpha + beta:.3f} (≈1 → constant returns to scale)")
print(f"R² = {fit_cobb._r2:.4f}")
fit_cobb.summary()
# F-test for constant returns to scale: H0: α + β = 1
data_cobb['lnq_l'] = data_cobb['lnq'] - data_cobb['lnl']
data_cobb['lnk_l'] = data_cobb['lnk'] - data_cobb['lnl']
fit_r = pf.feols('lnq_l ~ lnk_l', data=data_cobb)
from scipy import stats
ssr_r = np.sum(fit_r._u_hat**2)
ssr_u = np.sum(fit_cobb._u_hat**2)
df_resid = len(data_cobb) - 3
f_stat = ((ssr_r - ssr_u) / 1) / (ssr_u / df_resid)
p_val = 1 - stats.f.cdf(f_stat, 1, df_resid)
print(f"CRS test: F = {f_stat:.2f}, p = {p_val:.3f} → {'Fail to reject' if p_val > 0.05 else 'Reject'} CRS")
# =============================================================================
# STEP 2: Phillips Curve — omitted variables bias reverses the sign
# =============================================================================
# Pre-1970 the trade-off works; post-1970 it breaks because expected inflation
# is omitted — a textbook demonstration of OVB
data_phil = pd.read_stata(URL + "AED_PHILLIPS.DTA")
# Pre-1970: classic negative relationship
pre = data_phil[data_phil['year'] < 1970].copy()
pre['_time'] = range(len(pre))
fit_pre = pf.feols('inflgdp ~ urate', data=pre,
vcov='NW', vcov_kwargs={'time_id': '_time', 'lag': 3})
print(f"\nPre-1970 slope: {fit_pre.coef()['urate']:.3f} (negative → classic Phillips curve)")
# Post-1970: sign flips due to omitted expected inflation
post = data_phil[data_phil['year'] >= 1970].copy()
post['_time'] = range(len(post))
fit_post = pf.feols('inflgdp ~ urate', data=post,
vcov='NW', vcov_kwargs={'time_id': '_time', 'lag': 5})
print(f"Post-1970 slope: {fit_post.coef()['urate']:.3f} (positive → breakdown!)")
# Augmented model: adding expected inflation restores the negative sign
post_exp = post.dropna(subset=['inflgdp1yr']).copy()
post_exp['_time'] = range(len(post_exp))
fit_aug = pf.feols('inflgdp ~ urate + inflgdp1yr', data=post_exp,
vcov='NW', vcov_kwargs={'time_id': '_time', 'lag': 5})
print(f"Augmented slope on urate: {fit_aug.coef()['urate']:.3f} (negative again!)")
print(f"Expected inflation coef: {fit_aug.coef()['inflgdp1yr']:.3f}")
# OVB formula: E[b2] = β2 + β3*γ
fit_aux = pf.feols('inflgdp1yr ~ urate', data=post_exp)
predicted = fit_aug.coef()['urate'] + fit_aug.coef()['inflgdp1yr'] * fit_aux.coef()['urate']
print(f"OVB predicted bivariate slope: {predicted:.3f} (actual: {fit_post.coef()['urate']:.3f})")
# =============================================================================
# STEP 3: RAND Health Insurance Experiment — RCT as the gold standard
# =============================================================================
# Random assignment to insurance plans eliminates selection bias
data_rand = pd.read_stata(URL + "AED_HEALTHINSEXP.DTA")
data_rand_y1 = data_rand[data_rand['year'] == 1]
# Mean spending by plan
print(f"\n{'Plan':<12} {'Mean $':>10} {'N':>8}")
for plan, grp in data_rand_y1.groupby('plan')['spending']:
print(f"{plan:<12} {grp.mean():>10,.0f} {len(grp):>8,}")
# Regression with cluster-robust SEs by family
fit_rct = pf.feols('spending ~ coins25 + coins50 + coins95 + coinsmixed + coinsindiv',
data=data_rand_y1, vcov={'CRV1': 'idfamily'})
fit_rct.summary()
# Joint F-test: do insurance plans matter?
# Use the model's built-in Wald test
from scipy.stats import f as f_dist
coefs_test = ['coins25', 'coins50', 'coins95', 'coinsmixed', 'coinsindiv']
print(f"Joint F-test for plan effects — check Wald test in summary above")
# =============================================================================
# STEP 4: Difference-in-Differences — health clinic access in South Africa
# =============================================================================
# DiD removes both time-invariant confounders and common time trends
data_did = pd.read_stata(URL + "AED_HEALTHACCESS.DTA")
# Manual DiD calculation
pre_c = data_did[(data_did['hightreat']==0) & (data_did['post']==0)]['waz'].mean()
post_c = data_did[(data_did['hightreat']==0) & (data_did['post']==1)]['waz'].mean()
pre_t = data_did[(data_did['hightreat']==1) & (data_did['post']==0)]['waz'].mean()
post_t = data_did[(data_did['hightreat']==1) & (data_did['post']==1)]['waz'].mean()
did = (post_t - pre_t) - (post_c - pre_c)
print(f"\nDiD estimate (manual): {did:.3f} SD improvement in child nutrition")
# DiD regression with cluster-robust SEs by community
fit_did = pf.feols('waz ~ hightreat + post + postXhigh', data=data_did,
vcov={'CRV1': 'idcommunity'})
print(f"DiD coefficient (regression): {fit_did.coef()['postXhigh']:.3f}")
print(f"p-value: {fit_did.pvalue()['postXhigh']:.4f}")
# =============================================================================
# STEP 5: Regression Discontinuity — incumbency advantage in U.S. Senate
# =============================================================================
# Candidates who barely win vs barely lose are quasi-randomly assigned to
# incumbent status — the jump at margin = 0 is the causal effect
data_rd = pd.read_stata(URL + "AED_INCUMBENCY.DTA")
data_rd = data_rd[data_rd['vote'].notna()].copy()
fit_rd = pf.feols('vote ~ win + margin', data=data_rd, vcov='HC1')
print(f"\nIncumbency advantage: {fit_rd.coef()['win']:.1f} percentage points")
print(f"95% CI: [{fit_rd.confint().loc['win', '2.5%']:.1f}, {fit_rd.confint().loc['win', '97.5%']:.1f}]")
print(f"p-value: {fit_rd.pvalue()['win']:.4f}")
# =============================================================================
# STEP 6: Instrumental Variables — do institutions cause growth?
# =============================================================================
# OLS is biased by reverse causation; settler mortality instruments for
# modern institutions (relevant + exogenous to modern GDP)
data_iv = pd.read_stata(URL + "AED_INSTITUTIONS.DTA")
# OLS (biased)
fit_ols = pf.feols('logpgp95 ~ avexpr', data=data_iv, vcov='HC1')
# IV / 2SLS — pyfixest handles both stages automatically
# Syntax: 'depvar ~ exogenous | endogenous ~ instrument'
fit_iv = pf.feols('logpgp95 ~ 1 | avexpr ~ logem4', data=data_iv, vcov='HC1')
# First stage: institutions ~ settler mortality
fit_1st = pf.feols('avexpr ~ logem4', data=data_iv, vcov='HC1')
print(f"\nFirst-stage F = {fit_1st.coef()['logem4'] / fit_1st.se()['logem4']:.1f}² → check instrument relevance")
print(f"OLS coefficient: {fit_ols.coef()['avexpr']:.3f} (biased)")
print(f"IV/2SLS coefficient: {fit_iv.coef()['avexpr']:.3f} (causal)")
print(f"Causal effect: 1-unit improvement in institutions → "
f"{np.exp(fit_iv.coef()['avexpr']):.1f}x increase in GDP")
fit_iv.summary()
* =============================================================================
* CHAPTER 13 CHEAT SHEET: Case Studies for Multiple Regression
* =============================================================================
* --- Setup ---
clear all // start with a clean workspace
set more off // do not pause output for long results
* --- Data source ---
global URL "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/"
* =============================================================================
* STEP 1: Cobb-Douglas Production Function — log-linearize and estimate
* =============================================================================
* Taking logs converts Q = A*K^alpha*L^beta into a linear model estimable by OLS
use "${URL}AED_COBBDOUGLAS.DTA", clear
* Generate log-transformed variables
gen lnq = ln(q) // log of output
gen lnk = ln(k) // log of capital
gen lnl = ln(l) // log of labor
* OLS with Newey-West (HAC) standard errors — 3 lags for time series
newey lnq lnk lnl, lag(3)
* Display elasticities — coefficients ARE the elasticities in the log model
display "Capital elasticity α = " _b[lnk]
display "Labor elasticity β = " _b[lnl]
display "Sum α + β = " _b[lnk] + _b[lnl]
* F-test for constant returns to scale: H0: α + β = 1
test lnk + lnl = 1
* A large p-value means we cannot reject CRS (doubling inputs doubles output)
* =============================================================================
* STEP 2: Phillips Curve — omitted variables bias reverses the sign
* =============================================================================
* Pre-1970 the trade-off works; post-1970 it breaks because expected inflation
* is omitted — a textbook demonstration of OVB
use "${URL}AED_PHILLIPS.DTA", clear
* Pre-1970: classic negative relationship
newey inflgdp urate if year < 1970, lag(3)
display "Pre-1970 slope: " _b[urate] " (negative → classic Phillips curve)"
* Post-1970: sign flips due to omitted expected inflation
newey inflgdp urate if year >= 1970, lag(5)
scalar slope_post = _b[urate]
display "Post-1970 slope: " slope_post " (positive → breakdown!)"
* Augmented model: adding expected inflation restores the negative sign
newey inflgdp urate inflgdp1yr if year >= 1970, lag(5)
display "Augmented slope on urate: " _b[urate] " (negative again!)"
display "Expected inflation coef: " _b[inflgdp1yr]
* OVB formula: E[b2] = β2 + β3*γ
* First get γ from the auxiliary regression of omitted var on included var
scalar b2 = _b[urate] // true coefficient on urate
scalar b3 = _b[inflgdp1yr] // coefficient on omitted variable
regress inflgdp1yr urate if year >= 1970 & inflgdp1yr != .
scalar gamma_hat = _b[urate] // correlation between omitted and included
display "OVB predicted slope: " b2 + b3 * gamma_hat " (actual: " slope_post ")"
* =============================================================================
* STEP 3: RAND Health Insurance Experiment — RCT as the gold standard
* =============================================================================
* Random assignment to insurance plans eliminates selection bias
use "${URL}AED_HEALTHINSEXP.DTA", clear
keep if year == 1 // keep only year 1 observations
* Mean spending by plan
tabulate plan, summarize(spending) means
* Regression with cluster-robust SEs by family
regress spending coins25 coins50 coins95 coinsmixed coinsindiv, vce(cluster idfamily)
* Joint F-test: do insurance plans matter?
test coins25 coins50 coins95 coinsmixed coinsindiv
* Rejects → insurance plan systematically shifts spending
* =============================================================================
* STEP 4: Difference-in-Differences — health clinic access in South Africa
* =============================================================================
* DiD removes both time-invariant confounders and common time trends
use "${URL}AED_HEALTHACCESS.DTA", clear
* Manual DiD calculation — compute group means
tabulate hightreat post, summarize(waz) means
* DiD = (treat_post - treat_pre) - (ctrl_post - ctrl_pre)
* DiD regression with cluster-robust SEs by community
regress waz hightreat post postXhigh, vce(cluster idcommunity)
display "DiD coefficient: " _b[postXhigh]
display "p-value: " 2 * ttail(e(df_r), abs(_b[postXhigh] / _se[postXhigh]))
* =============================================================================
* STEP 5: Regression Discontinuity — incumbency advantage in U.S. Senate
* =============================================================================
* Candidates who barely win vs barely lose are quasi-randomly assigned to
* incumbent status — the jump at margin = 0 is the causal effect
use "${URL}AED_INCUMBENCY.DTA", clear
drop if vote == . // drop missing outcomes
* RD regression with robust standard errors
regress vote win margin, vce(robust)
display "Incumbency advantage: " _b[win] " percentage points"
display "p-value: " 2 * ttail(e(df_r), abs(_b[win] / _se[win]))
* 95% confidence interval for the incumbency effect
lincom win // reports coef, SE, t, p, and CI
* =============================================================================
* STEP 6: Instrumental Variables — do institutions cause growth?
* =============================================================================
* OLS is biased by reverse causation; settler mortality instruments for
* modern institutions (relevant + exogenous to modern GDP)
use "${URL}AED_INSTITUTIONS.DTA", clear
* OLS (biased) — institutions predict GDP but direction of causation is unclear
regress logpgp95 avexpr, vce(robust)
display "OLS coefficient: " _b[avexpr] " (biased by reverse causation)"
* First stage: do settler mortality rates predict modern institutions?
regress avexpr logem4, vce(robust)
display "First-stage F = " e(F) " (F > 10 → strong instrument)"
* IV / 2SLS in one step — Stata handles both stages automatically
ivregress 2sls logpgp95 (avexpr = logem4), vce(robust)
display "IV/2SLS coefficient: " _b[avexpr] " (causal estimate)"
display "Causal effect: 1-unit improvement → " exp(_b[avexpr]) "x GDP increase"
* First-stage diagnostics
estat firststage // reports first-stage F-statistic
Paste into your Stata do-file editor
# =============================================================================
# CHAPTER 13 CHEAT SHEET: Case Studies for Multiple Regression
# =============================================================================
# --- Libraries ---
library(haven) # read Stata .dta files
library(fixest) # fast OLS, IV, and HAC estimation
library(dplyr) # data manipulation
# --- Data source ---
URL <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/"
# =============================================================================
# STEP 1: Cobb-Douglas Production Function — log-linearize and estimate
# =============================================================================
data_cd <- read_dta(paste0(URL, "AED_COBBDOUGLAS.DTA")) |>
mutate(lnq = log(q), lnk = log(k), lnl = log(l))
# Newey-West (HAC) standard errors — 3 lags for time series
model_cd <- feols(lnq ~ lnk + lnl, data = data_cd, vcov = NW(3))
summary(model_cd)
cat("Capital elasticity α =", round(coef(model_cd)["lnk"], 4), "\n")
cat("Labor elasticity β =", round(coef(model_cd)["lnl"], 4), "\n")
cat("Sum α + β =", round(sum(coef(model_cd)[c("lnk","lnl")]), 4), "\n")
# F-test for constant returns to scale: H0: α + β = 1
wald(model_cd, "lnk + lnl = 1")
# =============================================================================
# STEP 2: Phillips Curve — omitted variables bias reverses the sign
# =============================================================================
data_phil <- read_dta(paste0(URL, "AED_PHILLIPS.DTA"))
# Pre-1970: classic negative relationship
m_pre <- feols(inflgdp ~ urate, data = data_phil |> filter(year < 1970),
vcov = NW(3))
cat("\nPre-1970 slope:", round(coef(m_pre)["urate"], 4), "(negative → classic)\n")
# Post-1970: sign flips due to omitted expected inflation
m_post <- feols(inflgdp ~ urate, data = data_phil |> filter(year >= 1970),
vcov = NW(5))
cat("Post-1970 slope:", round(coef(m_post)["urate"], 4), "(positive → breakdown!)\n")
# Augmented model: adding expected inflation restores the negative sign
m_aug <- feols(inflgdp ~ urate + inflgdp1yr,
data = data_phil |> filter(year >= 1970), vcov = NW(5))
cat("Augmented slope:", round(coef(m_aug)["urate"], 4), "(negative again!)\n")
# =============================================================================
# STEP 3: RAND Health Insurance Experiment — RCT as the gold standard
# =============================================================================
data_rand <- read_dta(paste0(URL, "AED_HEALTHINSEXP.DTA")) |>
filter(year == 1)
model_rct <- feols(spending ~ coins25 + coins50 + coins95 + coinsmixed + coinsindiv,
data = data_rand, vcov = ~idfamily)
summary(model_rct)
# Joint F-test: do insurance plans matter?
wald(model_rct, c("coins25", "coins50", "coins95", "coinsmixed", "coinsindiv"))
# =============================================================================
# STEP 4: Difference-in-Differences — health clinic access
# =============================================================================
data_did <- read_dta(paste0(URL, "AED_HEALTHACCESS.DTA"))
model_did <- feols(waz ~ hightreat + post + postXhigh,
data = data_did, vcov = ~idcommunity)
cat("\nDiD coefficient:", round(coef(model_did)["postXhigh"], 3), "\n")
cat("p-value:", round(pvalue(model_did)["postXhigh"], 4), "\n")
# =============================================================================
# STEP 5: Regression Discontinuity — incumbency advantage
# =============================================================================
data_rd <- read_dta(paste0(URL, "AED_INCUMBENCY.DTA")) |>
filter(!is.na(vote))
model_rd <- feols(vote ~ win + margin, data = data_rd, vcov = "HC1")
cat("\nIncumbency advantage:", round(coef(model_rd)["win"], 1), "percentage points\n")
cat("p-value:", round(pvalue(model_rd)["win"], 4), "\n")
# =============================================================================
# STEP 6: Instrumental Variables — do institutions cause growth?
# =============================================================================
data_inst <- read_dta(paste0(URL, "AED_INSTITUTIONS.DTA"))
# OLS (biased by reverse causation)
model_ols <- feols(logpgp95 ~ avexpr, data = data_inst, vcov = "HC1")
cat("\nOLS coefficient:", round(coef(model_ols)["avexpr"], 4), "(biased)\n")
# First stage: settler mortality predicts modern institutions
model_1st <- feols(avexpr ~ logem4, data = data_inst, vcov = "HC1")
cat("First-stage F =", round(fitstat(model_1st, "f")$f$stat, 1),
"(F > 10 → strong instrument)\n")
# IV / 2SLS — fixest handles both stages automatically
# Syntax: y ~ exogenous | endogenous ~ instrument
model_iv <- feols(logpgp95 ~ 1 | avexpr ~ logem4, data = data_inst, vcov = "HC1")
summary(model_iv, stage = 1:2)
cat("IV/2SLS coefficient:", round(coef(model_iv)["fit_avexpr"], 4), "(causal)\n")
# Note: in some fixest versions the name may be "avexpr" instead of "fit_avexpr"
# Use coef(model_iv) to check coefficient names in your version
Paste into your R console or RStudio