Code Summary
You've explored the key concepts interactively — now reproduce them in code. These self-contained blocks cover everything you practiced above. Pick your language, copy the code, and run it.
# =============================================================================
# CHAPTER 8 CHEAT SHEET: Case Studies for Bivariate Regression
# =============================================================================
# --- Libraries ---
import pandas as pd # data loading and manipulation
import matplotlib.pyplot as plt # creating plots and visualizations
import pyfixest as pf # fast OLS estimation with feols()
# !pip install pyfixest # uncomment if running in Google Colab
# =============================================================================
# STEP 1: Load OECD health data from a URL
# =============================================================================
# pd.read_stata() reads Stata .dta files — this dataset covers 34 OECD countries
url_health = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HEALTH2009.DTA"
data_health = pd.read_stata(url_health)
print(f"Health dataset: {data_health.shape[0]} countries, {data_health.shape[1]} variables")
# =============================================================================
# STEP 2: Descriptive statistics — summarize before modeling
# =============================================================================
# .describe() gives mean, std, min, quartiles, max for each variable
print(data_health[['hlthpc', 'lifeexp', 'infmort', 'gdppc']].describe().round(2))
# =============================================================================
# STEP 3: Health outcomes regression with robust standard errors
# =============================================================================
# Does higher health spending improve life expectancy?
fit_life = pf.feols('lifeexp ~ hlthpc', data=data_health)
slope_life = fit_life.coef()['hlthpc']
r2_life = fit_life._r2
print(f"Life expectancy: slope = {slope_life:.5f}, R² = {r2_life:.4f}")
print(f"Each extra $1,000 in spending → {slope_life*1000:.2f} more years of life expectancy")
# Robust standard errors adjust for non-constant error variance (heteroskedasticity)
fit_life_robust = pf.feols('lifeexp ~ hlthpc', data=data_health, vcov='HC1')
fit_life_robust.summary()
# =============================================================================
# STEP 4: Health spending vs GDP — income elasticity
# =============================================================================
# How much of health spending is driven by national income?
fit_gdp = pf.feols('hlthpc ~ gdppc', data=data_health)
slope_gdp = fit_gdp.coef()['gdppc']
r2_gdp = fit_gdp._r2
# Income elasticity at the mean: (slope × mean_x) / mean_y
mean_gdp = data_health['gdppc'].mean()
mean_hlth = data_health['hlthpc'].mean()
elasticity = (slope_gdp * mean_gdp) / mean_hlth
print(f"Health spending on GDP: slope = {slope_gdp:.4f}, R² = {r2_gdp:.4f}")
print(f"Income elasticity at the mean: {elasticity:.2f} (≈1.0 → normal good)")
# =============================================================================
# STEP 5: Outlier robustness — excluding USA and Luxembourg
# =============================================================================
# Two countries drive much of the model's "misfit" — test robustness by excluding them
data_subset = data_health[(data_health['code'] != 'USA') &
(data_health['code'] != 'LUX')]
fit_subset = pf.feols('hlthpc ~ gdppc', data=data_subset)
print(f"\nAll 34 countries: slope = {slope_gdp:.4f}, R² = {r2_gdp:.4f}")
print(f"Excluding USA/LUX: slope = {fit_subset.coef()['gdppc']:.4f}, R² = {fit_subset._r2:.4f}")
print("Removing 2 of 34 countries transforms R² — always check for influential observations!")
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, df, ft, title in zip(
axes,
[data_health, data_subset],
[fit_gdp, fit_subset],
['All 34 Countries', 'Excluding USA & Luxembourg']):
ax.scatter(df['gdppc'], df['hlthpc'], s=50, alpha=0.7)
ax.plot(df['gdppc'], ft.predict(), color='red', linewidth=2)
ax.set_xlabel('GDP per capita ($)')
ax.set_ylabel('Health spending per capita ($)')
ax.set_title(f'{title} (R² = {ft._r2:.2f})')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# =============================================================================
# STEP 6: CAPM — estimating Coca-Cola's beta (systematic risk)
# =============================================================================
# Beta measures how a stock's excess return co-moves with the market excess return
url_capm = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_CAPM.DTA"
data_capm = pd.read_stata(url_capm)
fit_capm = pf.feols('rko_rf ~ rm_rf', data=data_capm)
alpha = fit_capm.coef()['Intercept'] # excess return beyond CAPM prediction
beta = fit_capm.coef()['rm_rf'] # systematic risk
r2_capm = fit_capm._r2
print(f"Coca-Cola CAPM: alpha = {alpha:.4f}, beta = {beta:.4f}, R² = {r2_capm:.4f}")
print(f"Beta < 1 → defensive stock (moves less than the market)")
print(f"R² = {r2_capm:.2%} explained by market; {1-r2_capm:.2%} is idiosyncratic risk")
# Full regression table
fit_capm.summary()
# =============================================================================
# STEP 7: Okun's Law — GDP growth vs unemployment change
# =============================================================================
# Okun (1962): each +1 point in unemployment → ≈ -2 points in GDP growth
url_gdp = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_GDPUNEMPLOY.DTA"
data_gdp = pd.read_stata(url_gdp)
fit_okun = pf.feols('rgdpgrowth ~ uratechange', data=data_gdp)
slope_okun = fit_okun.coef()['uratechange']
r2_okun = fit_okun._r2
print(f"Okun's Law: slope = {slope_okun:.2f} (Okun's original: -2.0)")
print(f"R² = {r2_okun:.4f} — unemployment explains {r2_okun*100:.0f}% of GDP growth variation")
# Scatter plot with fitted line
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data_gdp['uratechange'], data_gdp['rgdpgrowth'], s=50, alpha=0.7)
ax.plot(data_gdp['uratechange'], fit_okun.predict(), color='red', linewidth=2,
label=f'Fitted: slope = {slope_okun:.2f}')
ax.axhline(y=0, color='gray', linestyle=':', linewidth=1, alpha=0.5)
ax.set_xlabel('Change in unemployment rate (percentage points)')
ax.set_ylabel('Real GDP growth (%)')
ax.set_title(f"Okun's Law: GDP Growth vs Unemployment Change (R² = {r2_okun:.2f})")
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
* =============================================================================
* CHAPTER 8 CHEAT SHEET: Case Studies for Bivariate Regression
* =============================================================================
* --- Setup ---
clear all // start with a clean workspace
set more off // do not pause output for long results
* =============================================================================
* STEP 1: Load OECD health data from a URL
* =============================================================================
* use loads a Stata .dta file; "clear" drops any data already in memory
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HEALTH2009.DTA", clear
describe // list all variables, types, and labels
display "Observations: " _N // _N is Stata's built-in observation count
* =============================================================================
* STEP 2: Descriptive statistics — summarize before modeling
* =============================================================================
* summarize gives n, mean, std, min, max for key variables
summarize hlthpc lifeexp infmort gdppc
* "detail" adds median, skewness, kurtosis, and percentiles
summarize hlthpc, detail
* =============================================================================
* STEP 3: Health outcomes regression with robust standard errors
* =============================================================================
* Does higher health spending improve life expectancy?
* regress fits OLS: depvar followed by independent variables
regress lifeexp hlthpc
// After running regress, Stata stores results you can reference:
display "Slope on hlthpc: " _b[hlthpc]
display "R-squared: " e(r2)
display "Each extra $1,000 in spending → " _b[hlthpc]*1000 " more years of life"
// Robust standard errors adjust for heteroskedasticity (non-constant error variance)
// Adding ", robust" or ", vce(robust)" re-estimates with HC1 standard errors
regress lifeexp hlthpc, vce(robust)
* =============================================================================
* STEP 4: Health spending vs GDP — income elasticity
* =============================================================================
* How much of health spending is driven by national income?
regress hlthpc gdppc
// Income elasticity at the mean: (slope × mean_x) / mean_y
// After regress, _b[gdppc] holds the slope coefficient
summarize gdppc
local mean_gdp = r(mean)
summarize hlthpc
local mean_hlth = r(mean)
local elasticity = (_b[gdppc] * `mean_gdp') / `mean_hlth'
display "Income elasticity at the mean: " `elasticity' " (≈1.0 → normal good)"
* =============================================================================
* STEP 5: Outlier robustness — excluding USA and Luxembourg
* =============================================================================
* Two countries drive much of the model's "misfit" — test robustness
* First run with all 34 countries (already done above), then exclude outliers
// "if" restricts the estimation sample without dropping data
regress hlthpc gdppc if code != "USA" & code != "LUX"
display "Excluding USA/LUX: slope = " _b[gdppc] ", R² = " e(r2)
display "Removing 2 of 34 countries transforms R² — always check for influential observations!"
// Scatter with fitted line — all countries
predict hlthpc_hat // predict fitted values from last regression
twoway (scatter hlthpc gdppc, msize(small)) ///
(lfit hlthpc gdppc, lcolor(red) lwidth(medthick)), ///
title("All 34 Countries") xtitle("GDP per capita ($)") ///
ytitle("Health spending per capita ($)") legend(off) name(all, replace)
// Scatter with fitted line — excluding USA and Luxembourg
twoway (scatter hlthpc gdppc if code != "USA" & code != "LUX", msize(small)) ///
(lfit hlthpc gdppc if code != "USA" & code != "LUX", ///
lcolor(red) lwidth(medthick)), ///
title("Excluding USA & Luxembourg") xtitle("GDP per capita ($)") ///
ytitle("Health spending per capita ($)") legend(off) name(excl, replace)
graph combine all excl, title("Outlier Robustness Check")
* =============================================================================
* STEP 6: CAPM — estimating Coca-Cola's beta (systematic risk)
* =============================================================================
* Beta measures how a stock's excess return co-moves with the market
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_CAPM.DTA", clear
regress rko_rf rm_rf
// _b[_cons] is the intercept (alpha); _b[rm_rf] is the slope (beta)
display "Alpha (excess return): " _b[_cons]
display "Beta (systematic risk): " _b[rm_rf]
display "R-squared: " e(r2)
display "Beta < 1 → defensive stock (moves less than the market)"
// Scatter plot with fitted line
twoway (scatter rko_rf rm_rf, msize(small) mcolor(navy%60)) ///
(lfit rko_rf rm_rf, lcolor(red) lwidth(medthick)), ///
xtitle("Market excess return (%)") ytitle("Coca-Cola excess return (%)") ///
title("CAPM: Coca-Cola vs Market") legend(off)
* =============================================================================
* STEP 7: Okun's Law — GDP growth vs unemployment change
* =============================================================================
* Okun (1962): each +1 point in unemployment → ≈ -2 points in GDP growth
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_GDPUNEMPLOY.DTA", clear
regress rgdpgrowth uratechange
display "Okun's Law slope: " _b[uratechange] " (Okun's original: -2.0)"
display "R-squared: " e(r2)
// predict fitted values and residuals for plotting
predict gdp_hat // fitted values from last regression
predict gdp_resid, residuals // residuals = actual - fitted
// Scatter plot with fitted line
twoway (scatter rgdpgrowth uratechange, msize(small) mcolor(navy%60)) ///
(lfit rgdpgrowth uratechange, lcolor(red) lwidth(medthick)), ///
xtitle("Change in unemployment rate (pp)") ///
ytitle("Real GDP growth (%)") ///
title("Okun's Law: GDP Growth vs Unemployment Change") ///
yline(0, lcolor(gray) lpattern(dot)) legend(off)
Paste into your Stata do-file editor
# =============================================================================
# CHAPTER 8 CHEAT SHEET: Case Studies for Bivariate Regression
# =============================================================================
# --- Libraries ---
library(haven) # read Stata .dta files
library(fixest) # fast OLS estimation with feols()
library(dplyr) # data manipulation
library(ggplot2) # grammar of graphics
# =============================================================================
# STEP 1: Load OECD health data from a URL
# =============================================================================
url_health <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HEALTH2009.DTA"
data_health <- read_dta(url_health)
cat("Health dataset:", nrow(data_health), "countries,", ncol(data_health), "variables\n")
# =============================================================================
# STEP 2: Descriptive statistics — summarize before modeling
# =============================================================================
summary(data_health[, c("hlthpc", "lifeexp", "infmort", "gdppc")])
# =============================================================================
# STEP 3: Health outcomes regression with robust standard errors
# =============================================================================
# Does higher health spending improve life expectancy?
model_life <- feols(lifeexp ~ hlthpc, data = data_health, vcov = "HC1")
summary(model_life)
cat("Each extra $1,000 in spending \u2192",
round(coef(model_life)["hlthpc"] * 1000, 2), "more years of life\n")
# =============================================================================
# STEP 4: Health spending vs GDP — income elasticity
# =============================================================================
model_gdp <- feols(hlthpc ~ gdppc, data = data_health)
# Income elasticity at the mean: (slope x mean_x) / mean_y
elasticity <- coef(model_gdp)["gdppc"] * mean(data_health$gdppc) /
mean(data_health$hlthpc)
cat("Income elasticity at the mean:", round(elasticity, 2),
"(\u2248 1.0 \u2192 normal good)\n")
# =============================================================================
# STEP 5: Outlier robustness — excluding USA and Luxembourg
# =============================================================================
data_subset <- data_health |> filter(!code %in% c("USA", "LUX"))
model_subset <- feols(hlthpc ~ gdppc, data = data_subset)
cat("\nAll 34 countries: slope =", round(coef(model_gdp)["gdppc"], 4),
", R\u00b2 =", round(r2(model_gdp), 4), "\n")
cat("Excluding USA/LUX: slope =", round(coef(model_subset)["gdppc"], 4),
", R\u00b2 =", round(r2(model_subset), 4), "\n")
# etable() from fixest compares models side by side
etable(model_gdp, model_subset, headers = c("All", "Excl. USA/LUX"))
# =============================================================================
# STEP 6: CAPM — estimating Coca-Cola's beta (systematic risk)
# =============================================================================
url_capm <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_CAPM.DTA"
data_capm <- read_dta(url_capm)
model_capm <- feols(rko_rf ~ rm_rf, data = data_capm)
summary(model_capm)
cat("Alpha (excess return):", round(coef(model_capm)["(Intercept)"], 4), "\n")
cat("Beta (systematic risk):", round(coef(model_capm)["rm_rf"], 4), "\n")
cat("Beta < 1 \u2192 defensive stock (moves less than the market)\n")
# =============================================================================
# STEP 7: Okun's Law — GDP growth vs unemployment change
# =============================================================================
url_gdp <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_GDPUNEMPLOY.DTA"
data_gdp <- read_dta(url_gdp)
model_okun <- feols(rgdpgrowth ~ uratechange, data = data_gdp)
summary(model_okun)
cat("Okun's Law slope:", round(coef(model_okun)["uratechange"], 2),
"(Okun's original: -2.0)\n")
ggplot(data_gdp, aes(x = uratechange, y = rgdpgrowth)) +
geom_point(color = "steelblue", size = 2, alpha = 0.7) +
geom_smooth(method = "lm", formula = y ~ x, color = "red",
linewidth = 1.2, se = FALSE) +
geom_hline(yintercept = 0, color = "gray", linetype = "dotted") +
labs(x = "Change in unemployment rate (pp)", y = "Real GDP growth (%)",
title = "Okun's Law: GDP Growth vs Unemployment Change") +
theme_minimal()
Paste into your R console or RStudio