Code Summary
You've explored the key concepts interactively — now reproduce them in code. These self-contained blocks cover everything you practiced above. Pick your language, copy the code, and run it.
# =============================================================================
# CHAPTER 6 CHEAT SHEET: The Least Squares Estimator
# =============================================================================
# --- Libraries ---
import numpy as np # random sampling and numerical operations
import pandas as pd # data manipulation
import matplotlib.pyplot as plt # creating plots and visualizations
import pyfixest as pf # OLS regression with R-style formulas
# !pip install pyfixest # uncomment if running in Google Colab
# =============================================================================
# STEP 1: Define the Data-Generating Process (DGP)
# =============================================================================
# The DGP specifies the TRUE population relationship: y = b1 + b2x + u
# We know the true parameters — in real research, we never do!
beta_1_true = 1 # true intercept
beta_2_true = 2 # true slope
sigma_u = 2 # error standard deviation
# Generate one sample of n observations
np.random.seed(42)
n = 30
x = np.random.normal(3, 1, n)
u = np.random.normal(0, sigma_u, n)
y = beta_1_true + beta_2_true * x + u
data = pd.DataFrame({'x': x, 'y': y})
print(f"Generated sample: {n} observations from y = {beta_1_true} + {beta_2_true}x + u")
# =============================================================================
# STEP 2: Fit OLS and compare sample vs. population parameters
# =============================================================================
# The sample regression estimates the unknown population line from data
fit = pf.feols('y ~ x', data=data)
b1 = fit.coef()['Intercept']
b2 = fit.coef()['x']
print(f"\nPopulation: E[y|x] = {beta_1_true} + {beta_2_true}x")
print(f"Sample: y_hat = {b1:.2f} + {b2:.2f}x")
print(f"Sampling error in slope: b2 - b2_true = {b2 - beta_2_true:.4f}")
# Full regression table (coefficients, std errors, t-stats, p-values, R2)
fit.summary()
# =============================================================================
# STEP 3: Scatter plot — population line vs. sample line
# =============================================================================
# Visualizing the gap between the true line and our estimate
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data['x'], data['y'], s=50, alpha=0.7, label='Observed data')
ax.plot(data['x'], fit.predict(), color='red', linewidth=2,
label=f'Sample: y_hat = {b1:.2f} + {b2:.2f}x')
x_range = np.linspace(data['x'].min(), data['x'].max(), 100)
ax.plot(x_range, beta_1_true + beta_2_true * x_range,
color='green', linewidth=2, linestyle='--',
label=f'Population: E[y|x] = {beta_1_true} + {beta_2_true}x')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Population Regression vs. Sample Regression')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# =============================================================================
# STEP 4: Monte Carlo simulation — demonstrate unbiasedness
# =============================================================================
# Draw many samples from the SAME DGP to see how b2 varies
# Unbiasedness: on average, b2 equals the true b2
n_simulations = 1000
b2_estimates = []
for i in range(n_simulations):
x_sim = np.random.normal(3, 1, n)
u_sim = np.random.normal(0, sigma_u, n)
y_sim = beta_1_true + beta_2_true * x_sim + u_sim
df_sim = pd.DataFrame({'x': x_sim, 'y': y_sim})
m = pf.feols('y ~ x', data=df_sim)
b2_estimates.append(m.coef()['x'])
print(f"\nMonte Carlo results ({n_simulations} simulations, n={n} each):")
print(f" True b2: {beta_2_true}")
print(f" Mean of b2 estimates: {np.mean(b2_estimates):.4f} (approx b2, confirming unbiasedness)")
print(f" Std dev of estimates: {np.std(b2_estimates):.4f} (empirical standard error)")
# =============================================================================
# STEP 5: Visualize the sampling distribution of b2
# =============================================================================
# The histogram should be centered on b2 (unbiasedness) and bell-shaped (CLT)
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(b2_estimates, bins=40, density=True, alpha=0.7, edgecolor='white',
label=f'{n_simulations} estimates of b2')
ax.axvline(beta_2_true, color='green', linewidth=2, linestyle='--',
label=f'True b2 = {beta_2_true}')
ax.axvline(np.mean(b2_estimates), color='red', linewidth=2,
label=f'Mean of estimates = {np.mean(b2_estimates):.4f}')
ax.set_xlabel('Slope estimate (b2)')
ax.set_ylabel('Density')
ax.set_title('Sampling Distribution of b2: Unbiasedness + CLT')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# =============================================================================
# STEP 6: Standard error — what controls precision?
# =============================================================================
# se(b2) = s_e / sqrt[sum((xi - x_bar)^2)]
# Smaller when: (1) model fits well, (2) large n, (3) x spread wide
se_b2 = fit.se()['x'] # from regression output
s_e = np.sqrt(np.sum(fit._u_hat**2) / (n - 2)) # standard error of regression
x_variation = np.sum((data['x'] - data['x'].mean())**2)
print(f"\nStandard error anatomy (from the single-sample regression):")
print(f" s_e (root MSE): {s_e:.4f}")
print(f" sum((xi - x_bar)^2): {x_variation:.4f}")
print(f" se(b2) = s_e / sqrt(sum) = {s_e / np.sqrt(x_variation):.4f}")
print(f" se(b2) from output: {se_b2:.4f}")
# =============================================================================
# STEP 7: Effect of sample size on precision
# =============================================================================
# Theory: se(b2) proportional to 1/sqrt(n) — doubling n cuts SE by ~30%, quadrupling halves it
sample_sizes = [20, 50, 100, 200]
print(f"\n{'n':>6} {'Mean b2':>10} {'Std dev (empirical SE)':>22}")
print("-" * 42)
for ns in sample_sizes:
estimates = []
for _ in range(1000):
xs = np.random.normal(3, 1, ns)
us = np.random.normal(0, sigma_u, ns)
ys = beta_1_true + beta_2_true * xs + us
m = pf.feols('y ~ x', data=pd.DataFrame({'x': xs, 'y': ys}))
estimates.append(m.coef()['x'])
print(f"{ns:>6} {np.mean(estimates):>10.4f} {np.std(estimates):>22.4f}")
* =============================================================================
* CHAPTER 6 CHEAT SHEET: The Least Squares Estimator
* =============================================================================
* --- Setup ---
clear all // start with a clean workspace
set more off // do not pause output for long results
* =============================================================================
* STEP 1: Define the Data-Generating Process (DGP)
* =============================================================================
* The DGP specifies the TRUE population relationship: y = b1 + b2*x + u
* We know the true parameters — in real research, we never do!
local beta_1_true = 1 // true intercept
local beta_2_true = 2 // true slope
local sigma_u = 2 // error standard deviation
* Generate one sample of n observations
set seed 42
set obs 30
gen x = rnormal(3, 1)
gen u = rnormal(0, `sigma_u')
gen y = `beta_1_true' + `beta_2_true' * x + u
display "Generated sample: " _N " observations from y = `beta_1_true' + `beta_2_true'x + u"
* =============================================================================
* STEP 2: Fit OLS and compare sample vs. population parameters
* =============================================================================
* The sample regression estimates the unknown population line from data
regress y x
// Display comparison of population vs sample parameters
display _newline "Population: E[y|x] = `beta_1_true' + `beta_2_true'x"
display "Sample: y_hat = " %6.2f _b[_cons] " + " %6.2f _b[x] "x"
display "Sampling error in slope: b2 - b2_true = " %8.4f (_b[x] - `beta_2_true')
* =============================================================================
* STEP 3: Scatter plot — population line vs. sample line
* =============================================================================
* Visualizing the gap between the true line and our estimate
predict yhat // fitted values from the regression
// Create the population (true) line for comparison
gen y_true = `beta_1_true' + `beta_2_true' * x
twoway (scatter y x, msize(small) msymbol(circle)) ///
(line yhat x, sort lcolor(red) lwidth(medthick) ///
legend(label(2 "Sample: y_hat"))) ///
(line y_true x, sort lcolor(green) lwidth(medthick) lpattern(dash) ///
legend(label(3 "Population: E[y|x]"))), ///
xtitle("x") ytitle("y") ///
title("Population Regression vs. Sample Regression") ///
legend(order(1 "Observed data" 2 "Sample line" 3 "Population line"))
* =============================================================================
* STEP 4: Monte Carlo simulation — demonstrate unbiasedness
* =============================================================================
* Draw many samples from the SAME DGP to see how b2 varies
* Unbiasedness: on average, b2 equals the true b2
local n_sims = 1000
local n_obs = 30
// Program to run a single simulation
capture program drop one_sim
program define one_sim, rclass
args beta1 beta2 sigma n
drop _all
set obs `n'
gen x = rnormal(3, 1)
gen u = rnormal(0, `sigma')
gen y = `beta1' + `beta2' * x + u
regress y x
return scalar b2 = _b[x]
end
// Run the simulation
simulate b2 = r(b2), reps(`n_sims') seed(42): ///
one_sim `beta_1_true' `beta_2_true' `sigma_u' `n_obs'
summarize b2
display _newline "Monte Carlo results (`n_sims' simulations, n=`n_obs' each):"
display " True b2: `beta_2_true'"
display " Mean of b2 estimates: " %8.4f r(mean) " (approx b2, confirming unbiasedness)"
display " Std dev of estimates: " %8.4f r(sd) " (empirical standard error)"
* =============================================================================
* STEP 5: Visualize the sampling distribution of b2
* =============================================================================
* The histogram should be centered on b2 (unbiasedness) and bell-shaped (CLT)
histogram b2, kdensity ///
xline(`beta_2_true', lcolor(green) lwidth(medthick) lpattern(dash)) ///
xtitle("Slope estimate (b2)") ///
ytitle("Density") ///
title("Sampling Distribution of b2: Unbiasedness + CLT") ///
note("Green line = true b2 = `beta_2_true'")
* =============================================================================
* STEP 6: Standard error — what controls precision?
* =============================================================================
* se(b2) = s_e / sqrt[sum((xi - x_bar)^2)]
* Smaller when: (1) model fits well, (2) large n, (3) x spread wide
// Reload original sample for SE anatomy
clear all
set seed 42
set obs 30
gen x = rnormal(3, 1)
gen u = rnormal(0, 2)
gen y = 1 + 2 * x + u
regress y x
local se_b2 = _se[x] // standard error of slope
local s_e = e(rmse) // root MSE (std error of regression)
// Compute sum of squared deviations of x
summarize x
local x_var = r(Var) * (r(N) - 1) // sum((xi - x_bar)^2)
display _newline "Standard error anatomy (from the single-sample regression):"
display " s_e (root MSE): " %8.4f `s_e'
display " sum((xi - x_bar)^2): " %8.4f `x_var'
display " se(b2) = s_e / sqrt(sum) = " %8.4f (`s_e' / sqrt(`x_var'))
display " se(b2) from output: " %8.4f `se_b2'
* =============================================================================
* STEP 7: Effect of sample size on precision
* =============================================================================
* Theory: se(b2) proportional to 1/sqrt(n)
* Doubling n cuts SE by ~30%, quadrupling halves it
display _newline " n Mean b2 Std dev (empirical SE)"
display "==========================================
foreach ns in 20 50 100 200 {
capture program drop sim_n
program define sim_n, rclass
args beta1 beta2 sigma n
drop _all
set obs `n'
gen x = rnormal(3, 1)
gen u = rnormal(0, `sigma')
gen y = `beta1' + `beta2' * x + u
regress y x
return scalar b2 = _b[x]
end
quietly simulate b2 = r(b2), reps(1000) seed(42): ///
sim_n 1 2 2 `ns'
quietly summarize b2
display %6.0f `ns' " " %10.4f r(mean) " " %22.4f r(sd)
}
Paste into your Stata do-file editor
# =============================================================================
# CHAPTER 6 CHEAT SHEET: The Least Squares Estimator
# =============================================================================
# --- Libraries ---
library(fixest) # fast OLS estimation with feols()
library(ggplot2) # grammar of graphics for all plots
library(dplyr) # data manipulation
# =============================================================================
# STEP 1: Define the Data-Generating Process (DGP)
# =============================================================================
# The DGP specifies the TRUE population relationship: y = b1 + b2*x + u
# We know the true parameters — in real research, we never do!
beta_1_true <- 1 # true intercept
beta_2_true <- 2 # true slope
sigma_u <- 2 # error standard deviation
# Generate one sample of n observations
set.seed(42)
n <- 30
x <- rnorm(n, mean = 3, sd = 1)
u <- rnorm(n, mean = 0, sd = sigma_u)
y <- beta_1_true + beta_2_true * x + u
data <- data.frame(x = x, y = y)
cat("Generated sample:", n, "observations from y =", beta_1_true, "+", beta_2_true, "x + u\n")
# =============================================================================
# STEP 2: Fit OLS and compare sample vs. population parameters
# =============================================================================
model <- feols(y ~ x, data = data)
summary(model)
b1 <- coef(model)["(Intercept)"]
b2 <- coef(model)["x"]
cat("\nPopulation: E[y|x] =", beta_1_true, "+", beta_2_true, "x\n")
cat("Sample: y_hat =", round(b1, 2), "+", round(b2, 2), "x\n")
cat("Sampling error in slope: b2 - b2_true =", round(b2 - beta_2_true, 4), "\n")
# =============================================================================
# STEP 3: Scatter plot — population line vs. sample line
# =============================================================================
ggplot(data, aes(x = x, y = y)) +
geom_point(size = 2, alpha = 0.7) +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE,
color = "red", linewidth = 1.2) +
geom_abline(intercept = beta_1_true, slope = beta_2_true,
color = "green", linewidth = 1.2, linetype = "dashed") +
labs(x = "x", y = "y",
title = "Population Regression vs. Sample Regression") +
theme_minimal()
# =============================================================================
# STEP 4: Monte Carlo simulation — demonstrate unbiasedness
# =============================================================================
# Draw many samples from the SAME DGP to see how b2 varies
set.seed(42)
n_sims <- 1000
# replicate() runs the simulation n_sims times
b2_estimates <- replicate(n_sims, {
x_sim <- rnorm(n, mean = 3, sd = 1)
u_sim <- rnorm(n, mean = 0, sd = sigma_u)
y_sim <- beta_1_true + beta_2_true * x_sim + u_sim
coef(feols(y_sim ~ x_sim))["x_sim"]
})
cat("\nMonte Carlo results (", n_sims, "simulations, n =", n, "each):\n")
cat(" True b2: ", beta_2_true, "\n")
cat(" Mean of b2 estimates:", round(mean(b2_estimates), 4),
" (confirming unbiasedness)\n")
cat(" Std dev of estimates:", round(sd(b2_estimates), 4),
" (empirical standard error)\n")
# =============================================================================
# STEP 5: Visualize the sampling distribution of b2
# =============================================================================
ggplot(data.frame(b2 = b2_estimates), aes(x = b2)) +
geom_histogram(bins = 40, fill = "steelblue", color = "white", alpha = 0.7) +
geom_vline(xintercept = beta_2_true, color = "green",
linewidth = 1.5, linetype = "dashed") +
geom_vline(xintercept = mean(b2_estimates), color = "red", linewidth = 1.2) +
labs(x = "Slope estimate (b2)", y = "Frequency",
title = "Sampling Distribution of b2: Unbiasedness + CLT") +
theme_minimal()
# =============================================================================
# STEP 6: Standard error — what controls precision?
# =============================================================================
# se(b2) = s_e / sqrt[sum((xi - x_bar)^2)]
se_b2 <- se(model)["x"]
s_e <- sigma(model) # residual standard error (root MSE)
x_variation <- sum((data$x - mean(data$x))^2)
cat("\nStandard error anatomy:\n")
cat(" s_e (root MSE): ", round(s_e, 4), "\n")
cat(" sum((xi - x_bar)^2): ", round(x_variation, 4), "\n")
cat(" se(b2) = s_e / sqrt(sum):", round(s_e / sqrt(x_variation), 4), "\n")
cat(" se(b2) from output: ", round(se_b2, 4), "\n")
# =============================================================================
# STEP 7: Effect of sample size on precision
# =============================================================================
# Theory: se(b2) proportional to 1/sqrt(n)
cat("\n n Mean b2 Std dev (empirical SE)\n")
for (ns in c(20, 50, 100, 200)) {
ests <- replicate(1000, {
xs <- rnorm(ns, 3, 1); us <- rnorm(ns, 0, sigma_u)
ys <- beta_1_true + beta_2_true * xs + us
coef(feols(ys ~ xs))["xs"]
})
cat(sprintf("%5d %10.4f %22.4f\n", ns, mean(ests), sd(ests)))
}
Paste into your R console or RStudio