Code Summary
You've explored the key concepts interactively — now reproduce them in code. These self-contained blocks cover everything you practiced above. Pick your language, copy the code, and run it.
# =============================================================================
# CHAPTER 16 CHEAT SHEET: Checking the Model and Data
# =============================================================================
# --- Libraries ---
import numpy as np # numerical operations
import pandas as pd # data loading and manipulation
import matplotlib.pyplot as plt # creating plots and visualizations
import pyfixest as pf # fast fixed-effects estimation
# !pip install pyfixest # uncomment if running in Google Colab
import statsmodels.api as sm # add_constant for VIF calculation
import statsmodels.formula.api as smf # OLS for diagnostics only
from statsmodels.stats.outliers_influence import ( # diagnostic tools:
variance_inflation_factor, OLSInfluence) # VIF and influence measures
from statsmodels.nonparametric.smoothers_lowess import lowess # LOWESS smooth for residual plots
# =============================================================================
# STEP 1: Load data
# =============================================================================
# Two datasets: earnings (cross-section) and democracy (cross-country)
url_base = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/"
data_earnings = pd.read_stata(url_base + "AED_EARNINGS_COMPLETE.DTA")
data_democracy = pd.read_stata(url_base + "AED_DEMOCRACY.DTA")
print(f"Earnings: {data_earnings.shape[0]} workers, {data_earnings.shape[1]} variables")
print(f"Democracy: {data_democracy.shape[0]} countries, {data_democracy.shape[1]} variables")
# =============================================================================
# STEP 2: Detect multicollinearity with VIF
# =============================================================================
# VIF_j = 1/(1 - R_j^2): measures how much SE inflates due to collinearity
# VIF > 10 = serious problem; VIF > 5 = investigate further
X_vif = data_earnings[['age', 'education', 'agebyeduc']].copy()
X_vif = sm.add_constant(X_vif)
vif_data = pd.DataFrame({
'Variable': X_vif.columns,
'VIF': [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]
})
print("\nVariance Inflation Factors (with interaction term):")
print(vif_data.to_string(index=False))
# =============================================================================
# STEP 3: Compare standard vs robust standard errors
# =============================================================================
# Heteroskedasticity makes default SEs too small -> use HC1 (White) robust SEs
fit_std = pf.feols('earnings ~ age + education', data=data_earnings)
fit_robust = pf.feols('earnings ~ age + education', data=data_earnings, vcov='HC1')
se_comparison = pd.DataFrame({
'Variable': list(fit_std.coef().index),
'Standard SE': fit_std.se().values.round(2),
'Robust SE': fit_robust.se().values.round(2),
'Ratio': (fit_robust.se() / fit_std.se()).values.round(3)
})
print("\nSE Comparison (ratio > 1 signals heteroskedasticity):")
print(se_comparison.to_string(index=False))
# =============================================================================
# STEP 4: Omitted variable bias — democracy and growth
# =============================================================================
# Adding controls reveals how much the bivariate estimate was biased upward
fit_bivariate = pf.feols('democracy ~ growth', data=data_democracy, vcov='HC1')
fit_multiple = pf.feols('democracy ~ growth + constraint + indcent + catholic + muslim + protestant',
data=data_democracy, vcov='HC1')
b_biv = fit_bivariate.coef()['growth']
b_mult = fit_multiple.coef()['growth']
print(f"\nGrowth coefficient (bivariate): {b_biv:.4f}")
print(f"Growth coefficient (with controls): {b_mult:.4f}")
print(f"Reduction: {(1 - b_mult/b_biv)*100:.0f}% — institutional controls absorb the bias")
# =============================================================================
# STEP 5: Diagnostic plots — residual vs fitted
# =============================================================================
# Random scatter around zero = assumptions OK; fan shape = heteroskedasticity
uhat = fit_multiple._u_hat
yhat = fit_multiple.predict()
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Panel A: Actual vs Fitted
axes[0].scatter(yhat, data_democracy['democracy'], s=50, alpha=0.6)
axes[0].plot([yhat.min(), yhat.max()], [yhat.min(), yhat.max()],
'r-', linewidth=2, label='45° line')
axes[0].set_xlabel('Fitted Democracy')
axes[0].set_ylabel('Actual Democracy')
axes[0].set_title('Actual vs Fitted')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# Panel B: Residual vs Fitted (with LOWESS smooth)
axes[1].scatter(yhat, uhat, s=50, alpha=0.6)
axes[1].axhline(y=0, color='gray', linewidth=1)
lw = lowess(uhat, yhat, frac=0.3) # LOWESS reveals hidden patterns
axes[1].plot(lw[:, 0], lw[:, 1], 'r--', linewidth=2, label='LOWESS smooth')
axes[1].set_xlabel('Fitted Democracy')
axes[1].set_ylabel('Residual')
axes[1].set_title('Residual vs Fitted')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# =============================================================================
# STEP 6: Influential observations — DFITS
# =============================================================================
# DFITS_i measures how much prediction i changes when observation i is excluded
# Threshold: |DFITS| > 2*sqrt(k/n)
# OLSInfluence requires a statsmodels fitted model (diagnostics only)
_diag_model = smf.ols('democracy ~ growth + constraint + indcent + catholic + muslim + protestant',
data=data_democracy).fit()
influence = OLSInfluence(_diag_model)
dfits = influence.dffits[0]
n = len(data_democracy)
k = len(fit_multiple.coef())
threshold = 2 * np.sqrt(k / n)
print(f"\nDFITS threshold: {threshold:.4f}")
print(f"Observations exceeding threshold: {np.sum(np.abs(dfits) > threshold)} out of {n}")
fig, ax = plt.subplots(figsize=(10, 6))
colors = ['red' if abs(d) > threshold else 'steelblue' for d in dfits]
ax.scatter(range(n), dfits, c=colors, s=40, alpha=0.7)
ax.axhline(y=threshold, color='red', linestyle='--', label=f'Threshold ±{threshold:.3f}')
ax.axhline(y=-threshold, color='red', linestyle='--')
ax.axhline(y=0, color='gray', linewidth=0.5)
ax.set_xlabel('Observation Index')
ax.set_ylabel('DFITS')
ax.set_title('Influential Observations (DFITS)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
* =============================================================================
* CHAPTER 16 CHEAT SHEET: Checking the Model and Data
* =============================================================================
* --- Setup ---
clear all // start with a clean workspace
set more off // do not pause output for long results
* =============================================================================
* STEP 1: Load data
* =============================================================================
* Two datasets: earnings (cross-section) and democracy (cross-country)
* use loads a Stata .dta file; "clear" drops any data already in memory
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_EARNINGS_COMPLETE.DTA", clear
describe // list all variables, types, and labels
display "Observations: " _N // _N is Stata's built-in observation count
* =============================================================================
* STEP 2: Detect multicollinearity with VIF
* =============================================================================
* VIF_j = 1/(1 - R_j^2): measures how much SE inflates due to collinearity
* VIF > 10 = serious problem; VIF > 5 = investigate further
// First run the regression that includes the interaction term
regress earnings age education agebyeduc
// estat vif computes VIF for each regressor after any regression
estat vif
// Interpretation: VIF near 1 = no collinearity; VIF > 10 = serious problem
// The interaction term (agebyeduc) will show a very high VIF because it is
// nearly a linear combination of age and education
* =============================================================================
* STEP 3: Compare standard vs robust standard errors
* =============================================================================
* Heteroskedasticity makes default SEs too small -> use robust (HC1) SEs
// Model with standard (homoskedastic) SEs
regress earnings age education
estimates store model_std
// Same model with robust (White/HC1) SEs
regress earnings age education, vce(robust)
estimates store model_robust
// Compare both sets of SEs side by side
estimates table model_std model_robust, se stats(N r2)
// Ratio > 1 signals heteroskedasticity: robust SEs are larger than standard SEs
* =============================================================================
* STEP 4: Omitted variable bias — democracy and growth
* =============================================================================
* Adding controls reveals how much the bivariate estimate was biased upward
// Switch to the democracy dataset
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_DEMOCRACY.DTA", clear
describe
// Bivariate regression: democracy on growth alone
regress democracy growth, vce(robust)
estimates store bivariate
scalar b_biv = _b[growth]
// Multiple regression: add institutional and religious controls
regress democracy growth constraint indcent catholic muslim protestant, vce(robust)
estimates store multiple
scalar b_mult = _b[growth]
// Display the reduction in the growth coefficient
display "Growth coefficient (bivariate): " b_biv
display "Growth coefficient (with controls): " b_mult
display "Reduction: " (1 - b_mult/b_biv)*100 "% — institutional controls absorb the bias"
// Compare both models side by side
estimates table bivariate multiple, se stats(N r2)
* =============================================================================
* STEP 5: Diagnostic plots — residual vs fitted
* =============================================================================
* Random scatter around zero = assumptions OK; fan shape = heteroskedasticity
// Re-run the multiple regression (needed for predict)
quietly regress democracy growth constraint indcent catholic muslim protestant, vce(robust)
// Generate fitted values and residuals
predict yhat, xb // fitted values (X*beta)
predict uhat, residuals // OLS residuals (y - X*beta)
// Panel A: Actual vs Fitted with 45-degree line
twoway (scatter democracy yhat, msymbol(Oh) mcolor(navy)) ///
(line yhat yhat, lcolor(red) lwidth(medthick)), ///
xtitle("Fitted Democracy") ytitle("Actual Democracy") ///
title("Actual vs Fitted") ///
legend(order(1 "Observations" 2 "45° line"))
// Panel B: Residual vs Fitted with LOWESS smooth
twoway (scatter uhat yhat, msymbol(Oh) mcolor(navy)) ///
(lowess uhat yhat, lcolor(red) lwidth(medthick) lpattern(dash)), ///
xtitle("Fitted Democracy") ytitle("Residual") ///
title("Residual vs Fitted") ///
yline(0, lcolor(gs10) lwidth(thin)) ///
legend(order(1 "Residuals" 2 "LOWESS smooth"))
* =============================================================================
* STEP 6: Influential observations — DFITS
* =============================================================================
* DFITS_i measures how much prediction i changes when observation i is excluded
* Threshold: |DFITS| > 2*sqrt(k/n)
// predict with dfits option generates DFITS after a regression
predict dfits_val, dfits
// Compute the threshold
local k = e(df_m) + 1 // number of parameters (including intercept)
local n = e(N) // number of observations
local threshold = 2 * sqrt(`k' / `n')
display "DFITS threshold: " `threshold'
// Count observations exceeding the threshold
count if abs(dfits_val) > `threshold'
display "Observations exceeding threshold: " r(N) " out of " `n'
// Plot DFITS with threshold lines
gen obs_id = _n // observation index for x-axis
twoway (scatter dfits_val obs_id if abs(dfits_val) <= `threshold', ///
msymbol(Oh) mcolor(navy) msize(small)) ///
(scatter dfits_val obs_id if abs(dfits_val) > `threshold', ///
msymbol(Oh) mcolor(red) msize(small)), ///
yline(`threshold', lcolor(red) lpattern(dash)) ///
yline(-`threshold', lcolor(red) lpattern(dash)) ///
yline(0, lcolor(gs10) lwidth(thin)) ///
xtitle("Observation Index") ytitle("DFITS") ///
title("Influential Observations (DFITS)") ///
legend(order(1 "Below threshold" 2 "Above threshold"))
Paste into your Stata do-file editor
# =============================================================================
# CHAPTER 16 CHEAT SHEET: Checking the Model and Data
# =============================================================================
# --- Libraries ---
library(haven) # read Stata .dta files
library(fixest) # fast OLS estimation with feols()
library(dplyr) # data manipulation
library(ggplot2) # grammar of graphics
library(car) # vif() for multicollinearity detection
# =============================================================================
# STEP 1: Load data
# =============================================================================
url_base <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/"
data_earnings <- read_dta(paste0(url_base, "AED_EARNINGS_COMPLETE.DTA"))
data_democracy <- read_dta(paste0(url_base, "AED_DEMOCRACY.DTA"))
cat("Earnings: ", nrow(data_earnings), "workers\n")
cat("Democracy:", nrow(data_democracy), "countries\n")
# =============================================================================
# STEP 2: Detect multicollinearity with VIF
# =============================================================================
# VIF_j = 1/(1 - R_j^2); VIF > 10 = serious; VIF > 5 = investigate
model_vif <- lm(earnings ~ age + education + agebyeduc, data = data_earnings)
vif(model_vif)
# =============================================================================
# STEP 3: Compare standard vs robust standard errors
# =============================================================================
model_std <- feols(earnings ~ age + education, data = data_earnings)
model_robust <- feols(earnings ~ age + education, data = data_earnings, vcov = "HC1")
etable(model_std, model_robust, headers = c("Standard", "Robust (HC1)"))
# =============================================================================
# STEP 4: Omitted variable bias — democracy and growth
# =============================================================================
m_biv <- feols(democracy ~ growth, data = data_democracy, vcov = "HC1")
m_mult <- feols(democracy ~ growth + constraint + indcent + catholic + muslim + protestant,
data = data_democracy, vcov = "HC1")
cat("\nGrowth coefficient (bivariate): ", round(coef(m_biv)["growth"], 4), "\n")
cat("Growth coefficient (with controls):", round(coef(m_mult)["growth"], 4), "\n")
cat("Reduction:", round((1 - coef(m_mult)["growth"]/coef(m_biv)["growth"])*100, 0),
"% — institutional controls absorb the bias\n")
etable(m_biv, m_mult, headers = c("Bivariate", "With controls"))
# =============================================================================
# STEP 5: Diagnostic plots — residual vs fitted
# =============================================================================
data_democracy$yhat <- fitted(m_mult)
data_democracy$resid <- residuals(m_mult)
library(patchwork)
p1 <- ggplot(data_democracy, aes(x = yhat, y = democracy)) +
geom_point(alpha = 0.6) +
geom_abline(color = "red", linetype = "dashed", linewidth = 1.2) +
labs(x = "Fitted Democracy", y = "Actual Democracy",
title = "Actual vs Fitted") +
theme_minimal()
p2 <- ggplot(data_democracy, aes(x = yhat, y = resid)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = 0, color = "gray") +
geom_smooth(method = "loess", formula = y ~ x, color = "red",
linetype = "dashed", se = FALSE) +
labs(x = "Fitted Democracy", y = "Residual",
title = "Residual vs Fitted") +
theme_minimal()
p1 + p2
# =============================================================================
# STEP 6: Influential observations — DFITS
# =============================================================================
# DFITS_i measures how much prediction i changes when obs i is excluded
model_lm <- lm(democracy ~ growth + constraint + indcent + catholic + muslim + protestant,
data = data_democracy)
dfits_val <- dffits(model_lm)
n <- nrow(data_democracy)
k <- length(coef(model_lm))
threshold <- 2 * sqrt(k / n)
cat("\nDFITS threshold:", round(threshold, 4), "\n")
cat("Observations exceeding threshold:", sum(abs(dfits_val) > threshold), "out of", n, "\n")
ggplot(data.frame(obs = 1:n, dfits = dfits_val), aes(x = obs, y = dfits)) +
geom_point(aes(color = abs(dfits) > threshold), size = 2, alpha = 0.7) +
scale_color_manual(values = c("FALSE" = "steelblue", "TRUE" = "red"), guide = "none") +
geom_hline(yintercept = c(-threshold, threshold), color = "red", linetype = "dashed") +
geom_hline(yintercept = 0, color = "gray", linewidth = 0.5) +
labs(x = "Observation Index", y = "DFITS",
title = "Influential Observations (DFITS)") +
theme_minimal()
Paste into your R console or RStudio