Code Summary
You've explored the key concepts interactively — now reproduce them in code. These self-contained blocks cover everything you practiced above. Pick your language, copy the code, and run it.
# =============================================================================
# CHAPTER 17 CHEAT SHEET: Panel Data, Time Series Data, Causation
# =============================================================================
# --- Libraries ---
import pandas as pd # data loading and manipulation
import numpy as np # numerical operations
import matplotlib.pyplot as plt # creating plots and visualizations
import pyfixest as pf # fast fixed-effects estimation
# !pip install pyfixest # uncomment if running in Google Colab
from statsmodels.tsa.stattools import acf # autocorrelation function
# =============================================================================
# STEP 1: Load panel data (NBA teams across seasons)
# =============================================================================
# Panel data: multiple individuals (teams) observed over multiple time periods
url_nba = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_NBA.DTA"
data_nba = pd.read_stata(url_nba)
print(f"Panel: {data_nba['teamid'].nunique()} teams × {data_nba['season'].nunique()} seasons = {len(data_nba)} obs")
# =============================================================================
# STEP 2: Variance decomposition — between vs within variation
# =============================================================================
# Understanding which variation your estimator uses is the first step in panel analysis
overall_sd = data_nba['lnrevenue'].std()
between_sd = data_nba.groupby('teamid')['lnrevenue'].mean().std()
within_sd = data_nba.groupby('teamid')['lnrevenue'].apply(lambda x: x - x.mean()).std()
print(f"\nVariance Decomposition of Log Revenue:")
print(f" Overall SD: {overall_sd:.4f}")
print(f" Between SD: {between_sd:.4f} (across teams)")
print(f" Within SD: {within_sd:.4f} (over time)")
print(f" Between > Within → team characteristics dominate year-to-year swings")
# =============================================================================
# STEP 3: Pooled OLS with cluster-robust SEs
# =============================================================================
# Observations within the same team are correlated over time — default SEs
# dramatically understate uncertainty. Always cluster by individual in panel data.
fit_pool = pf.feols('lnrevenue ~ wins', data=data_nba)
fit_cluster = pf.feols('lnrevenue ~ wins', data=data_nba, vcov={'CRV1': 'teamid'})
print(f"\nPooled OLS — wins coefficient: {fit_pool.coef()['wins']:.6f}")
print(f" Default SE: {fit_pool.se()['wins']:.6f}")
print(f" Cluster SE: {fit_cluster.se()['wins']:.6f}")
print(f" Ratio: {fit_cluster.se()['wins'] / fit_pool.se()['wins']:.2f}x larger")
# =============================================================================
# STEP 4: Fixed effects — control for unobserved team characteristics
# =============================================================================
# FE uses only within-team variation (de-meaning), eliminating bias from
# persistent traits like market size, brand value, and arena quality.
fit_fe = pf.feols('lnrevenue ~ wins | teamid', data=data_nba, vcov={'CRV1': 'teamid'})
print(f"\nFixed Effects — wins coefficient: {fit_fe.coef()['wins']:.6f}")
print(f" Cluster SE: {fit_fe.se()['wins']:.6f}")
print(f" R² (within): {fit_fe._r2_within:.4f}")
print(f"\nComparison:")
print(f" Pooled OLS coef: {fit_pool.coef()['wins']:.6f}")
print(f" Fixed Effects: {fit_fe.coef()['wins']:.6f}")
print(f" FE is smaller → pooled OLS had positive omitted variable bias")
# =============================================================================
# STEP 5: Time series — levels vs first differences
# =============================================================================
# Non-stationary (trending) series produce spurious regressions with misleading R².
# First differencing removes trends and restores valid inference.
url_rates = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_INTERESTRATES.DTA"
data_rates = pd.read_stata(url_rates)
# Regression in levels (potentially spurious)
fit_levels = pf.feols('gs10 ~ gs1', data=data_rates)
# Regression in first differences (removes trends)
fit_changes = pf.feols('dgs10 ~ dgs1', data=data_rates)
print(f"\nLevels regression: gs1 coef = {fit_levels.coef()['gs1']:.4f}, R² = {fit_levels._r2:.4f}")
print(f"Changes regression: dgs1 coef = {fit_changes.coef()['dgs1']:.4f}, R² = {fit_changes._r2:.4f}")
print(f"R² drops after differencing — lower but honest (no spurious trend inflation)")
# =============================================================================
# STEP 6: Autocorrelation diagnostics — the smoking gun
# =============================================================================
# Slowly decaying ACF in residuals signals non-stationarity and invalid SEs.
# After differencing, autocorrelation should drop dramatically.
acf_levels = acf(fit_levels._u_hat.dropna(), nlags=5)
acf_changes = acf(fit_changes._u_hat.dropna(), nlags=5)
print(f"\nResidual autocorrelation (lag 1):")
print(f" Levels regression: {acf_levels[1]:.4f} (high → non-stationary residuals)")
print(f" Changes regression: {acf_changes[1]:.4f} (much lower → differencing worked)")
# HAC (Newey-West) SEs correct for autocorrelation without differencing
data_rates['_time'] = range(len(data_rates))
fit_hac = pf.feols('gs10 ~ gs1', data=data_rates, vcov='NW',
vcov_kwargs={'time_id': '_time', 'lag': 24})
print(f"\nDefault SE on gs1: {fit_levels.se()['gs1']:.4f}")
print(f"HAC SE on gs1: {fit_hac.se()['gs1']:.4f}")
print(f"HAC is {fit_hac.se()['gs1'] / fit_levels.se()['gs1']:.1f}x larger — default SEs are too small")
# =============================================================================
# STEP 7: ADL model — dynamic multipliers
# =============================================================================
# Autoregressive distributed lag models capture how effects build over time.
# Lagged dependent and independent variables model persistence and transmission.
data_rates['dgs10_lag1'] = data_rates['dgs10'].shift(1)
data_rates['dgs10_lag2'] = data_rates['dgs10'].shift(2)
data_rates['dgs1_lag1'] = data_rates['dgs1'].shift(1)
data_rates['dgs1_lag2'] = data_rates['dgs1'].shift(2)
fit_adl = pf.feols('dgs10 ~ dgs10_lag1 + dgs10_lag2 + dgs1 + dgs1_lag1 + dgs1_lag2',
data=data_rates)
print(f"\nADL(2,2) Model:")
print(f" Impact multiplier (dgs1): {fit_adl.coef()['dgs1']:.4f}")
print(f" 1-month cumulative: {fit_adl.coef()['dgs1'] + fit_adl.coef()['dgs1_lag1']:.4f}")
print(f" 2-month cumulative: {fit_adl.coef()['dgs1'] + fit_adl.coef()['dgs1_lag1'] + fit_adl.coef()['dgs1_lag2']:.4f}")
print(f" R²: {fit_adl._r2:.4f} (much higher than static model)")
# Check residual autocorrelation — should be near zero if well-specified
acf_adl = acf(fit_adl._u_hat.dropna(), nlags=5)
print(f" Residual ACF(1): {acf_adl[1]:.4f} (near zero → dynamics captured)")
* =============================================================================
* CHAPTER 17 CHEAT SHEET: Panel Data, Time Series Data, Causation
* =============================================================================
* --- Setup ---
clear all // start with a clean workspace
set more off // do not pause output for long results
* =============================================================================
* STEP 1: Load panel data (NBA teams across seasons)
* =============================================================================
* Panel data: multiple individuals (teams) observed over multiple time periods
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_NBA.DTA", clear
describe
display "Panel: " ///
string(r(N)) " observations" // total team-season observations
codebook teamid, compact // how many unique teams
codebook season, compact // how many unique seasons
* =============================================================================
* STEP 2: Variance decomposition — between vs within variation
* =============================================================================
* Understanding which variation your estimator uses is the first step
* in panel analysis. xtsum decomposes each variable into overall,
* between (across teams), and within (over time) components.
xtset teamid season // declare panel structure
xtsum lnrevenue wins
// xtsum output shows three rows per variable:
// overall = total SD across all observations
// between = SD of team means
// within = SD of deviations from team means
* =============================================================================
* STEP 3: Pooled OLS with cluster-robust SEs
* =============================================================================
* Observations within the same team are correlated over time — default SEs
* dramatically understate uncertainty. Always cluster by individual in panel data.
// Default SEs (treats all 286 obs as independent — wrong for panels)
regress lnrevenue wins
estimates store pooled_default
// Robust SEs (corrects heteroskedasticity but not serial correlation)
regress lnrevenue wins, vce(robust)
estimates store pooled_robust
// Cluster SEs (corrects both heteroskedasticity and within-team correlation)
regress lnrevenue wins, vce(cluster teamid)
estimates store pooled_cluster
// Compare all three side by side
estimates table pooled_default pooled_robust pooled_cluster, se
* =============================================================================
* STEP 4: Fixed effects — control for unobserved team characteristics
* =============================================================================
* FE uses only within-team variation (de-meaning), eliminating bias from
* persistent traits like market size, brand value, and arena quality.
// xtreg with fe option estimates fixed effects (entity-demeaned regression)
// vce(cluster teamid) clusters SEs by team
xtreg lnrevenue wins, fe vce(cluster teamid)
estimates store fe_model
// Compare pooled OLS vs fixed effects
// After xtreg, _b[wins] holds the FE coefficient
display "Fixed Effects coef: " _b[wins]
estimates restore pooled_cluster
display "Pooled OLS coef: " _b[wins]
display "FE is smaller → pooled OLS had positive omitted variable bias"
// Report within-R² (shown automatically by xtreg, fe)
estimates restore fe_model
display "R² (within): " e(r2_w)
* =============================================================================
* STEP 5: Time series — levels vs first differences
* =============================================================================
* Non-stationary (trending) series produce spurious regressions with
* misleading R². First differencing removes trends and restores valid inference.
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_INTERESTRATES.DTA", clear
// Declare time-series structure
tsset daten
// Regression in levels (potentially spurious)
regress gs10 gs1
display "Levels R²: " e(r2)
// Regression in first differences (removes trends)
// D.gs10 and D.gs1 are Stata's time-series difference operators
regress D.gs10 D.gs1
display "Changes R²: " e(r2)
display "R² drops after differencing — lower but honest"
* =============================================================================
* STEP 6: Autocorrelation diagnostics — the smoking gun
* =============================================================================
* Slowly decaying ACF in residuals signals non-stationarity and invalid SEs.
* After differencing, autocorrelation should drop dramatically.
// Levels regression residuals
quietly regress gs10 gs1
predict resid_levels, residuals
corrgram resid_levels, lags(5) // correlogram shows ACF at each lag
display "High lag-1 ACF → non-stationary residuals"
// Changes regression residuals
quietly regress D.gs10 D.gs1
predict resid_changes, residuals
corrgram resid_changes, lags(5)
display "Lower lag-1 ACF → differencing worked"
// HAC (Newey-West) SEs correct for autocorrelation without differencing
// newey estimates OLS with Newey-West SEs; lag(24) sets bandwidth
newey gs10 gs1, lag(24)
display "HAC SE on gs1: " _se[gs1]
// Compare to default SE
quietly regress gs10 gs1
display "Default SE: " _se[gs1]
display "HAC is larger — default SEs are too small with autocorrelation"
* =============================================================================
* STEP 7: ADL model — dynamic multipliers
* =============================================================================
* Autoregressive distributed lag models capture how effects build over time.
* Lagged dependent and independent variables model persistence and transmission.
// Generate lagged differences using time-series operators
// L.D.gs10 = first lag of differenced gs10; L2.D.gs10 = second lag
regress D.gs10 L.D.gs10 L2.D.gs10 D.gs1 L.D.gs1 L2.D.gs1
display "ADL(2,2) Model:"
display " Impact multiplier (D.gs1): " _b[D.gs1]
display " 1-month cumulative: " _b[D.gs1] + _b[L.D.gs1]
display " 2-month cumulative: " _b[D.gs1] + _b[L.D.gs1] + _b[L2.D.gs1]
display " R²: " e(r2)
// Check residual autocorrelation — should be near zero if well-specified
predict resid_adl, residuals
corrgram resid_adl, lags(5)
display "Near-zero lag-1 ACF → dynamics captured"
Paste into your Stata do-file editor
# =============================================================================
# CHAPTER 17 CHEAT SHEET: Panel Data, Time Series Data, Causation
# =============================================================================
# --- Libraries ---
library(haven) # read Stata .dta files
library(fixest) # fast OLS, FE, and HAC estimation
library(dplyr) # data manipulation
library(ggplot2) # grammar of graphics
# =============================================================================
# STEP 1: Load panel data (NBA teams across seasons)
# =============================================================================
url_nba <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_NBA.DTA"
data_nba <- read_dta(url_nba)
cat("Panel:", n_distinct(data_nba$teamid), "teams ×",
n_distinct(data_nba$season), "seasons =", nrow(data_nba), "obs\n")
# =============================================================================
# STEP 2: Variance decomposition — between vs within variation
# =============================================================================
overall_sd <- sd(data_nba$lnrevenue)
between_sd <- data_nba |> group_by(teamid) |>
summarize(m = mean(lnrevenue)) |> pull(m) |> sd()
within_sd <- data_nba |> group_by(teamid) |>
mutate(demeaned = lnrevenue - mean(lnrevenue)) |>
pull(demeaned) |> sd()
cat("\nVariance Decomposition of Log Revenue:\n")
cat(" Overall SD: ", round(overall_sd, 4), "\n")
cat(" Between SD: ", round(between_sd, 4), "(across teams)\n")
cat(" Within SD: ", round(within_sd, 4), "(over time)\n")
# =============================================================================
# STEP 3: Pooled OLS with cluster-robust SEs
# =============================================================================
# Observations within the same team are correlated — always cluster in panels
model_pool <- feols(lnrevenue ~ wins, data = data_nba)
model_cluster <- feols(lnrevenue ~ wins, data = data_nba, vcov = ~teamid)
cat("\nPooled OLS — wins coefficient:", round(coef(model_pool)["wins"], 6), "\n")
cat(" Default SE: ", round(se(model_pool)["wins"], 6), "\n")
cat(" Cluster SE: ", round(se(model_cluster)["wins"], 6), "\n")
cat(" Ratio: ", round(se(model_cluster)["wins"] / se(model_pool)["wins"], 2), "x larger\n")
# =============================================================================
# STEP 4: Fixed effects — control for unobserved team characteristics
# =============================================================================
# FE uses only within-team variation (de-meaning), eliminating bias from
# persistent traits like market size, brand value, and arena quality.
# fixest syntax: y ~ x | entity (the | separates FE from regressors)
model_fe <- feols(lnrevenue ~ wins | teamid, data = data_nba, vcov = ~teamid)
summary(model_fe)
cat("\nComparison:\n")
cat(" Pooled OLS coef:", round(coef(model_pool)["wins"], 6), "\n")
cat(" Fixed Effects: ", round(coef(model_fe)["wins"], 6), "\n")
cat(" FE is smaller → pooled OLS had positive omitted variable bias\n")
# etable() compares models side by side
etable(model_cluster, model_fe, headers = c("Pooled (cluster)", "FE (cluster)"))
# =============================================================================
# STEP 5: Time series — levels vs first differences
# =============================================================================
url_rates <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_INTERESTRATES.DTA"
data_rates <- read_dta(url_rates)
# Regression in levels (potentially spurious)
model_levels <- feols(gs10 ~ gs1, data = data_rates)
# Regression in first differences (removes trends)
model_changes <- feols(dgs10 ~ dgs1, data = data_rates)
cat("\nLevels regression: R² =", round(r2(model_levels), 4), "\n")
cat("Changes regression: R² =", round(r2(model_changes), 4), "\n")
cat("R² drops after differencing — lower but honest\n")
# =============================================================================
# STEP 6: Autocorrelation diagnostics — the smoking gun
# =============================================================================
acf_levels <- acf(residuals(model_levels), lag.max = 5, plot = FALSE)$acf[2]
acf_changes <- acf(na.omit(residuals(model_changes)), lag.max = 5, plot = FALSE)$acf[2]
cat("\nResidual autocorrelation (lag 1):\n")
cat(" Levels: ", round(acf_levels, 4), "(high → non-stationary)\n")
cat(" Changes: ", round(acf_changes, 4), "(much lower → differencing worked)\n")
# HAC (Newey-West) SEs correct for autocorrelation
model_hac <- feols(gs10 ~ gs1, data = data_rates, vcov = NW(24) ~ daten)
cat("\nDefault SE on gs1:", round(se(model_levels)["gs1"], 4), "\n")
cat("HAC SE on gs1: ", round(se(model_hac)["gs1"], 4), "\n")
cat("HAC is", round(se(model_hac)["gs1"] / se(model_levels)["gs1"], 1),
"x larger\n")
# =============================================================================
# STEP 7: ADL model — dynamic multipliers
# =============================================================================
data_rates <- data_rates |>
mutate(dgs10_lag1 = lag(dgs10, 1), dgs10_lag2 = lag(dgs10, 2),
dgs1_lag1 = lag(dgs1, 1), dgs1_lag2 = lag(dgs1, 2))
model_adl <- feols(dgs10 ~ dgs10_lag1 + dgs10_lag2 + dgs1 + dgs1_lag1 + dgs1_lag2,
data = data_rates)
cat("\nADL(2,2) Model:\n")
cat(" Impact multiplier (dgs1): ", round(coef(model_adl)["dgs1"], 4), "\n")
cat(" 1-month cumulative: ",
round(coef(model_adl)["dgs1"] + coef(model_adl)["dgs1_lag1"], 4), "\n")
cat(" 2-month cumulative: ",
round(sum(coef(model_adl)[c("dgs1","dgs1_lag1","dgs1_lag2")]), 4), "\n")
cat(" R²:", round(r2(model_adl), 4), "\n")
# Residual ACF — near zero means dynamics are captured
acf_adl <- acf(na.omit(residuals(model_adl)), lag.max = 5, plot = FALSE)$acf[2]
cat(" Residual ACF(1):", round(acf_adl, 4), "(near zero → dynamics captured)\n")
Paste into your R console or RStudio