Code Summary
You've explored the key concepts interactively — now reproduce them in code. These self-contained blocks cover everything you practiced above. Pick your language, copy the code, and run it.
# =============================================================================
# CHAPTER 14 CHEAT SHEET: Regression with Indicator Variables
# =============================================================================
# --- Libraries ---
import pandas as pd # data loading and manipulation
import numpy as np # numerical operations
import matplotlib.pyplot as plt # creating plots and visualizations
import pyfixest as pf # fast OLS estimation with feols()
# !pip install pyfixest # uncomment if running in Google Colab
from scipy import stats # t-tests for group comparisons
from scipy.stats import f_oneway # one-way ANOVA F-test
# =============================================================================
# STEP 1: Load data directly from a URL
# =============================================================================
# pd.read_stata() reads Stata .dta files — 872 full-time workers aged 25-65
url = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_EARNINGS_COMPLETE.DTA"
data = pd.read_stata(url)
print(f"Dataset: {data.shape[0]} observations, {data.shape[1]} variables")
# =============================================================================
# STEP 2: Descriptive statistics — compare earnings by gender
# =============================================================================
# Indicator variable: gender = 1 (female), gender = 0 (male)
mean_male = data[data['gender'] == 0]['earnings'].mean()
mean_female = data[data['gender'] == 1]['earnings'].mean()
diff_means = mean_female - mean_male
print(f"Mean earnings (Male): ${mean_male:,.2f}")
print(f"Mean earnings (Female): ${mean_female:,.2f}")
print(f"Difference (F - M): ${diff_means:,.2f}")
# =============================================================================
# STEP 3: Regression on a single indicator — equivalent to difference in means
# =============================================================================
# The intercept = mean for d=0 (males); the gender coefficient = mean difference
# IMPORTANT: vcov='HC1' uses robust standard errors
fit1 = pf.feols('earnings ~ gender', data=data, vcov='HC1')
intercept = fit1.coef()['Intercept'] # mean earnings for males
gap = fit1.coef()['gender'] # earnings gap (females - males)
r2 = fit1._r2
print(f"\nModel 1: earnings = {intercept:,.0f} + ({gap:,.0f}) × gender")
print(f"Raw gender gap: ${gap:,.0f} (females earn ${abs(gap):,.0f} less)")
print(f"R-squared: {r2:.4f} ({r2*100:.1f}% of variation explained)")
fit1.summary()
# =============================================================================
# STEP 4: Add controls and interaction — how the gap changes
# =============================================================================
# Adding education as a control measures the gap AFTER accounting for education
fit2 = pf.feols('earnings ~ gender + education', data=data, vcov='HC1')
# Adding gender×education interaction allows returns to education to differ by gender
fit3 = pf.feols('earnings ~ gender + education + genderbyeduc', data=data, vcov='HC1')
# Full model with additional controls
fit4 = pf.feols('earnings ~ gender + education + genderbyeduc + age + hours',
data=data, vcov='HC1')
# Compare how the gender coefficient evolves across models
print(f"{'Model':<12} {'Gender Coef':>14} {'R²':>8}")
print("-" * 36)
for name, f in [('Gender only', fit1), ('+ Education', fit2),
('+ Interact', fit3), ('+ Age,Hours', fit4)]:
g = f.coef()['gender']
print(f"{name:<12} {g:>14,.0f} {f._r2:>8.4f}")
# =============================================================================
# STEP 5: Scatter plot — visualize separate regression lines by gender
# =============================================================================
# Non-parallel lines indicate different slopes = interaction term is needed
fig, ax = plt.subplots(figsize=(10, 6))
for g, label, color in [(0, 'Male', 'tab:blue'), (1, 'Female', 'tab:red')]:
subset = data[data['gender'] == g]
ax.scatter(subset['education'], subset['earnings'], alpha=0.3, s=25,
label=label, color=color)
# Fit and plot regression line for each group
z = np.polyfit(subset['education'], subset['earnings'], 1)
edu_range = np.linspace(subset['education'].min(), subset['education'].max(), 100)
ax.plot(edu_range, np.poly1d(z)(edu_range), linewidth=2, color=color,
label=f'{label} slope: ${z[0]:,.0f}/yr')
ax.set_xlabel('Years of Education')
ax.set_ylabel('Earnings ($)')
ax.set_title('Earnings vs Education by Gender (non-parallel = interaction needed)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# =============================================================================
# STEP 6: Sets of indicators — worker type and the dummy variable trap
# =============================================================================
# Three mutually exclusive categories: dself, dprivate, dgovt (sum to 1)
# Drop one (dprivate = base) to avoid perfect multicollinearity
fit_worker = pf.feols('earnings ~ dself + dgovt + education + age',
data=data, vcov='HC1')
print(f"Base category: Private sector")
print(f"Self-employed vs Private: ${fit_worker.coef()['dself']:,.0f}")
print(f"Government vs Private: ${fit_worker.coef()['dgovt']:,.0f}")
print(f"R-squared: {fit_worker._r2:.4f}")
fit_worker.summary()
# =============================================================================
# STEP 7: ANOVA — test if earnings differ across worker types
# =============================================================================
# Regression on mutually exclusive indicators = analysis of variance (ANOVA)
group_self = data[data['dself'] == 1]['earnings']
group_priv = data[data['dprivate'] == 1]['earnings']
group_govt = data[data['dgovt'] == 1]['earnings']
f_stat, p_value = f_oneway(group_self, group_priv, group_govt)
print(f"\nANOVA F-statistic: {f_stat:.2f}, p-value: {p_value:.4f}")
# Group means with counts
data['worker_type'] = np.where(data['dself'] == 1, 'Self-employed',
np.where(data['dprivate'] == 1, 'Private', 'Government'))
print(data.groupby('worker_type')['earnings'].agg(['mean', 'count']).round(0))
* =============================================================================
* CHAPTER 14 CHEAT SHEET: Regression with Indicator Variables
* =============================================================================
* --- Setup ---
clear all // start with a clean workspace
set more off // do not pause output for long results
* =============================================================================
* STEP 1: Load data directly from a URL
* =============================================================================
* use loads a Stata .dta file; "clear" drops any data already in memory
* 872 full-time workers aged 25-65
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_EARNINGS_COMPLETE.DTA", clear
describe // list all variables, types, and labels
display "Observations: " _N // _N is Stata's built-in observation count
* =============================================================================
* STEP 2: Descriptive statistics — compare earnings by gender
* =============================================================================
* Indicator variable: gender = 1 (female), gender = 0 (male)
* tabulate with summarize gives group means in one command
tabulate gender, summarize(earnings)
// Alternatively, compare means explicitly
summarize earnings if gender == 0 // male mean
local mean_male = r(mean)
summarize earnings if gender == 1 // female mean
local mean_female = r(mean)
display "Difference (F - M): " `mean_female' - `mean_male'
* =============================================================================
* STEP 3: Regression on a single indicator — equivalent to difference in means
* =============================================================================
* The intercept = mean for d=0 (males); the gender coefficient = mean difference
* IMPORTANT: vce(robust) uses heteroscedasticity-robust standard errors (HC1)
regress earnings gender, vce(robust)
// After regress, Stata stores coefficients in e(b) and R-squared in e(r2)
display "Raw gender gap: " _b[gender]
display "R-squared: " e(r2)
* =============================================================================
* STEP 4: Add controls and interaction — how the gap changes
* =============================================================================
* Adding education as a control measures the gap AFTER accounting for education
regress earnings gender education, vce(robust)
estimates store model2 // store results for later comparison
* Adding gender×education interaction allows returns to education to differ
* c.education#i.gender creates the interaction term
regress earnings gender education genderbyeduc, vce(robust)
estimates store model3
* Full model with additional controls
regress earnings gender education genderbyeduc age hours, vce(robust)
estimates store model4
// Compare how the gender coefficient evolves across models
// esttab produces a side-by-side comparison table (requires estout package)
// ssc install estout // install once if needed
// esttab model2 model3 model4, se r2
* =============================================================================
* STEP 5: Scatter plot — visualize separate regression lines by gender
* =============================================================================
* Non-parallel lines indicate different slopes = interaction term is needed
* twoway plots scatter points with linear fits for each group
twoway (scatter earnings education if gender == 0, ///
mcolor(blue%30) msize(small)) ///
(scatter earnings education if gender == 1, ///
mcolor(red%30) msize(small)) ///
(lfit earnings education if gender == 0, ///
lcolor(blue) lwidth(medthick)) ///
(lfit earnings education if gender == 1, ///
lcolor(red) lwidth(medthick)), ///
xtitle("Years of Education") ///
ytitle("Earnings ($)") ///
title("Earnings vs Education by Gender") ///
legend(order(1 "Male" 2 "Female" 3 "Male fit" 4 "Female fit"))
* =============================================================================
* STEP 6: Sets of indicators — worker type and the dummy variable trap
* =============================================================================
* Three mutually exclusive categories: dself, dprivate, dgovt (sum to 1)
* Drop one (dprivate = base) to avoid perfect multicollinearity
regress earnings dself dgovt education age, vce(robust)
display "Base category: Private sector"
display "Self-employed vs Private: " _b[dself]
display "Government vs Private: " _b[dgovt]
display "R-squared: " e(r2)
// Change the base category to see how coefficients shift
// but predicted means stay the same
regress earnings dprivate dgovt education age, vce(robust)
display "Base category: Self-employed"
* =============================================================================
* STEP 7: ANOVA — test if earnings differ across worker types
* =============================================================================
* Regression on mutually exclusive indicators = analysis of variance (ANOVA)
* oneway performs a one-way ANOVA F-test across groups
// Create a single categorical variable for worker type
gen worker_type = 1 if dself == 1
replace worker_type = 2 if dprivate == 1
replace worker_type = 3 if dgovt == 1
label define wtype 1 "Self-employed" 2 "Private" 3 "Government"
label values worker_type wtype
// One-way ANOVA: test if earnings differ across worker types
oneway earnings worker_type
// Group means with counts
tabulate worker_type, summarize(earnings)
Paste into your Stata do-file editor
# =============================================================================
# CHAPTER 14 CHEAT SHEET: Regression with Indicator Variables
# =============================================================================
# --- Libraries ---
library(haven) # read Stata .dta files
library(fixest) # fast OLS estimation with feols()
library(dplyr) # data manipulation
library(ggplot2) # grammar of graphics
# =============================================================================
# STEP 1: Load data directly from a URL
# =============================================================================
# 872 full-time workers aged 25-65
url <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_EARNINGS_COMPLETE.DTA"
data <- read_dta(url)
cat("Dataset:", nrow(data), "observations,", ncol(data), "variables\n")
# =============================================================================
# STEP 2: Descriptive statistics — compare earnings by gender
# =============================================================================
# Indicator variable: gender = 1 (female), gender = 0 (male)
data |>
group_by(gender) |>
summarize(mean_earnings = mean(earnings), n = n())
# =============================================================================
# STEP 3: Regression on a single indicator — equivalent to difference in means
# =============================================================================
# The intercept = mean for d=0 (males); the gender coefficient = mean difference
model1 <- feols(earnings ~ gender, data = data, vcov = "HC1")
summary(model1)
cat("Raw gender gap: $", round(coef(model1)["gender"], 0), "\n")
# =============================================================================
# STEP 4: Add controls and interaction — how the gap changes
# =============================================================================
model2 <- feols(earnings ~ gender + education, data = data, vcov = "HC1")
model3 <- feols(earnings ~ gender + education + genderbyeduc,
data = data, vcov = "HC1")
model4 <- feols(earnings ~ gender + education + genderbyeduc + age + hours,
data = data, vcov = "HC1")
# Compare how the gender coefficient evolves across models
etable(model1, model2, model3, model4,
headers = c("Gender only", "+ Educ", "+ Interact", "+ Age,Hours"),
keep = "gender")
# =============================================================================
# STEP 5: Scatter plot — visualize separate regression lines by gender
# =============================================================================
ggplot(data, aes(x = education, y = earnings, color = factor(gender))) +
geom_point(alpha = 0.3, size = 1.5) +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE, linewidth = 1.2) +
scale_color_manual(values = c("0" = "blue", "1" = "red"),
labels = c("Male", "Female")) +
labs(x = "Years of Education", y = "Earnings ($)",
title = "Earnings vs Education by Gender", color = NULL) +
theme_minimal()
# =============================================================================
# STEP 6: Sets of indicators — worker type and the dummy variable trap
# =============================================================================
# Three mutually exclusive categories: dself, dprivate, dgovt (sum to 1)
# Drop one (dprivate = base) to avoid perfect multicollinearity
model_worker <- feols(earnings ~ dself + dgovt + education + age,
data = data, vcov = "HC1")
summary(model_worker)
cat("Base category: Private sector\n")
cat("Self-employed vs Private: $", round(coef(model_worker)["dself"], 0), "\n")
cat("Government vs Private: $", round(coef(model_worker)["dgovt"], 0), "\n")
# =============================================================================
# STEP 7: ANOVA — test if earnings differ across worker types
# =============================================================================
data <- data |>
mutate(worker_type = case_when(
dself == 1 ~ "Self-employed",
dprivate == 1 ~ "Private",
dgovt == 1 ~ "Government"
))
# One-way ANOVA
aov_result <- aov(earnings ~ worker_type, data = data)
summary(aov_result)
# Group means
data |>
group_by(worker_type) |>
summarize(mean = round(mean(earnings), 0), n = n())
Paste into your R console or RStudio