Code Summary
You've explored the key concepts interactively — now reproduce them in code. These self-contained blocks cover everything you practiced above. Pick your language, copy the code, and run it.
# =============================================================================
# CHAPTER 15 CHEAT SHEET: Regression with Transformed Variables
# =============================================================================
# --- Libraries ---
import numpy as np # numerical operations (log, exp, sqrt)
import pandas as pd # data loading and manipulation
import matplotlib.pyplot as plt # creating plots and visualizations
import pyfixest as pf # OLS regression with R-style formulas
# !pip install pyfixest # uncomment if running in Google Colab
# =============================================================================
# STEP 1: Load data directly from a URL
# =============================================================================
# 872 full-time workers aged 25-65 with earnings, education, age, and hours
url = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_EARNINGS_COMPLETE.DTA"
data_earnings = pd.read_stata(url)
# Create log and squared variables for transformations
data_earnings['lnage'] = np.log(data_earnings['age'])
print(f"Dataset: {data_earnings.shape[0]} observations, {data_earnings.shape[1]} variables")
print(data_earnings[['earnings', 'lnearnings', 'age', 'education']].describe().round(2))
# =============================================================================
# STEP 2: Log transformations — levels vs log-linear vs log-log
# =============================================================================
# Three specifications of the same relationship reveal different stories
fit_levels = pf.feols('earnings ~ age + education', data=data_earnings, vcov='HC1')
fit_loglin = pf.feols('lnearnings ~ age + education', data=data_earnings, vcov='HC1')
fit_loglog = pf.feols('lnearnings ~ lnage + education', data=data_earnings, vcov='HC1')
print("=== Levels: absolute dollar effects ===")
print(f" Education: +${fit_levels.coef()['education']:,.0f} per year")
print("\n=== Log-Linear: semi-elasticity (% change per unit) ===")
print(f" Education: +{100*fit_loglin.coef()['education']:.1f}% per year (Mincer return)")
print("\n=== Log-Log: elasticity (% change per % change) ===")
print(f" Age elasticity: {fit_loglog.coef()['lnage']:.4f}")
# =============================================================================
# STEP 3: Quadratic model — turning point and varying marginal effects
# =============================================================================
# A quadratic in age captures the inverted-U life-cycle earnings profile
fit_quad = pf.feols('earnings ~ age + agesq + education', data=data_earnings, vcov='HC1')
bage = fit_quad.coef()['age']
bagesq = fit_quad.coef()['agesq']
turning_point = -bage / (2 * bagesq) # age where earnings peak
print(f"Turning point: {turning_point:.1f} years")
for a in [25, 40, 55]:
me = bage + 2 * bagesq * a # ME varies with age
print(f" ME at age {a}: ${me:,.0f}/year")
# Joint F-test: H0: age and age² are jointly zero
all_vars = list(fit_quad.coef().index)
R = np.zeros((2, len(all_vars)))
R[0, all_vars.index('age')] = 1
R[1, all_vars.index('agesq')] = 1
f_test = fit_quad.wald_test(R=R)
print(f"Joint F-test p-value: {f_test.pval.values[0]:.4f}")
# =============================================================================
# STEP 4: Standardized coefficients — compare variable importance
# =============================================================================
# Raw coefficients can't be compared across different units; beta* can
fit_mix = pf.feols('earnings ~ gender + age + agesq + education + dself + dgovt + lnhours',
data=data_earnings, vcov='HC1')
sd_y = data_earnings['earnings'].std()
predictors = ['gender', 'age', 'agesq', 'education', 'dself', 'dgovt', 'lnhours']
print(f"\n{'Variable':<12} {'Raw coef':>12} {'Beta*':>8}")
print("-" * 34)
for var in sorted(predictors, key=lambda v: abs(fit_mix.coef()[v] * data_earnings[v].std() / sd_y), reverse=True):
raw = fit_mix.coef()[var]
beta_star = raw * data_earnings[var].std() / sd_y
print(f"{var:<12} {raw:>12.2f} {beta_star:>8.4f}")
# =============================================================================
# STEP 5: Interaction terms — education returns that vary with age
# =============================================================================
# Does one more year of schooling pay the same at 25 as at 55?
fit_inter = pf.feols('earnings ~ age + education + agebyeduc', data=data_earnings, vcov='HC1')
b_educ = fit_inter.coef()['education']
b_inter = fit_inter.coef()['agebyeduc']
print(f"\nME of education = {b_educ:,.0f} + {b_inter:.1f} × age")
for a in [25, 40, 55]:
me = b_educ + b_inter * a # ME depends on age
print(f" At age {a}: ${me:,.0f} per year of education")
# =============================================================================
# STEP 6: Retransformation bias — naive exp() underpredicts
# =============================================================================
# Jensen's inequality: E[exp(u)] > exp(E[u]), so naive predictions are biased
rmse_log = np.sqrt(np.mean(fit_loglin._u_hat**2))
correction = np.exp(rmse_log**2 / 2) # normal-based smearing factor
naive_pred = np.exp(fit_loglin.predict())
adjusted_pred = correction * naive_pred
print(f"\nSmearing factor: {correction:.4f}")
print(f"Actual mean: ${data_earnings['earnings'].mean():,.0f}")
print(f"Naive mean: ${naive_pred.mean():,.0f} (underpredicts)")
print(f"Corrected mean: ${adjusted_pred.mean():,.0f} (bias removed)")
# =============================================================================
# STEP 7: Comprehensive model — combine all transformation types
# =============================================================================
# A single model mixing logs, quadratics, dummies, and continuous regressors
fit_full = pf.feols('lnearnings ~ gender + age + agesq + education + dself + dgovt + lnhours',
data=data_earnings, vcov='HC1')
print(f"\nR²: {fit_full._r2:.4f}")
print(f"Education return: ~{100*fit_full.coef()['education']:.1f}% per year (semi-elasticity)")
print(f"Gender gap: ~{100*fit_full.coef()['gender']:.1f}%")
print(f"Hours elasticity: {fit_full.coef()['lnhours']:.3f} (log-log coefficient)")
# Full regression table
fit_full.summary()
* =============================================================================
* CHAPTER 15 CHEAT SHEET: Regression with Transformed Variables
* =============================================================================
* --- Setup ---
clear all // start with a clean workspace
set more off // do not pause output for long results
* =============================================================================
* STEP 1: Load data directly from a URL
* =============================================================================
* use loads a Stata .dta file; "clear" drops any data already in memory
* 872 full-time workers aged 25-65 with earnings, education, age, and hours
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_EARNINGS_COMPLETE.DTA", clear
* Create log and squared variables for transformations
gen lnage = ln(age)
describe
display "Observations: " _N
summarize earnings lnearnings age education, detail
* =============================================================================
* STEP 2: Log transformations — levels vs log-linear vs log-log
* =============================================================================
* Three specifications of the same relationship reveal different stories
// Levels model: absolute dollar effects
regress earnings age education, vce(robust)
display "Education: +$" _b[education] " per year"
// Log-linear model: semi-elasticity (% change per unit)
regress lnearnings age education, vce(robust)
display "Education: +" 100*_b[education] "% per year (Mincer return)"
// Log-log model: elasticity (% change per % change)
regress lnearnings lnage education, vce(robust)
display "Age elasticity: " _b[lnage]
* =============================================================================
* STEP 3: Quadratic model — turning point and varying marginal effects
* =============================================================================
* A quadratic in age captures the inverted-U life-cycle earnings profile
regress earnings age agesq education, vce(robust)
// Compute turning point: age where earnings peak
display "Turning point: " -_b[age] / (2 * _b[agesq]) " years"
// Marginal effect varies with age: ME = b_age + 2 * b_agesq * age
display "ME at age 25: $" _b[age] + 2 * _b[agesq] * 25 " /year"
display "ME at age 40: $" _b[age] + 2 * _b[agesq] * 40 " /year"
display "ME at age 55: $" _b[age] + 2 * _b[agesq] * 55 " /year"
// Joint F-test: H0: age and age² are jointly zero
test age agesq
* =============================================================================
* STEP 4: Standardized coefficients — compare variable importance
* =============================================================================
* Raw coefficients can't be compared across different units; beta* can
* Stata's "beta" option on regress reports standardized coefficients directly
// First estimate with robust SEs (for inference)
regress earnings gender age agesq education dself dgovt lnhours, vce(robust)
// Then re-estimate with "beta" to display standardized coefficients
// (beta option reports standardized coefficients alongside raw coefficients)
regress earnings gender age agesq education dself dgovt lnhours, beta
// The "Beta" column shows standardized coefficients — a one-SD change in x
// produces a Beta*-SD change in y, making coefficients comparable across units
* =============================================================================
* STEP 5: Interaction terms — education returns that vary with age
* =============================================================================
* Does one more year of schooling pay the same at 25 as at 55?
regress earnings age education agebyeduc, vce(robust)
// ME of education = b_education + b_agebyeduc * age
display "ME of education = " _b[education] " + " _b[agebyeduc] " x age"
display "At age 25: $" _b[education] + _b[agebyeduc] * 25 " per year of education"
display "At age 40: $" _b[education] + _b[agebyeduc] * 40 " per year of education"
display "At age 55: $" _b[education] + _b[agebyeduc] * 55 " per year of education"
* =============================================================================
* STEP 6: Retransformation bias — naive exp() underpredicts
* =============================================================================
* Jensen's inequality: E[exp(u)] > exp(E[u]), so naive predictions are biased
regress lnearnings age education, vce(robust)
// Compute naive prediction: exp(fitted values)
predict lnhat, xb // fitted values in log scale
gen naive_pred = exp(lnhat) // naive back-transformation
// Normal-based smearing factor: exp(sigma^2 / 2)
// e(rmse) is the root MSE from the last regression
display "RMSE (log scale): " e(rmse)
scalar smearing = exp(e(rmse)^2 / 2)
display "Smearing factor: " smearing
// Corrected prediction: multiply naive by smearing factor
gen corrected_pred = naive_pred * smearing
// Compare actual vs naive vs corrected means
// IMPORTANT: summarize stores r(mean) for the LAST variable listed,
// so we run each separately to display the correct mean
summarize earnings
display "Actual mean: $" r(mean)
summarize naive_pred
display "Naive mean: $" r(mean) " (underpredicts)"
summarize corrected_pred
display "Corrected mean: $" r(mean) " (bias removed)"
* =============================================================================
* STEP 7: Comprehensive model — combine all transformation types
* =============================================================================
* A single model mixing logs, quadratics, dummies, and continuous regressors
regress lnearnings gender age agesq education dself dgovt lnhours, vce(robust)
display "R-squared: " e(r2)
display "Education return: ~" 100*_b[education] "% per year (semi-elasticity)"
display "Gender gap: ~" 100*_b[gender] "%"
display "Hours elasticity: " _b[lnhours] " (log-log coefficient)"
// Full regression table is displayed automatically by regress
Paste into your Stata do-file editor
# =============================================================================
# CHAPTER 15 CHEAT SHEET: Regression with Transformed Variables
# =============================================================================
# --- Libraries ---
library(haven) # read Stata .dta files
library(fixest) # fast OLS estimation with feols()
library(dplyr) # data manipulation
library(ggplot2) # grammar of graphics
# =============================================================================
# STEP 1: Load data directly from a URL
# =============================================================================
url <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_EARNINGS_COMPLETE.DTA"
data_earnings <- read_dta(url) |>
mutate(lnage = log(age))
cat("Dataset:", nrow(data_earnings), "observations\n")
# =============================================================================
# STEP 2: Log transformations — levels vs log-linear vs log-log
# =============================================================================
m_levels <- feols(earnings ~ age + education, data = data_earnings, vcov = "HC1")
m_loglin <- feols(lnearnings ~ age + education, data = data_earnings, vcov = "HC1")
m_loglog <- feols(lnearnings ~ lnage + education, data = data_earnings, vcov = "HC1")
cat("Levels: Education +$", round(coef(m_levels)["education"], 0), "/yr\n")
cat("Log-linear: Education +", round(100*coef(m_loglin)["education"], 1), "%/yr\n")
cat("Log-log: Age elasticity =", round(coef(m_loglog)["lnage"], 4), "\n")
# =============================================================================
# STEP 3: Quadratic model — turning point and varying marginal effects
# =============================================================================
m_quad <- feols(earnings ~ age + agesq + education, data = data_earnings, vcov = "HC1")
b_age <- coef(m_quad)["age"]
b_agesq <- coef(m_quad)["agesq"]
turning_point <- -b_age / (2 * b_agesq)
cat("Turning point:", round(turning_point, 1), "years\n")
for (a in c(25, 40, 55)) {
me <- b_age + 2 * b_agesq * a
cat(" ME at age", a, ": $", round(me, 0), "/year\n")
}
# Joint F-test: H0: age and age² are jointly zero
wald(m_quad, c("age", "agesq"))
# =============================================================================
# STEP 4: Standardized coefficients — compare variable importance
# =============================================================================
# Scale each variable to mean 0, SD 1, then re-estimate
vars_to_std <- c("earnings","gender","age","agesq","education","dself","dgovt","lnhours")
data_std <- data_earnings |>
mutate(across(all_of(vars_to_std), ~ as.numeric(scale(.))))
m_std <- feols(earnings ~ gender + age + agesq + education + dself + dgovt + lnhours,
data = data_std)
cat("\nStandardized coefficients (beta*):\n")
print(round(coef(m_std)[-1], 4))
# =============================================================================
# STEP 5: Interaction terms — education returns that vary with age
# =============================================================================
m_inter <- feols(earnings ~ age + education + agebyeduc,
data = data_earnings, vcov = "HC1")
b_educ <- coef(m_inter)["education"]
b_inter <- coef(m_inter)["agebyeduc"]
cat("\nME of education =", round(b_educ, 0), "+", round(b_inter, 1), "× age\n")
for (a in c(25, 40, 55)) {
me <- b_educ + b_inter * a
cat(" At age", a, ": $", round(me, 0), "per year of education\n")
}
# =============================================================================
# STEP 6: Retransformation bias — naive exp() underpredicts
# =============================================================================
# Jensen's inequality: E[exp(u)] > exp(E[u])
rmse_log <- sigma(lm(lnearnings ~ age + education, data = data_earnings))
correction <- exp(rmse_log^2 / 2) # normal-based smearing factor
naive_pred <- exp(fitted(m_loglin))
adjusted_pred <- correction * naive_pred
cat("\nSmearing factor:", round(correction, 4), "\n")
cat("Actual mean: $", round(mean(data_earnings$earnings), 0), "\n")
cat("Naive mean: $", round(mean(naive_pred), 0), " (underpredicts)\n")
cat("Corrected mean: $", round(mean(adjusted_pred), 0), " (bias removed)\n")
# =============================================================================
# STEP 7: Comprehensive model — combine all transformation types
# =============================================================================
m_full <- feols(lnearnings ~ gender + age + agesq + education + dself + dgovt + lnhours,
data = data_earnings, vcov = "HC1")
summary(m_full)
cat("R²:", round(r2(m_full), 4), "\n")
cat("Education return: ~", round(100*coef(m_full)["education"], 1), "%/yr\n")
cat("Gender gap: ~", round(100*coef(m_full)["gender"], 1), "%\n")
cat("Hours elasticity:", round(coef(m_full)["lnhours"], 3), "\n")
Paste into your R console or RStudio