Code Summary
You've explored the key concepts interactively — now reproduce them in code. These self-contained blocks cover everything you practiced above. Pick your language, copy the code, and run it.
# =============================================================================
# CHAPTER 9 CHEAT SHEET: Models with Natural Logarithms
# =============================================================================
# --- Libraries ---
import numpy as np # logarithms and exponentials
import pandas as pd # data loading and manipulation
import matplotlib.pyplot as plt # creating plots and visualizations
import pyfixest as pf # fast OLS estimation with feols()
# !pip install pyfixest # uncomment if running in Google Colab
# =============================================================================
# STEP 1: Load the earnings-education dataset
# =============================================================================
# The Stata .dta file is fetched straight from its URL and parsed by pandas.
url_earn = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_EARNINGS.DTA"
data_earnings = pd.read_stata(url_earn)
# .shape is (rows, columns): rows are observations, columns are variables
n_obs, n_vars = data_earnings.shape
print(f"Dataset: {n_obs} observations, {n_vars} variables")
# =============================================================================
# STEP 2: Logarithmic approximation — why economists use logs
# =============================================================================
# For a small change, the difference in logs is close to the proportionate
# change: Δln(x) ≈ Δx/x. Times 100, both read as a percentage change.
x0 = 40
x1 = 40.4
exact = (x1 - x0) / x0              # true proportionate change
approx = np.log(x1) - np.log(x0)    # log-difference approximation
print(f"Change from {x0} to {x1}:")
for label, change in (
    (" Exact proportionate change:", exact),
    (" Log approximation Δln(x):", approx),
):
    # Same layout for both rows: raw value, then the value as a percentage
    print(f"{label} {change:.6f} ({change*100:.2f}%)")
# =============================================================================
# STEP 3: Descriptive statistics and log transformations
# =============================================================================
# Add a natural-log column next to each level variable; the regressions in
# the next step use both the levels and the logs.
for level_col, log_col in (('earnings', 'lnearn'), ('education', 'lneduc')):
    data_earnings[log_col] = np.log(data_earnings[level_col])
summary_cols = ['earnings', 'lnearn', 'education', 'lneduc']
print(data_earnings[summary_cols].describe().round(2))
# =============================================================================
# STEP 4: Estimate all four model specifications
# =============================================================================
# Same two variables, four functional forms. How to read the slope β₁:
#   Linear      earnings ~ education : β₁ = dollar change per year of education
#   Log-linear  lnearn   ~ education : 100β₁ ≈ % change in y per unit x (semi-elasticity)
#   Log-log     lnearn   ~ lneduc    : β₁ ≈ % change in y per % change in x (elasticity)
#   Linear-log  earnings ~ lneduc    : β₁/100 ≈ dollar change per % change in x
fit_linear = pf.feols('earnings ~ education', data=data_earnings)
fit_loglin = pf.feols('lnearn ~ education', data=data_earnings)
fit_loglog = pf.feols('lnearn ~ lneduc', data=data_earnings)
fit_linlog = pf.feols('earnings ~ lneduc', data=data_earnings)

# Headline numbers: the semi-elasticity and the elasticity
semi_elast = fit_loglin.coef()['education']
elasticity = fit_loglog.coef()['lneduc']
print(f"Log-linear: each year of education → {100 * semi_elast:.1f}% higher earnings")
print(f"Log-log elasticity: {elasticity:.3f}")

# Full regression table for the log-linear model
fit_loglin.summary()
# =============================================================================
# STEP 5: Compare all four models side by side
# =============================================================================
# One row per specification: raw slope, R², and the slope restated in the
# units that specification implies.
# Each entry: (specification, fitted model, slope variable, interpretation template).
# Specifications use generic y/x notation so every label fits the 16-wide column.
models = {
    'Linear': ('y ~ x', fit_linear, 'education', '${:,.0f} per year'),
    'Log-linear': ('ln(y) ~ x', fit_loglin, 'education', '{:.1f}% per year'),
    'Log-log': ('ln(y) ~ ln(x)', fit_loglog, 'lneduc', '{:.2f}% per 1%'),
    'Linear-log': ('y ~ ln(x)', fit_linlog, 'lneduc', '${:,.0f} per 1%'),
}
# Factor applied to the raw slope before it is dropped into the template:
#   Log-linear: 100·β₁ is the % change in y per unit of x
#   Linear-log: β₁/100 is the dollar change per 1% change in x
#   Linear and Log-log slopes are reported as estimated
slope_scale = {'Linear': 1, 'Log-linear': 100, 'Log-log': 1, 'Linear-log': 1 / 100}

print(f"{'Model':<12} {'Specification':<16} {'Slope':>10} {'R²':>8} Interpretation")
print("-" * 75)
for name, (spec, f, var, fmt_str) in models.items():
    slope = f.coef()[var]
    interp = fmt_str.format(slope * slope_scale[name])
    print(f"{name:<12} {spec:<16} {slope:>10.4f} {f._r2:>8.3f} {interp}")
# =============================================================================
# STEP 6: Scatter plot with the log-linear fitted line
# =============================================================================
# ln(earnings) against years of education, with the log-linear fit overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
educ_years = data_earnings['education']
ax.scatter(educ_years, data_earnings['lnearn'], s=50, alpha=0.7)
ax.plot(educ_years, fit_loglin.predict(),
        color='red', linewidth=2, label='Fitted line')
# Axis labels and title set in one call; the title reports slope and R²
ax.set(
    xlabel='Education (years)',
    ylabel='ln(Earnings)',
    title=f'Log-Linear Model: semi-elasticity = {semi_elast:.4f} (R² = {fit_loglin._r2:.3f})',
)
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# =============================================================================
# STEP 7: Exponential growth — S&P 500 and the Rule of 72
# =============================================================================
# A series growing at a constant rate r is exponential in levels but linear
# in logs: ln(x_t) ≈ ln(x₀) + r × t, so the slope on year is the growth rate.
url_sp500 = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_SP500INDEX.DTA"
data_sp500 = pd.read_stata(url_sp500)
fit_sp500 = pf.feols('lnsp500 ~ year', data=data_sp500)
growth_rate = fit_sp500.coef()['year']
growth_pct = 100 * growth_rate  # slope expressed in percent per year
print(f"S&P 500 estimated growth rate: {growth_pct:.2f}% per year")
print(f"Rule of 72: doubles every {72 / growth_pct:.1f} years")
print(f"R-squared: {fit_sp500._r2:.4f}")

# Two panels: the index in levels (left) and in logs with the fit (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_levels, ax_logs = axes
years = data_sp500['year']

ax_levels.plot(years, data_sp500['sp500'], linewidth=2)
ax_levels.set(xlabel='Year', ylabel='S&P 500 Index',
              title='Exponential Growth in Levels')

ax_logs.plot(years, data_sp500['lnsp500'], linewidth=2)
ax_logs.plot(years, fit_sp500.predict(),
             color='red', linewidth=2, linestyle='--', label='Fitted (linear)')
ax_logs.set(xlabel='Year', ylabel='ln(S&P 500 Index)',
            title=f'Linear in Logs: growth = {growth_pct:.2f}%/year')
ax_logs.legend()

for panel in axes:
    panel.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
* =============================================================================
* CHAPTER 9 CHEAT SHEET: Models with Natural Logarithms
* =============================================================================
* --- Setup ---
clear all       // begin from an empty workspace
set more off    // let long output scroll without pausing
* =============================================================================
* STEP 1: Load the earnings-education dataset
* =============================================================================
* Keep the URL in a local macro, then load it; the "clear" option on use
* drops whatever data are already in memory
local earn_url "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_EARNINGS.DTA"
use "`earn_url'", clear
describe                        // variable names, storage types, and labels
display "Observations: " _N     // _N holds the number of observations
* =============================================================================
* STEP 2: Logarithmic approximation — why economists use logs
* =============================================================================
* For a small change, the difference in logs is close to the proportionate
* change: Δln(x) ≈ Δx/x. Times 100, both read as a percentage change.
local x0 = 40
local x1 = 40.4
local exact = (`x1' - `x0') / `x0'
local approx = ln(`x1') - ln(`x0')
* Percentage versions, computed once and reused in the display lines
local exact_pct = 100 * `exact'
local approx_pct = 100 * `approx'
display "Change from `x0' to `x1':"
display " Exact proportionate change: " %9.6f `exact' " (" %5.2f `exact_pct' "%)"
display " Log approximation Δln(x): " %9.6f `approx' " (" %5.2f `approx_pct' "%)"
* =============================================================================
* STEP 3: Descriptive statistics and log transformations
* =============================================================================
* Build the natural-log versions of both variables with generate and ln();
* the log-linear, log-log, and linear-log models below use them
generate lnearn = ln(earnings)
generate lneduc = ln(education)
* The detail option adds percentiles, skewness, and kurtosis to the summary
summarize earnings lnearn education lneduc, detail
* =============================================================================
* STEP 4: Estimate all four model specifications
* =============================================================================
* Each model answers a different economic question about earnings and education.
* Every regression is saved with "estimates store" so Step 5 can tabulate them.
* _b[] always refers to the most recent regression, so each coefficient is
* read immediately after its own regress command.
* Model 1: Linear — Δy = β₁Δx (dollar change per year of education)
regress earnings education
estimates store m_linear
* Model 2: Log-linear — 100β₁ ≈ % change in y per unit x (semi-elasticity)
regress lnearn education
estimates store m_loglin
// Capture the slope now, before the next regress replaces _b[education]
local semi_elast = _b[education]
display "Log-linear: each year of education → " %4.1f 100*`semi_elast' "% higher earnings"
* Model 3: Log-log — β₁ ≈ % change in y per % change in x (elasticity)
regress lnearn lneduc
estimates store m_loglog
display "Log-log elasticity: " %6.3f _b[lneduc]
* Model 4: Linear-log — β₁/100 ≈ dollar change per % change in x
regress earnings lneduc
estimates store m_linlog
* =============================================================================
* STEP 5: Compare all four models side by side
* =============================================================================
* Tabulate the four stored models in one table: coefficients to four
* decimals, with R² and the number of observations underneath
local stored_models m_linear m_loglin m_loglog m_linlog
estimates table `stored_models', b(%9.4f) stats(r2 N)
* Optional: esttab gives publication-style output (needs: ssc install estout)
// esttab m_linear m_loglin m_loglog m_linlog, ///
// r2 se label mtitle("Linear" "Log-linear" "Log-log" "Linear-log")
* =============================================================================
* STEP 6: Scatter plot with the log-linear fitted line
* =============================================================================
* The linear-log model was the last one estimated in Step 4, so the
* log-linear model is fitted again before asking predict for fitted values
regress lnearn education
predict yhat_loglin, xb     // xb = linear prediction (predict's default)
twoway (scatter lnearn education, mcolor(gs10) msize(small)) ///
       (line yhat_loglin education, sort lcolor(red) lwidth(medthick)), ///
       title("Log-Linear Model: ln(Earnings) on Education") ///
       xtitle("Education (years)") ///
       ytitle("ln(Earnings)") ///
       legend(order(1 "Observed" 2 "Fitted line"))
* =============================================================================
* STEP 7: Exponential growth — S&P 500 and the Rule of 72
* =============================================================================
* A series growing at a constant rate r is exponential in levels but linear
* in logs: ln(x_t) ≈ ln(x₀) + r × t, so the slope on year is the growth rate
local sp500_url "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_SP500INDEX.DTA"
use "`sp500_url'", clear
regress lnsp500 year
local growth = _b[year]
display "S&P 500 estimated growth rate: " %5.2f 100*`growth' "% per year"
display "Rule of 72: doubles every " %3.1f 72/(100*`growth') " years"
display "R-squared: " %6.4f e(r2)
* Fitted values of ln(S&P 500) for the log-scale panel
predict yhat_sp500, xb
* Panel 1: the index in levels
twoway (line sp500 year, lwidth(medthick)), ///
       title("Exponential Growth in Levels") ///
       xtitle("Year") ytitle("S&P 500 Index") ///
       name(levels, replace)
* Panel 2: the index in logs with the fitted straight line
twoway (line lnsp500 year, lcolor(gs10) lwidth(medthick)) ///
       (line yhat_sp500 year, lcolor(red) lpattern(dash) lwidth(medthick)), ///
       title("Linear in Logs: Constant Growth Rate") ///
       xtitle("Year") ytitle("ln(S&P 500 Index)") ///
       legend(order(1 "Observed" 2 "Fitted (linear)")) ///
       name(logs, replace)
* Put the two named graphs side by side
graph combine levels logs, title("S&P 500: Levels vs. Logs")
Paste into your Stata do-file editor
# =============================================================================
# CHAPTER 9 CHEAT SHEET: Models with Natural Logarithms
# =============================================================================
# --- Libraries ---
library(haven) # read Stata .dta files
library(fixest) # fast OLS estimation with feols()
library(dplyr) # data manipulation
library(ggplot2) # grammar of graphics
# =============================================================================
# STEP 1: Load the earnings-education dataset
# =============================================================================
# The Stata .dta file is read straight from its URL.
url_earn <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_EARNINGS.DTA"
data_earnings <- read_dta(url_earn)
# dim() is c(rows, columns): rows are observations, columns are variables
earn_dims <- dim(data_earnings)
cat("Dataset:", earn_dims[1], "observations,", earn_dims[2], "variables\n")
# =============================================================================
# STEP 2: Logarithmic approximation — why economists use logs
# =============================================================================
# For a small change, the difference in logs is close to the proportionate
# change: Δln(x) ≈ Δx/x.
x0 <- 40
x1 <- 40.4
x_path <- c(x0, x1)
exact <- diff(x_path) / x0      # (x1 - x0) / x0
approx <- diff(log(x_path))     # log(x1) - log(x0)
cat("Change from", x0, "to", x1, ":\n")
cat(" Exact proportionate change:", round(exact, 6), "\n")
cat(" Log approximation \u0394ln(x): ", round(approx, 6), "\n")
# =============================================================================
# STEP 3: Descriptive statistics and log transformations
# =============================================================================
# Append natural-log versions of earnings and education as new columns,
# then summarise the levels and logs together.
data_earnings$lnearn <- log(data_earnings$earnings)
data_earnings$lneduc <- log(data_earnings$education)
summary_vars <- c("earnings", "lnearn", "education", "lneduc")
summary(data_earnings[, summary_vars])
# =============================================================================
# STEP 4: Estimate all four model specifications
# =============================================================================
# Same two variables, four functional forms. How to read the slope β₁:
#   Linear      earnings ~ education : β₁ = dollar change per year of education
#   Log-linear  lnearn   ~ education : 100β₁ ≈ % change in y per unit x (semi-elasticity)
#   Log-log     lnearn   ~ lneduc    : β₁ ≈ % change in y per % change in x (elasticity)
#   Linear-log  earnings ~ lneduc    : β₁/100 ≈ dollar change per % change in x
m_linear <- feols(earnings ~ education, data = data_earnings)
m_loglin <- feols(lnearn ~ education, data = data_earnings)
m_loglog <- feols(lnearn ~ lneduc, data = data_earnings)
m_linlog <- feols(earnings ~ lneduc, data = data_earnings)

# Headline numbers: the semi-elasticity (in percent) and the elasticity
semi_elast_pct <- round(100 * coef(m_loglin)["education"], 1)
elasticity <- round(coef(m_loglog)["lneduc"], 3)
cat("Log-linear: each year of education \u2192",
    semi_elast_pct, "% higher earnings\n")
cat("Log-log elasticity:", elasticity, "\n")

# fixest's etable() lays the four fits out as columns of one table
etable(m_linear, m_loglin, m_loglog, m_linlog,
       headers = c("Linear", "Log-linear", "Log-log", "Linear-log"))
# =============================================================================
# STEP 5: Compare all four models side by side
# =============================================================================
# Print one "name  R² = value" line per fitted model.
cat("\nModel comparison:\n")
models <- list(Linear = m_linear, `Log-lin` = m_loglin,
               `Log-log` = m_loglog, `Lin-log` = m_linlog)
# fixest::r2() defaults to type = "all" and returns a whole vector of R²
# variants; sprintf() would then emit one line per variant. Asking for
# type = "r2" yields the single ordinary R² per model.
model_r2 <- vapply(models, \(m) unname(r2(m, type = "r2")), numeric(1))
# sep = "" because every formatted line already ends with its own newline
cat(sprintf("%-10s R\u00b2 = %.3f\n", names(model_r2), model_r2), sep = "")
# =============================================================================
# STEP 6: Scatter plot with the log-linear fitted line
# =============================================================================
# ln(earnings) against years of education; geom_smooth(method = "lm") draws
# the least-squares line of y on x. The title reports the estimated slope.
loglin_title <- paste0("Log-Linear Model: semi-elasticity = ",
                       round(coef(m_loglin)["education"], 4))
ggplot(data = data_earnings, mapping = aes(x = education, y = lnearn)) +
  geom_point(size = 2, alpha = 0.6, color = "gray50") +
  geom_smooth(formula = y ~ x, method = "lm", se = FALSE,
              color = "red", linewidth = 1.2) +
  labs(title = loglin_title,
       x = "Education (years)", y = "ln(Earnings)") +
  theme_minimal()
# =============================================================================
# STEP 7: Exponential growth — S&P 500 and the Rule of 72
# =============================================================================
# A series growing at a constant rate r is exponential in levels but linear
# in logs, so the slope of ln(index) on year estimates the growth rate.
url_sp500 <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_SP500INDEX.DTA"
data_sp500 <- read_dta(url_sp500)
model_sp500 <- feols(lnsp500 ~ year, data = data_sp500)
growth_rate <- coef(model_sp500)["year"]
growth_pct <- 100 * growth_rate  # slope expressed in percent per year
cat("\nS&P 500 estimated growth rate:", round(growth_pct, 2), "% per year\n")
cat("Rule of 72: doubles every", round(72 / growth_pct, 1), "years\n")

# patchwork supplies the `+` that places two ggplots side by side
library(patchwork)

# Left panel: the index in levels
p1 <- ggplot(data = data_sp500, mapping = aes(x = year, y = sp500)) +
  geom_line(linewidth = 1) +
  labs(title = "Exponential Growth in Levels",
       x = "Year", y = "S&P 500 Index") +
  theme_minimal()

# Right panel: the index in logs with a dashed least-squares line
logs_title <- paste0("Linear in Logs: growth = ",
                     round(growth_pct, 2), "%/year")
p2 <- ggplot(data = data_sp500, mapping = aes(x = year, y = lnsp500)) +
  geom_line(color = "gray50", linewidth = 1) +
  geom_smooth(formula = y ~ x, method = "lm", se = FALSE,
              color = "red", linewidth = 1.2, linetype = "dashed") +
  labs(title = logs_title,
       x = "Year", y = "ln(S&P 500 Index)") +
  theme_minimal()

p1 + p2
Paste into your R console or RStudio