Code Summary
You've explored the key concepts interactively — now reproduce them in code. These self-contained blocks cover everything you practiced above. Pick your language, copy the code, and run it.
# =============================================================================
# CHAPTER 1 CHEAT SHEET: Analysis of Economics Data
# =============================================================================
# --- Libraries ---
import pandas as pd # data loading and manipulation
import matplotlib.pyplot as plt # creating plots and visualizations
import pyfixest as pf # OLS regression (Python port of R's fixest)
# !pip install pyfixest # Google Colab: uncomment and run this BEFORE the imports above
# =============================================================================
# STEP 1: Load data directly from a URL
# =============================================================================
# The Stata .dta file is read straight from GitHub, so nothing has to be
# downloaded by hand (pandas has equivalent readers for CSV, Excel, etc.)
url = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA"
data_house = pd.read_stata(url)
# .shape is a (rows, columns) tuple: rows are observations, columns are variables
n_obs, n_vars = data_house.shape
print(f"Dataset: {n_obs} observations, {n_vars} variables")
# =============================================================================
# STEP 2: Descriptive statistics — summarize before modeling
# =============================================================================
# .head() previews the first rows so you can eyeball the variables and units
print(data_house.head())
# .describe() gives count, mean, std, min, quartiles, max for each column
print(data_house[['price', 'size']].describe().round(2))
# =============================================================================
# STEP 3: Scatter plot — always visualize before fitting a regression
# =============================================================================
# One figure, one axes: size on the x-axis, price on the y-axis
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x=data_house['size'], y=data_house['price'], s=50, alpha=0.7)
# ax.set() applies the axis labels and the title in a single call
ax.set(
    xlabel='House Size (square feet)',
    ylabel='House Sale Price (dollars)',
    title='House Price vs Size',
)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()
# =============================================================================
# STEP 4: OLS regression — fit the model
# =============================================================================
# pf.feols() takes an R-style formula: 'y ~ x' regresses y on x, and the
# intercept is added automatically
fit = pf.feols('price ~ size', data=data_house)
# Pull the estimates once, then look them up by term name
coefficients = fit.coef()
intercept = coefficients['Intercept']  # predicted price when size = 0
slope = coefficients['size']           # marginal effect: $/sq ft
r_squared = fit._r2                    # proportion of variation explained
print(f"Estimated equation: price = {intercept:,.0f} + {slope:.2f} × size")
print(f"Interpretation: each additional sq ft is associated with ${slope:,.2f} higher price")
print(f"R-squared: {r_squared:.4f} ({r_squared*100:.1f}% of variation explained)")
# Full regression table (coefficients, std errors, t-stats, p-values, R²)
fit.summary()
# =============================================================================
# STEP 5: Scatter plot with fitted regression line and R²
# =============================================================================
# fit.predict() returns the fitted price for every house in the estimation data
fitted_prices = fit.predict()
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data_house['size'], data_house['price'], s=50, alpha=0.7, label='Actual prices')
ax.plot(data_house['size'], fitted_prices, color='red', linewidth=2, label='Fitted line')
# The title carries the estimated equation and R² so the figure stands alone
ax.set(
    xlabel='House Size (square feet)',
    ylabel='House Sale Price (dollars)',
    title=f'OLS Regression: price = {intercept:,.0f} + {slope:.2f} × size (R² = {r_squared:.2%})',
)
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()
# =============================================================================
# STEP 6: Compare predictors — association is NOT causation
# =============================================================================
# Running separate regressions with different x-variables shows that each tells
# a different story. High R² does not prove causation — omitted variables
# (location, condition, school district) can bias any single-variable slope.
# Maps each column name in data_house to the label printed in the table.
predictors = {
    'size': 'Size (sq ft)',
    'bedrooms': 'Bedrooms',
    'bathrooms': 'Bathrooms',
    'lotsize': 'Lot size',
    'age': 'Age (years)',
}
print(f"{'Predictor':<18} {'Slope':>10} {'R²':>8}")
print("-" * 38)
# One simple regression per predictor; the loop body must be indented or
# Python raises IndentationError.
for var, label in predictors.items():
    m = pf.feols(f'price ~ {var}', data=data_house)
    print(f"{label:<18} {m.coef()[var]:>10.2f} {m._r2:>8.4f}")
* =============================================================================
* CHAPTER 1 CHEAT SHEET: Analysis of Economics Data
* =============================================================================
* --- Setup ---
* Run these two lines first so the script behaves the same on every run.
clear all // start with a clean workspace
set more off // do not pause output for long results
* =============================================================================
* STEP 1: Load data directly from a URL
* =============================================================================
* use loads a Stata .dta file; here the path is a URL, so the file is fetched
* over the internet at run time. The "clear" option drops any data already in
* memory so the load cannot fail because of unsaved data.
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA", clear
describe // list all variables, types, and labels
display "Observations: " _N // _N is Stata's built-in observation count
* =============================================================================
* STEP 2: Descriptive statistics — summarize before modeling
* =============================================================================
* summarize gives n, mean, std, min, max for each listed variable; the
* "detail" option adds percentiles (including the median and quartiles)
summarize price size, detail
* =============================================================================
* STEP 3: Scatter plot — always visualize before fitting a regression
* =============================================================================
* twoway scatter lists the y-axis variable first and the x-axis variable second;
* marker options stay inside the plot's parentheses, titles go after the comma
twoway (scatter price size, msymbol(circle) mcolor(blue%70)), ///
    title("House Price vs Size") ///
    xtitle("House Size (square feet)") ///
    ytitle("House Sale Price (dollars)")
* =============================================================================
* STEP 4: OLS regression — fit the model
* =============================================================================
* regress fits OLS: first variable is y (price), remaining are x's (size)
* IMPORTANT: Stata automatically includes a constant (intercept)
regress price size
* After regress, the estimates stay in memory until the next estimation command:
*   _b[varname] is the coefficient on varname, _b[_cons] is the intercept,
*   and e(r2) is the R-squared of the fitted model.
display "Slope (size): " _b[size] // marginal effect: $/sq ft
display "Intercept: " _b[_cons] // predicted price when size = 0
display "R-squared: " e(r2) // proportion of variation explained
* display prints quoted text and expressions in the order they are listed
display "Interpretation: each additional sq ft is associated with $" _b[size] " higher price"
* =============================================================================
* STEP 5: Scatter plot with fitted regression line and R²
* =============================================================================
* predict generates fitted values (y-hat) from the most recent regress.
* capture drop makes this step safe to re-run: without it, predict stops with
* an error because price_hat already exists from the previous run.
capture drop price_hat
predict price_hat, xb // xb = linear prediction, stored in price_hat
* Format R² as a percentage for the plot title. e(r2) is still in memory from
* regress. Run this line together with the twoway command below: a local macro
* disappears as soon as the selection that defined it finishes running.
local r2_pct = strtrim(string(100 * e(r2), "%4.1f"))
* "sort" orders the line plot by size so the fitted line is drawn left to right
twoway (scatter price size, msymbol(circle) mcolor(blue%70)) ///
    (line price_hat size, lcolor(red) lwidth(medthick) sort), ///
    xtitle("House Size (square feet)") ///
    ytitle("House Sale Price (dollars)") ///
    title("OLS Regression: price on size (R² = `r2_pct'%)") ///
    legend(order(1 "Actual prices" 2 "Fitted line"))
* =============================================================================
* STEP 6: Compare predictors — association is NOT causation
* =============================================================================
* Running separate regressions with different x-variables shows that each tells
* a different story. High R² does not prove causation — omitted variables
* (location, condition, school district) can bias any single-variable slope.
* The two lists are parallel: the i-th label describes the i-th predictor.
local predictors size bedrooms bathrooms lotsize age
local labels `""Size" "Bedrooms" "Bathrooms" "Lot size" "Age""'
local i = 0
foreach var of local predictors {
    local ++i
    * pick the display label that matches the current predictor
    local label : word `i' of `labels'
    regress price `var'
    display "`label' — Slope: " _b[`var'] " R²: " e(r2)
}
Paste into your Stata do-file editor
# =============================================================================
# CHAPTER 1 CHEAT SHEET: Analysis of Economics Data
# =============================================================================
# --- Libraries ---
library(haven) # read Stata .dta files directly from URLs
library(fixest) # fast OLS estimation with feols()
library(dplyr) # data manipulation (mutate, filter, summarize)
library(ggplot2) # grammar of graphics for all plots
# =============================================================================
# STEP 1: Load data directly from a URL
# =============================================================================
# haven::read_dta() accepts a URL as well as a local path, so the Stata file
# is read straight from GitHub
url <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA"
data_house <- read_dta(url)
# dim() returns c(rows, columns): rows are observations, columns are variables
dims <- dim(data_house)
cat("Dataset:", dims[1], "observations,", dims[2], "variables\n")
# =============================================================================
# STEP 2: Descriptive statistics — summarize before modeling
# =============================================================================
# Keep only the two variables of interest, then let summary() report
# min, Q1, median, mean, Q3, and max for each of them
summary(data_house[c("price", "size")])
# =============================================================================
# STEP 3: Scatter plot — always visualize before fitting a regression
# =============================================================================
# Size goes on the x-axis and price on the y-axis; labels are added one layer
# at a time
ggplot(data_house) +
  geom_point(aes(x = size, y = price),
             color = "steelblue", size = 3, alpha = 0.7) +
  xlab("House Size (square feet)") +
  ylab("House Sale Price (dollars)") +
  ggtitle("House Price vs Size") +
  theme_minimal()
# =============================================================================
# STEP 4: OLS regression — fit the model
# =============================================================================
# feols() from fixest estimates OLS; formula syntax: y ~ x
# IMPORTANT: fixest automatically includes an intercept
model <- feols(price ~ size, data = data_house)
# summary() shows coefficients, SEs, t-stats, p-values, and R²
summary(model)
# Extract key results as plain scalars ([[ ]] drops the coefficient names)
slope <- coef(model)[["size"]]            # marginal effect: $/sq ft
intercept <- coef(model)[["(Intercept)"]] # predicted price when size = 0
# r2() defaults to type = "all" and returns every R² variant fixest computes;
# request the ordinary R² explicitly so r_squared is a single number
r_squared <- unname(r2(model, type = "r2")) # proportion of variation explained
cat("Estimated equation: price =", round(intercept, 0), "+",
    round(slope, 2), "x size\n")
# paste0() keeps the dollar sign attached to the number ("$73.77", not "$ 73.77")
cat("Interpretation: each additional sq ft is associated with",
    paste0("$", round(slope, 2)), "higher price\n")
cat("R-squared:", round(r_squared, 4), "\n")
# =============================================================================
# STEP 5: Scatter plot with fitted regression line and R²
# =============================================================================
# geom_smooth(method = "lm") fits price ~ size by OLS on the plotted data and
# draws that line, so it matches the feols() model estimated in Step 4.
# The title needs one number: r2() returns every R² variant unless a type is
# given, so ask for the ordinary R² and convert it to a percentage.
r2_pct <- round(100 * r2(model, type = "r2"), 1)
ggplot(data_house, aes(x = size, y = price)) +
  geom_point(color = "steelblue", size = 3, alpha = 0.7) +
  geom_smooth(method = "lm", formula = y ~ x, color = "red",
              linewidth = 1.2, se = FALSE) +
  labs(x = "House Size (square feet)", y = "House Sale Price (dollars)",
       title = paste0("OLS Regression: price on size (R² = ",
                      r2_pct, "%)")) +
  theme_minimal()
# =============================================================================
# STEP 6: Compare predictors — association is NOT causation
# =============================================================================
# Running separate regressions with different x-variables shows that each tells
# a different story. High R² does not prove causation — omitted variables
# (location, condition, school district) can bias any single-variable slope.
predictors <- c("size", "bedrooms", "bathrooms", "lotsize", "age")
# One simple regression per predictor; each call returns a one-row data frame
results <- lapply(predictors, function(var) {
  m <- feols(as.formula(paste("price ~", var)), data = data_house)
  data.frame(
    Predictor = var,
    Slope = coef(m)[[var]],
    # type = "r2" keeps this to one value; the default returns every R²
    # variant, which would recycle into many rows per predictor
    R2 = unname(r2(m, type = "r2")),
    row.names = NULL
  )
})
# Stack the one-row results into a single comparison table and print it
do.call(rbind, results)
Paste into your R console or RStudio