Code Summary
You've explored the key concepts interactively — now reproduce them in code. These self-contained blocks cover everything you practiced above. Pick your language, copy the code, and run it.
# =============================================================================
# CHAPTER 5 CHEAT SHEET: Bivariate Data Summary
# =============================================================================
# --- Libraries ---
import pandas as pd # data loading and manipulation
import matplotlib.pyplot as plt # creating plots and visualizations
import pyfixest as pf # OLS regression with R-style formulas
# !pip install pyfixest # if pyfixest is not installed (e.g. in Google Colab), uncomment and run this BEFORE the imports above
from statsmodels.nonparametric.smoothers_lowess import lowess # LOWESS nonparametric smoothing
# =============================================================================
# STEP 1: Load data directly from a URL
# =============================================================================
# pd.read_stata() pulls the Stata .dta file straight from the URL;
# the dataset has 29 house sales
url = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA"
data_house = pd.read_stata(url)
n_obs, n_vars = data_house.shape  # (rows, columns)
print(f"Dataset: {n_obs} observations, {n_vars} variables")
# =============================================================================
# STEP 2: Descriptive statistics — summarize each variable before comparing
# =============================================================================
# .describe() reports count, mean, std, min, quartiles and max for each
# column; round to two decimals for display
summary_stats = data_house[['price', 'size']].describe()
print(summary_stats.round(2))
# =============================================================================
# STEP 3: Scatter plot — visualize the relationship before quantifying it
# =============================================================================
# Size goes on the x-axis and price on the y-axis so the relationship can be
# inspected visually before any statistic is computed
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x=data_house['size'], y=data_house['price'], s=60, alpha=0.7)
ax.set(
    xlabel='House Size (square feet)',
    ylabel='House Sale Price (dollars)',
    title='House Price vs Size',
)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()
# =============================================================================
# STEP 4: Correlation coefficient — one number for direction and strength
# =============================================================================
# .corr() builds the Pearson correlation matrix; r is unit-free and symmetric,
# so the (price, size) entry equals the (size, price) entry
corr_matrix = data_house[['price', 'size']].corr()
r = corr_matrix.loc['price', 'size']

# Rule-of-thumb bands on |r|: above 0.7 strong, above 0.4 moderate, else weak
if abs(r) > 0.7:
    strength = 'Strong'
elif abs(r) > 0.4:
    strength = 'Moderate'
else:
    strength = 'Weak'

shared_var = r**2  # squared correlation
print(f"Correlation coefficient: r = {r:.4f}")
print(f"Strength: {strength}")
print(f"r² = {shared_var:.4f} ({shared_var*100:.1f}% of variation shared)")
# =============================================================================
# STEP 5: OLS regression — fit the best-fitting line
# =============================================================================
# The formula 'y ~ x' regresses y on x; the intercept is included automatically.
# pf.feols() returns the estimated model in a single call (no separate .fit()).
fit = pf.feols('price ~ size', data=data_house)

# Pull the estimates out once and reuse them below and in later steps
coefs = fit.coef()
slope = coefs['size']           # marginal effect: $/sq ft
intercept = coefs['Intercept']  # predicted price when size = 0
r_squared = fit._r2             # proportion of variation explained

print(f"Estimated equation: price = {intercept:,.0f} + {slope:.2f} × size")
print(f"Interpretation: each additional sq ft is associated with ${slope:,.2f} higher price")
print(f"R-squared: {r_squared:.4f} ({r_squared*100:.1f}% of variation explained)")

# Full regression table (coefficients, std errors, t-stats, p-values, R²)
fit.summary()
# =============================================================================
# STEP 6: Scatter plot with fitted line and R² — visualize model fit
# =============================================================================
# fit.predict() returns the predicted y-values from the estimated equation;
# compute them and the title text up front, then draw
fitted_prices = fit.predict()
plot_title = f'OLS: price = {intercept:,.0f} + {slope:.2f} × size (R² = {r_squared:.2%})'

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data_house['size'], data_house['price'], s=60, alpha=0.7, label='Actual prices')
ax.plot(data_house['size'], fitted_prices, color='red', linewidth=2, label='Fitted line')
ax.set(
    xlabel='House Size (square feet)',
    ylabel='House Sale Price (dollars)',
    title=plot_title,
)
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()
# =============================================================================
# STEP 7: Reverse regression — association is NOT causation
# =============================================================================
# If regression = causation, the reverse slope would be 1/slope. It is not:
# swapping the roles of y and x gives a different line, not the inverse one.
reverse_fit = pf.feols('size ~ price', data=data_house)
reverse_slope = reverse_fit.coef()['price']  # sq ft per dollar
inverse_slope = 1 / slope                    # what an exact inverse would give

print(f"price ~ size slope: {slope:.4f}")
print(f"size ~ price slope: {reverse_slope:.6f}")
print(f"1 / original slope: {inverse_slope:.6f}")
print(f"Reciprocals match? {inverse_slope:.6f} ≠ {reverse_slope:.6f}")
print("→ Regression is asymmetric: association, not causation!")
# =============================================================================
# STEP 8: Nonparametric regression — check the linearity assumption
# =============================================================================
# LOWESS fits weighted local regressions; if the curve tracks the OLS line,
# the linear assumption is validated for this dataset.
# lowess() takes y first, then x; the result is indexed below as
# column 0 = x values, column 1 = smoothed y values.
lowess_result = lowess(data_house['price'], data_house['size'], frac=0.6)
lowess_x, lowess_y = lowess_result[:, 0], lowess_result[:, 1]
ols_fitted = fit.predict()

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data_house['size'], data_house['price'], s=60, alpha=0.6, label='Actual data')
ax.plot(
    data_house['size'], ols_fitted,
    color='red', linewidth=2, label='OLS (parametric)',
)
ax.plot(
    lowess_x, lowess_y,
    color='green', linewidth=2, linestyle='--', label='LOWESS (nonparametric)',
)
ax.set(
    xlabel='House Size (square feet)',
    ylabel='House Sale Price (dollars)',
    title='Parametric vs Nonparametric Regression',
)
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()
* =============================================================================
* CHAPTER 5 CHEAT SHEET: Bivariate Data Summary
* =============================================================================
* --- Setup ---
* Start with a clean workspace
clear all
* Do not pause output for long results
set more off
* =============================================================================
* STEP 1: Load data directly from a URL
* =============================================================================
* use reads a Stata .dta file; the "clear" option drops any data already in memory
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA", clear
* List all variables with their types and labels
describe
* _N is Stata's built-in observation count
display "Observations: " _N
* =============================================================================
* STEP 2: Descriptive statistics — summarize each variable before comparing
* =============================================================================
* summarize reports n, mean, std, min, max; the "detail" option adds the
* median and other percentiles
summarize price size, detail
* =============================================================================
* STEP 3: Scatter plot — visualize the relationship before quantifying it
* =============================================================================
* scatter (shorthand for twoway scatter) draws a scatterplot of y against x
scatter price size, ///
    title("House Price vs Size") ///
    xtitle("House Size (square feet)") ///
    ytitle("House Sale Price (dollars)")
* =============================================================================
* STEP 4: Correlation coefficient — one number for direction and strength
* =============================================================================
* Pearson correlation matrix of price and size; r is unit-free and symmetric
correlate price size
* Pairwise correlations with significance levels; star(0.05) marks those
* significant at the 5% level
pwcorr price size, star(0.05) sig
* =============================================================================
* STEP 5: OLS regression — fit the best-fitting line
* =============================================================================
* OLS of price on size: regress y x (the intercept is included automatically).
* The coefficient on size is the marginal effect in dollars per sq ft.
regress price size
* regress leaves its results in memory, where they can be referenced:
*   _b[size]   slope: marginal effect, $/sq ft
*   _b[_cons]  intercept: predicted price when size = 0
*   e(r2)      R-squared: proportion of variation explained
display "Slope (b): " _b[size]
display "Intercept (a): " _b[_cons]
display "R-squared: " e(r2)
* =============================================================================
* STEP 6: Scatter plot with fitted line and R² — visualize model fit
* =============================================================================
* predict creates a new variable with the predicted y-values from the last
* regress. It refuses to overwrite an existing variable, so drop any price_hat
* left over from an earlier run of this step; capture silences the error that
* drop raises when the variable does not exist yet.
capture drop price_hat
predict price_hat
* Overlay the fitted line on the scatter; sort orders the line by size so it
* is drawn left to right
twoway (scatter price size, mcolor(%60)) ///
    (line price_hat size, lcolor(red) lwidth(medthick) sort), ///
    xtitle("House Size (square feet)") ///
    ytitle("House Sale Price (dollars)") ///
    title("OLS Regression: Fitted Line and R²") ///
    legend(order(1 "Actual prices" 2 "Fitted line"))
* =============================================================================
* STEP 7: Reverse regression — association is NOT causation
* =============================================================================
* If regression = causation, the reverse slope would be 1/slope. It is not.
* Save the slope from the price-on-size regression before the next regress
* replaces the stored estimates
scalar orig_slope = _b[size]
* Reverse the roles of y and x
regress size price
* scalar() makes explicit that orig_slope is the stored scalar
display "price ~ size slope: " scalar(orig_slope)
display "size ~ price slope: " _b[price]
display "1 / original slope: " 1 / scalar(orig_slope)
display "Reciprocals match? No — regression is asymmetric!"
* =============================================================================
* STEP 8: Nonparametric regression — check the linearity assumption
* =============================================================================
* lowess fits weighted local regressions; if the curve tracks the OLS line,
* the linear assumption is validated for this dataset
* Re-estimate OLS so the fitted values below come from price on size
* (the previous step left size on price as the active regression)
regress price size
* predict will not overwrite an existing variable: drop any price_ols left from
* an earlier run of this step (capture ignores the error if it does not exist)
capture drop price_ols
predict price_ols
* The lowess plot type draws the nonparametric fit directly, with no need to
* generate a smoothed variable first
twoway (scatter price size, mcolor(%50)) ///
    (line price_ols size, lcolor(red) lwidth(medthick) sort) ///
    (lowess price size, lcolor(green) lwidth(medthick) lpattern(dash)), ///
    xtitle("House Size (square feet)") ///
    ytitle("House Sale Price (dollars)") ///
    title("Parametric vs Nonparametric Regression") ///
    legend(order(1 "Actual data" 2 "OLS (parametric)" 3 "LOWESS (nonparametric)"))
Paste into your Stata do-file editor
# =============================================================================
# CHAPTER 5 CHEAT SHEET: Bivariate Data Summary
# =============================================================================
# --- Libraries ---
library(haven) # read Stata .dta files directly from URLs
library(fixest) # fast OLS estimation with feols()
library(dplyr) # data manipulation (mutate, filter, summarize)
library(ggplot2) # grammar of graphics for all plots
# =============================================================================
# STEP 1: Load data directly from a URL
# =============================================================================
# read_dta() reads the Stata .dta file straight from the URL
url <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_HOUSE.DTA"
data_house <- read_dta(url)
cat(sprintf("Dataset: %d observations, %d variables\n",
            nrow(data_house), ncol(data_house)))
# =============================================================================
# STEP 2: Descriptive statistics — summarize each variable before comparing
# =============================================================================
# Five-number summary plus mean for the two variables of interest
summary(select(data_house, price, size))
# =============================================================================
# STEP 3: Scatter plot — visualize the relationship before quantifying it
# =============================================================================
# Size on the x-axis, price on the y-axis
ggplot(data_house) +
  geom_point(aes(x = size, y = price),
             color = "steelblue", size = 3, alpha = 0.7) +
  xlab("House Size (square feet)") +
  ylab("House Sale Price (dollars)") +
  ggtitle("House Price vs Size") +
  theme_minimal()
# =============================================================================
# STEP 4: Correlation coefficient — one number for direction and strength
# =============================================================================
# cor() returns the Pearson correlation; cor.test() additionally reports a p-value
r <- with(data_house, cor(price, size))
shared_pct <- round(r^2 * 100, 1)  # squared correlation, as a percentage
cat("Correlation coefficient: r =", round(r, 4), "\n")
cat("r² =", round(r^2, 4), "(", shared_pct, "% of variation shared)\n")
cor.test(data_house$price, data_house$size)
# =============================================================================
# STEP 5: OLS regression — fit the best-fitting line
# =============================================================================
# feols() from fixest: formula syntax y ~ x (intercept included automatically)
model <- feols(price ~ size, data = data_house)
summary(model)
slope <- coef(model)["size"]             # marginal effect: $/sq ft
intercept <- coef(model)["(Intercept)"]  # predicted price when size = 0
# r2() defaults to type = "all" and then returns a vector holding every R²
# variant (adjusted, pseudo, within, ...). Ask for the plain R² only, so that
# r_squared is a single number wherever it is printed or pasted into a title.
r_squared <- r2(model, type = "r2")
cat("Estimated equation: price =", round(intercept, 0), "+",
    round(slope, 2), "× size\n")
cat("R-squared:", round(r_squared, 4), "\n")
# =============================================================================
# STEP 6: Scatter plot with fitted line and R² — visualize model fit
# =============================================================================
# Build the title first so the plotting pipeline stays readable
fit_title <- paste0("OLS Regression: Fitted Line (R² = ",
                    round(r_squared * 100, 1), "%)")
# geom_smooth(method = "lm") draws the least-squares line of y on x;
# se = FALSE suppresses the confidence band
ggplot(data_house, aes(x = size, y = price)) +
  geom_point(color = "steelblue", size = 3, alpha = 0.7) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE,
              color = "red", linewidth = 1.2) +
  labs(title = fit_title,
       x = "House Size (square feet)",
       y = "House Sale Price (dollars)") +
  theme_minimal()
# =============================================================================
# STEP 7: Reverse regression — association is NOT causation
# =============================================================================
# If regression = causation, the reverse slope would be 1/slope. It is not:
# swapping the roles of y and x gives a different line, not the inverse one.
reverse_model <- feols(size ~ price, data = data_house)
reverse_slope <- coef(reverse_model)["price"]  # sq ft per dollar
inverse_slope <- 1 / slope                     # what an exact inverse would give
cat("price ~ size slope:", round(slope, 4), "\n")
cat("size ~ price slope:", round(reverse_slope, 6), "\n")
cat("1 / original slope: ", round(inverse_slope, 6), "\n")
cat("Reciprocals match? No — regression is asymmetric!\n")
# =============================================================================
# STEP 8: Nonparametric regression — check the linearity assumption
# =============================================================================
# LOESS fits weighted local regressions; if the curve tracks the OLS line,
# the linear assumption is validated for this dataset.
# Both colour and linetype are MAPPED (inside aes) to the same label and then
# given values by manual scales. Setting them as fixed arguments instead would
# override the mapping and keep that layer out of the legend.
fit_labels <- c("OLS (parametric)", "LOESS (nonparametric)")
ggplot(data_house, aes(x = size, y = price)) +
  geom_point(color = "steelblue", size = 3, alpha = 0.6) +
  geom_smooth(aes(color = "OLS (parametric)", linetype = "OLS (parametric)"),
              method = "lm", formula = y ~ x,
              linewidth = 1.2, se = FALSE) +
  geom_smooth(aes(color = "LOESS (nonparametric)",
                  linetype = "LOESS (nonparametric)"),
              method = "loess", formula = y ~ x,
              linewidth = 1.2, se = FALSE) +
  # Identical name and breaks on both scales make ggplot2 merge them into a
  # single legend; breaks also fixes the entry order (OLS first)
  scale_color_manual(name = NULL, breaks = fit_labels,
                     values = c("OLS (parametric)" = "red",
                                "LOESS (nonparametric)" = "green")) +
  scale_linetype_manual(name = NULL, breaks = fit_labels,
                        values = c("OLS (parametric)" = "solid",
                                   "LOESS (nonparametric)" = "dashed")) +
  labs(x = "House Size (square feet)", y = "House Sale Price (dollars)",
       title = "Parametric vs Nonparametric Regression") +
  theme_minimal()
Paste into your R console or RStudio