Code Summary
You've explored the key concepts interactively — now reproduce them in code. These self-contained blocks cover everything you practiced above. Pick your language, copy the code, and run it.
# =============================================================================
# CHAPTER 2 CHEAT SHEET: Univariate Data Summary
# =============================================================================
# --- Libraries ---
import numpy as np # numerical operations (log, mean)
import pandas as pd # data loading and manipulation
import matplotlib.pyplot as plt # creating plots and visualizations
from scipy import stats # skewness, kurtosis, distribution shape
# =============================================================================
# STEP 1: Load data directly from a URL
# =============================================================================
# The earnings data is a Stata .dta file hosted online; pd.read_stata() reads
# it straight from the URL (this dataset has 171 observations)
url_earnings = (
    "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/"
    "AED_EARNINGS.DTA"
)
data_earnings = pd.read_stata(url_earnings)
# Keep the earnings column as its own Series; every later step works on it
earnings = data_earnings['earnings']
n_obs, n_vars = data_earnings.shape
print(f"Dataset: {n_obs} observations, {n_vars} variables")
# =============================================================================
# STEP 2: Summary statistics — mean vs median reveals skewness
# =============================================================================
# A single .describe() call reports count, mean, std, min, quartiles, and max
summary_table = data_earnings[['earnings']].describe()
print(summary_table.round(2))
# Shape of the distribution: skew() measures asymmetry, kurtosis() tail weight
# (scipy's kurtosis() reports excess kurtosis by default, so normal = 0)
skew_value = stats.skew(earnings)
excess_kurt = stats.kurtosis(earnings)
mean_median_gap = earnings.mean() - earnings.median()
print(f"\nSkewness: {skew_value:.2f} (> 1 = strongly right-skewed)")
print(f"Excess kurtosis: {excess_kurt:.2f} (> 0 = heavier tails than normal)")
print(f"Mean - Median: ${mean_median_gap:,.0f} (positive gap signals right skew)")
# =============================================================================
# STEP 3: Histogram with KDE overlay — see the distribution shape
# =============================================================================
# The number of bins fixes the bin width: more (narrower) bins show more
# detail and more noise, fewer (wider) bins give a smoother picture
fig, ax = plt.subplots(figsize=(10, 6))
# density=True puts the bars on a density scale so they line up with the KDE
hist_style = dict(bins=20, density=True, alpha=0.7, edgecolor='black',
                  label='Histogram')
ax.hist(earnings, **hist_style)
earnings.plot.kde(ax=ax, color='red', linewidth=2, label='KDE')
ax.set(
    xlabel='Annual Earnings ($)',
    ylabel='Density',
    title='Earnings Distribution: Histogram + Kernel Density Estimate',
)
ax.grid(True, alpha=0.3)
ax.legend()
plt.tight_layout()
plt.show()
# =============================================================================
# STEP 4: Box plot — visualize quartiles and outliers
# =============================================================================
# The box runs from Q1 to Q3 (the IQR); each whisker reaches the most extreme
# observation within 1.5×IQR of the box; points beyond that are drawn as dots
fig, ax = plt.subplots(figsize=(10, 4))
box_fill = dict(facecolor='lightblue', alpha=0.7)
median_line = dict(color='red', linewidth=2)
# patch_artist=True is what lets the box take a fill colour
ax.boxplot(
    earnings,
    vert=False,
    patch_artist=True,
    boxprops=box_fill,
    medianprops=median_line,
)
ax.set(
    xlabel='Annual Earnings ($)',
    title='Box Plot of Earnings — Median, Quartiles, and Outliers',
)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# =============================================================================
# STEP 5: Log transformation — taming right skew
# =============================================================================
# The natural log shrinks large values far more than small ones, which pulls
# in a long right tail and makes the distribution more symmetric — many
# statistical methods work better on roughly symmetric data
data_earnings['lnearnings'] = np.log(earnings)
ln_earnings = data_earnings['lnearnings']
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# One row per panel: (axis, data, bar colour, title prefix, x-axis label)
panels = [
    (axes[0], earnings, 'steelblue', 'Original', 'Earnings ($)'),
    (axes[1], ln_earnings, 'coral', 'Log-transformed', 'ln(Earnings)'),
]
for panel_ax, series, bar_color, prefix, x_label in panels:
    panel_ax.hist(series, bins=20, edgecolor='black', alpha=0.7, color=bar_color)
    # Put the skewness in the title so the before/after change is explicit
    panel_ax.set_title(f'{prefix} (skewness = {stats.skew(series):.2f})')
    panel_ax.set_xlabel(x_label)
plt.suptitle('Effect of Log Transformation on Skewness', fontweight='bold')
plt.tight_layout()
plt.show()
# =============================================================================
# STEP 6: Z-scores — how unusual is each observation?
# =============================================================================
# A z-score, z = (x - mean) / std, expresses each value as a number of
# standard deviations from the mean: |z| > 2 is unusual, |z| > 3 very unusual
mean_earnings = earnings.mean()
std_earnings = earnings.std()
z_scores = (earnings - mean_earnings) / std_earnings
median_earnings = earnings.median()
# z-score of the median itself: shows how far the mean sits from the middle
median_z = (median_earnings - mean_earnings) / std_earnings
n_unusual = (z_scores.abs() > 2).sum()
print(f"Highest earner: ${earnings.max():,.0f} → z = {z_scores.max():.2f}")
print(f"Median earner: ${median_earnings:,.0f} → z = {median_z:.2f}")
print(f"Observations with |z| > 2: {n_unusual} out of {len(z_scores)}")
# =============================================================================
# STEP 7: Time series — moving average smooths seasonal noise
# =============================================================================
# Monthly home sales zigzag with the seasons. An 11-month centered moving
# average (5 months back, the current month, 5 months ahead) spans almost a
# full 12-month seasonal cycle, so most of the seasonal swing averages out
# and the underlying trend shows through
url_homesales = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_MONTHLYHOMESALES.DTA"
data_hs = pd.read_stata(url_homesales)
# Compute the moving average in code (as the Stata and R versions do) instead
# of relying on a precomputed column. It is calculated on the full series
# BEFORE filtering, so the first months of 2005 still get a complete window.
# Months without a full 11-month window (the ends of the series) are NaN.
# Assumes the rows are already in chronological order — TODO confirm
data_hs['exsales_smooth'] = (
    data_hs['exsales'].rolling(window=11, center=True).mean()
)
# .copy() makes the 2005+ subset an independent DataFrame rather than a slice
data_hs = data_hs[data_hs['year'] >= 2005].copy()
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(data_hs['daten'], data_hs['exsales'], linewidth=1, alpha=0.6,
        label='Original (monthly)')
ax.plot(data_hs['daten'], data_hs['exsales_smooth'], linewidth=2, color='red',
        linestyle='--', label='11-month Moving Average')
ax.set_xlabel('Year')
ax.set_ylabel('Monthly Home Sales')
ax.set_title('U.S. Home Sales: Raw Series vs. Moving Average (2005–2015)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
* =============================================================================
* CHAPTER 2 CHEAT SHEET: Univariate Data Summary
* =============================================================================
* --- Setup ---
* Run these two lines first so the rest of the do-file starts from a known state
clear all // start with a clean workspace
set more off // do not pause output for long results
* =============================================================================
* STEP 1: Load data directly from a URL
* =============================================================================
* use loads a Stata .dta file (here straight from a web address); the "clear"
* option drops any data already in memory so the load cannot be refused
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_EARNINGS.DTA", clear
describe // list all variables, types, and labels
display "Observations: " _N // _N is Stata's built-in observation count
* =============================================================================
* STEP 2: Summary statistics — mean vs median reveals skewness
* =============================================================================
* summarize gives n, mean, std, min, max; "detail" adds median, skewness, kurtosis
summarize earnings, detail
// summarize leaves its results in r(); the display lines below read them, so
// keep them directly after summarize (another r-class command would replace r())
display "Skewness: " r(skewness) // > 1 = strongly right-skewed
display "Kurtosis: " r(kurtosis) // > 3 = heavier tails than normal
display "Mean - Median: " r(mean) - r(p50) // positive gap signals right skew
* =============================================================================
* STEP 3: Histogram with KDE overlay — see the distribution shape
* =============================================================================
* histogram draws the bars; its kdensity option adds a smooth kernel density
* estimate on top of them in the same graph
histogram earnings,                                                      ///
    title("Earnings Distribution: Histogram + Kernel Density Estimate") ///
    xtitle("Annual Earnings ($)") ytitle("Density")                     ///
    kdensity
* =============================================================================
* STEP 4: Box plot — visualize quartiles and outliers
* =============================================================================
* graph box draws a box-and-whisker plot: the box runs from Q1 to Q3 (the IQR),
* the whiskers reach out to at most 1.5 x IQR beyond the box, and observations
* past the whiskers are drawn as individual dots
graph box earnings,                                                  ///
    title("Box Plot of Earnings — Median, Quartiles, and Outliers") ///
    ytitle("Annual Earnings ($)")
* =============================================================================
* STEP 5: Log transformation — taming right skew
* =============================================================================
* generate creates a new variable; ln() is the natural log function.
* The log shrinks large values far more than small ones, which pulls in a long
* right tail and makes the distribution more symmetric — many statistical
* methods work better on roughly symmetric data
generate lnearnings = ln(earnings)
* Store each histogram in memory under a name, then place them side by side
histogram earnings, title("Original") xtitle("Earnings ($)") name(raw, replace)
histogram lnearnings, title("Log-transformed") xtitle("ln(Earnings)") name(logged, replace)
graph combine raw logged, title("Effect of Log Transformation on Skewness")
* =============================================================================
* STEP 6: Z-scores — how unusual is each observation?
* =============================================================================
* z = (x - mean) / std puts every value on a "standard deviations from the
* mean" scale: |z| > 2 is unusual, |z| > 3 is very unusual
// summarize stores the mean and standard deviation in r(mean) and r(sd);
// gen must follow immediately, before another command replaces r()
summarize earnings
gen z_earnings = (earnings - r(mean)) / r(sd)
// Inspect extreme values
summarize z_earnings, detail
display "Observations with |z| > 2:"
// Stata treats a missing value (.) as larger than any number, so
// abs(.) > 2 is true; !missing() keeps missing z-scores out of the count
count if abs(z_earnings) > 2 & !missing(z_earnings)
* =============================================================================
* STEP 7: Time series — moving average smooths seasonal noise
* =============================================================================
* Monthly home sales zigzag with the seasons; an 11-month centered moving
* average spans almost a full 12-month seasonal cycle, so most of the seasonal
* swing averages out and the underlying trend shows through
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_MONTHLYHOMESALES.DTA", clear
keep if year >= 2005
// tsset declares the time-series structure so Stata can use time operators
// NOTE(review): tsset needs an integer-valued, regularly spaced time variable;
// confirm daten is stored as a Stata monthly date — TODO verify
// tssmooth ma computes a moving average; window(5 1 5) means 5 lagged months,
// the current month, and 5 leading months
tsset daten
tssmooth ma exsales_smooth = exsales, window(5 1 5) // 11-month centered MA
twoway (line exsales daten, lwidth(thin) lcolor(gs10)) ///
(line exsales_smooth daten, lwidth(medthick) lcolor(red) lpattern(dash)), ///
xtitle("Year") ytitle("Monthly Home Sales") ///
title("U.S. Home Sales: Raw Series vs. Moving Average (2005–2015)") ///
legend(order(1 "Original (monthly)" 2 "11-month Moving Average"))
Paste into your Stata do-file editor
# =============================================================================
# CHAPTER 2 CHEAT SHEET: Univariate Data Summary
# =============================================================================
# --- Libraries ---
library(haven) # read Stata .dta files directly from URLs
library(dplyr) # data manipulation (mutate, filter, summarize)
library(ggplot2) # grammar of graphics for all plots
library(e1071) # skewness() and kurtosis() functions
# =============================================================================
# STEP 1: Load data directly from a URL
# =============================================================================
# haven's read_dta() reads a Stata .dta file from a local path or a URL
url_earnings <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_EARNINGS.DTA"
data_earnings <- read_dta(url_earnings)
# Keep the earnings column as a plain vector; later steps work on it directly
earnings <- data_earnings[["earnings"]]
data_dims <- dim(data_earnings) # c(rows, columns)
cat("Dataset:", data_dims[1], "observations,", data_dims[2], "variables\n")
# =============================================================================
# STEP 2: Summary statistics — mean vs median reveals skewness
# =============================================================================
# One summary() call reports min, Q1, median, mean, Q3, and max
summary(earnings)
# Shape of the distribution: skewness() measures asymmetry, kurtosis() tail
# weight (e1071's kurtosis() is excess kurtosis, so a normal distribution is 0)
skew_value <- round(skewness(earnings), 2)
kurt_value <- round(kurtosis(earnings), 2)
mean_median_gap <- round(mean(earnings) - median(earnings), 0)
cat("\nSkewness: ", skew_value, " (> 1 = strongly right-skewed)\n")
cat("Excess kurtosis:", kurt_value, " (> 0 = heavier tails than normal)\n")
cat("Mean - Median: $", mean_median_gap, " (positive gap signals right skew)\n")
# =============================================================================
# STEP 3: Histogram with KDE overlay — see the distribution shape
# =============================================================================
# geom_histogram() draws the bars and geom_density() the smooth KDE curve.
# Mapping y to after_stat(density) puts the bars on the density scale so the
# two layers share one vertical axis
ggplot(data_earnings, aes(x = earnings)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 20, alpha = 0.7, color = "black", fill = "steelblue"
  ) +
  geom_density(linewidth = 1.2, color = "red") +
  xlab("Annual Earnings ($)") +
  ylab("Density") +
  ggtitle("Earnings Distribution: Histogram + Kernel Density Estimate") +
  theme_minimal()
# =============================================================================
# STEP 4: Box plot — visualize quartiles and outliers
# =============================================================================
# The box runs from Q1 to Q3 (the IQR); each whisker reaches the most extreme
# observation within 1.5x IQR of the box; points beyond that are outliers.
# coord_flip() turns the vertical box plot on its side, so the y label ends
# up along the horizontal axis
ggplot(data_earnings, aes(y = earnings)) +
  geom_boxplot(outlier.color = "red", alpha = 0.7, fill = "lightblue") +
  coord_flip() +
  xlab("") +
  ylab("Annual Earnings ($)") +
  ggtitle("Box Plot of Earnings \u2014 Median, Quartiles, and Outliers") +
  theme_minimal()
# =============================================================================
# STEP 5: Log transformation — taming right skew
# =============================================================================
# log() is the natural log: it shrinks large values far more than small ones,
# which pulls in a long right tail and makes the distribution more symmetric —
# many statistical methods work better on roughly symmetric data
data_earnings <- mutate(data_earnings, lnearnings = log(earnings))
# Original and log-transformed histograms are drawn side by side
library(patchwork) # combine ggplots side by side with + operator
# Skewness before and after, computed once and shown in the panel titles
skew_original <- round(skewness(earnings), 2)
skew_logged <- round(skewness(data_earnings$lnearnings), 2)
p1 <- ggplot(data_earnings, aes(x = earnings)) +
  geom_histogram(bins = 20, alpha = 0.7, color = "black", fill = "steelblue") +
  xlab("Earnings ($)") +
  ggtitle(paste0("Original (skewness = ", skew_original, ")")) +
  theme_minimal()
p2 <- ggplot(data_earnings, aes(x = lnearnings)) +
  geom_histogram(bins = 20, alpha = 0.7, color = "black", fill = "coral") +
  xlab("ln(Earnings)") +
  ggtitle(paste0("Log-transformed (skewness = ", skew_logged, ")")) +
  theme_minimal()
p1 + p2 + plot_annotation(title = "Effect of Log Transformation on Skewness")
# =============================================================================
# STEP 6: Z-scores — how unusual is each observation?
# =============================================================================
# A z-score, z = (x - mean) / sd, expresses each value as a number of standard
# deviations from the mean: |z| > 2 is unusual, |z| > 3 is very unusual
z_scores <- scale(earnings)[, 1] # scale() returns a one-column matrix; [, 1] takes the vector
top_earnings <- max(earnings)
median_earnings <- median(earnings)
# z-score of the median itself: shows how far the mean sits from the middle
median_z <- (median_earnings - mean(earnings)) / sd(earnings)
n_unusual <- sum(abs(z_scores) > 2)
cat("Highest earner: $", top_earnings, " \u2192 z =", round(max(z_scores), 2), "\n")
cat("Median earner: $", median_earnings, " \u2192 z =", round(median_z, 2), "\n")
cat("Observations with |z| > 2:", n_unusual, "out of", length(z_scores), "\n")
# =============================================================================
# STEP 7: Time series — moving average smooths seasonal noise
# =============================================================================
# Monthly home sales zigzag with the seasons; an 11-month centered moving
# average spans almost a full 12-month seasonal cycle, so most of the seasonal
# swing averages out and the underlying trend shows through
url_homesales <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_MONTHLYHOMESALES.DTA"
data_hs <- filter(read_dta(url_homesales), year >= 2005)
# zoo::rollmean() with align = "center" averages k = 11 months around each
# point; fill = NA pads the months at either end that lack a full window
library(zoo) # rollmean() for moving averages
data_hs <- mutate(
  data_hs,
  exsales_smooth = rollmean(exsales, k = 11, fill = NA, align = "center")
)
ggplot(data_hs, aes(x = daten)) +
  geom_line(aes(y = exsales), alpha = 0.6, linewidth = 0.5) +
  geom_line(aes(y = exsales_smooth),
            linetype = "dashed", linewidth = 1.2, color = "red") +
  xlab("Year") +
  ylab("Monthly Home Sales") +
  ggtitle("U.S. Home Sales: Raw Series vs. Moving Average (2005\u20132015)") +
  theme_minimal()
Paste into your R console or RStudio
Keep learning
You have used every widget. The full chapter covers everything here plus case studies (cross-country distributions, convergence, spatial data) that are not in the dashboard.