Code Summary
You've explored the key concepts interactively — now reproduce them in code. These self-contained blocks cover everything you practiced above. Pick your language, copy the code, and run it.
# =============================================================================
# CHAPTER 3 CHEAT SHEET: The Sample Mean
# =============================================================================
# --- Libraries ---
import numpy as np # numerical operations and random sampling
import pandas as pd # data loading and manipulation
import matplotlib.pyplot as plt # creating plots and visualizations
from scipy import stats # normal distribution PDF for overlays
# =============================================================================
# STEP 1: Load pre-computed sample means from coin toss experiments
# =============================================================================
# 400 samples of 30 coin tosses each — precomputed in the textbook dataset
url_coin = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_COINTOSSMEANS.DTA"
data_coin = pd.read_stata(url_coin)
xbar_coin = data_coin['xbar']
print(f"Coin toss experiment: {len(xbar_coin)} sample means (each from n=30 tosses)")
print(f"Mean of sample means: {xbar_coin.mean():.4f} (theoretical μ = 0.5)")
print(f"SD of sample means: {xbar_coin.std():.4f} (theoretical σ/√n = {np.sqrt(0.25/30):.4f})")
# =============================================================================
# STEP 2: Visualize the sampling distribution with normal overlay
# =============================================================================
# The histogram of 400 sample means approximates the sampling distribution of X̄
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(xbar_coin, bins=25, density=True, edgecolor='black', alpha=0.7,
label='400 sample means')
# Overlay theoretical normal: N(μ, σ²/n)
theo_se = np.sqrt(0.25 / 30)
x_range = np.linspace(xbar_coin.min(), xbar_coin.max(), 100)
ax.plot(x_range, stats.norm.pdf(x_range, 0.5, theo_se),
'r-', linewidth=2.5, label=f'N(0.5, {theo_se:.3f}²)')
ax.set_xlabel('Sample Mean (proportion of heads)')
ax.set_ylabel('Density')
ax.set_title('Sampling Distribution of X̄ from 400 Coin Toss Experiments (n=30)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# =============================================================================
# STEP 3: Central Limit Theorem — non-normal population still gives normal X̄
# =============================================================================
# 1880 U.S. Census ages: highly skewed population, yet sample means are normal
url_census = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_CENSUSAGEMEANS.DTA"
data_census = pd.read_stata(url_census)
# Identify the sample mean column
if 'mean' in data_census.columns:
age_means = data_census['mean']
elif 'xmean' in data_census.columns:
age_means = data_census['xmean']
else:
age_means = data_census.iloc[:, 0]
print(f"\n1880 Census: {len(age_means)} sample means (each from n=25 people)")
print(f"Mean of sample means: {age_means.mean():.2f} years (theoretical μ = 24.13)")
print(f"SD of sample means: {age_means.std():.2f} years (theoretical σ/√n = {18.61/np.sqrt(25):.2f})")
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(age_means, bins=20, density=True, edgecolor='black', alpha=0.7,
label='100 sample means')
age_range = np.linspace(age_means.min(), age_means.max(), 100)
ax.plot(age_range, stats.norm.pdf(age_range, 24.13, 18.61 / np.sqrt(25)),
'r-', linewidth=2.5, label=f'N(24.13, {18.61/np.sqrt(25):.2f}²)')
ax.set_xlabel('Sample Mean Age (years)')
ax.set_ylabel('Density')
ax.set_title('CLT in Action: Normal Sample Means from a Skewed Population')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# =============================================================================
# STEP 4: Standard error — how sample size affects precision
# =============================================================================
# SE = σ/√n: to halve the SE, you must quadruple the sample size
sigma = 0.5 # coin toss population std dev
print(f"\nStandard error vs sample size (σ = {sigma}):")
print(f"{'n':<10} {'SE = σ/√n':<15} {'Var(X̄) = σ²/n':<15}")
print("-" * 40)
for n in [10, 30, 100, 400, 1000]:
se = sigma / np.sqrt(n)
var_xbar = sigma**2 / n
print(f"{n:<10} {se:<15.4f} {var_xbar:<15.6f}")
# =============================================================================
# STEP 5: Monte Carlo simulation — verify the theory computationally
# =============================================================================
# Simulate 1000 samples of 30 coin tosses to see the CLT converge
np.random.seed(10101)
n_sims = 1000
sample_size = 30
sim_means = np.array([np.random.binomial(1, 0.5, sample_size).mean()
for _ in range(n_sims)])
print(f"\nMonte Carlo simulation ({n_sims} samples, n={sample_size}):")
print(f"Mean of simulated means: {sim_means.mean():.4f} (theoretical: 0.5)")
print(f"SD of simulated means: {sim_means.std():.4f} (theoretical: {np.sqrt(0.25/30):.4f})")
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(sim_means, bins=30, density=True, edgecolor='black', alpha=0.7,
label=f'{n_sims} simulated means')
x_range = np.linspace(sim_means.min(), sim_means.max(), 100)
ax.plot(x_range, stats.norm.pdf(x_range, 0.5, np.sqrt(0.25/30)),
'r-', linewidth=2.5, label='Theoretical N(0.5, 0.091²)')
ax.set_xlabel('Sample Mean')
ax.set_ylabel('Density')
ax.set_title('Monte Carlo Simulation vs Theoretical Sampling Distribution')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# =============================================================================
# STEP 6: Weighted means — correcting for nonrepresentative samples
# =============================================================================
# When inclusion probabilities differ, the unweighted mean is biased;
# inverse-probability weights w_i = 1/π_i recover the true population mean
np.random.seed(42)
income_men = np.random.normal(60000, 15000, 50)
income_women = np.random.normal(50000, 15000, 50)
true_pop_mean = (income_men.mean() + income_women.mean()) / 2
# Biased sample: oversample women (70% women, 30% men)
sample_men = np.random.choice(income_men, size=15, replace=False)
sample_women = np.random.choice(income_women, size=35, replace=False)
sample = np.concatenate([sample_men, sample_women])
# Unweighted mean is biased toward the oversampled group
unweighted = sample.mean()
# Weighted mean with IPW: w_i = 1/π_i corrects the imbalance
weights = np.concatenate([np.repeat(1/0.3, 15), np.repeat(1/0.7, 35)])
weighted = np.average(sample, weights=weights)
print(f"\nWeighted vs Unweighted Means:")
print(f"True population mean: ${true_pop_mean:,.0f}")
print(f"Unweighted mean: ${unweighted:,.0f} (bias: ${unweighted - true_pop_mean:,.0f})")
print(f"Weighted mean (IPW): ${weighted:,.0f} (bias: ${weighted - true_pop_mean:,.0f})")
* =============================================================================
* CHAPTER 3 CHEAT SHEET: The Sample Mean
* =============================================================================
* --- Setup ---
clear all // start with a clean workspace
set more off // do not pause output for long results
* =============================================================================
* STEP 1: Load pre-computed sample means from coin toss experiments
* =============================================================================
* 400 samples of 30 coin tosses each — precomputed in the textbook dataset
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_COINTOSSMEANS.DTA", clear
describe // list all variables, types, and labels
display "Observations: " _N // _N is Stata's built-in observation count
* Compute mean and SD of the sample means
summarize xbar
display "Mean of sample means: " r(mean) // theoretical μ = 0.5
display "SD of sample means: " r(sd) // theoretical σ/√n = 0.0913
* =============================================================================
* STEP 2: Visualize the sampling distribution with normal overlay
* =============================================================================
* The histogram of 400 sample means approximates the sampling distribution
* kdensity overlays a smooth kernel density estimate
histogram xbar, kdensity normal ///
xtitle("Sample Mean (proportion of heads)") ///
ytitle("Density") ///
title("Sampling Distribution of X-bar from 400 Coin Toss Experiments (n=30)")
* =============================================================================
* STEP 3: Central Limit Theorem — non-normal population still gives normal X-bar
* =============================================================================
* 1880 U.S. Census ages: highly skewed population, yet sample means are normal
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_CENSUSAGEMEANS.DTA", clear
describe
summarize // identify the sample mean variable
* Use the first numeric variable (sample means of ages)
ds, has(type numeric) // list numeric variables
* Summarize the sample means
summarize xmean, detail
display "Mean of sample means: " r(mean) // theoretical μ = 24.13
display "SD of sample means: " r(sd) // theoretical σ/√n = 3.72
* Histogram with normal overlay demonstrates CLT
histogram xmean, kdensity normal ///
xtitle("Sample Mean Age (years)") ///
ytitle("Density") ///
title("CLT in Action: Normal Sample Means from a Skewed Population")
* =============================================================================
* STEP 4: Standard error — how sample size affects precision
* =============================================================================
* SE = σ/√n: to halve the SE, you must quadruple the sample size
* Demonstrate with coin toss: σ = 0.5
display ""
display "Standard error vs sample size (σ = 0.5):"
display "{hline 40}"
display %~10s "n" %~15s "SE = σ/√n" %~15s "Var(X-bar)"
display "{hline 40}"
foreach n in 10 30 100 400 1000 {
local se = 0.5 / sqrt(`n')
local var_xbar = 0.25 / `n'
display %10.0f `n' %15.4f `se' %15.6f `var_xbar'
}
* =============================================================================
* STEP 5: Monte Carlo simulation — verify the theory computationally
* =============================================================================
* Simulate 1000 samples of 30 coin tosses to see the CLT converge
clear
set seed 10101
set obs 1000 // one observation per simulation run
* Generate each simulated mean: average of 30 Bernoulli(0.5) draws
gen sim_mean = .
forvalues i = 1/1000 {
// Draw 30 coin tosses and compute mean
quietly {
preserve
clear
set obs 30
gen toss = runiform() < 0.5
summarize toss
local m = r(mean)
restore
replace sim_mean = `m' in `i'
}
}
summarize sim_mean
display ""
display "Monte Carlo simulation (1000 samples, n=30):"
display "Mean of simulated means: " r(mean) // theoretical: 0.5
display "SD of simulated means: " r(sd) // theoretical: 0.0913
histogram sim_mean, kdensity normal ///
xtitle("Sample Mean") ///
ytitle("Density") ///
title("Monte Carlo Simulation vs Theoretical Sampling Distribution")
* =============================================================================
* STEP 6: Weighted means — correcting for nonrepresentative samples
* =============================================================================
* When inclusion probabilities differ, the unweighted mean is biased;
* inverse-probability weights w_i = 1/π_i recover the true population mean
clear
set seed 42
* Create a population: 50 men (mean 60000) and 50 women (mean 50000)
set obs 100
gen male = (_n <= 50)
gen income = rnormal(60000, 15000) if male == 1
replace income = rnormal(50000, 15000) if male == 0
* True population mean
summarize income
local true_mean = r(mean)
* Biased sample: oversample women (70% women, 30% men)
* Tag 15 random men and 35 random women
gen rand = runiform()
sort male rand
gen in_sample = 0
// First 15 men (male==1 block is obs 51-100 after sort)
by male: replace in_sample = 1 if male == 1 & _n <= 15
by male: replace in_sample = 1 if male == 0 & _n <= 35
keep if in_sample == 1
* Unweighted mean (biased toward oversampled group)
summarize income
local unweighted = r(mean)
* Create IPW weights: w_i = 1/π_i
gen ipw = 1/0.3 if male == 1
replace ipw = 1/0.7 if male == 0
* Weighted mean corrects the imbalance
summarize income [aw=ipw]
local weighted = r(mean)
display ""
display "Weighted vs Unweighted Means:"
display "True population mean: $" %9.0fc `true_mean'
display "Unweighted mean: $" %9.0fc `unweighted'
display "Weighted mean (IPW): $" %9.0fc `weighted'
Paste into your Stata do-file editor
# =============================================================================
# CHAPTER 3 CHEAT SHEET: The Sample Mean
# =============================================================================
# --- Libraries ---
library(haven) # read Stata .dta files directly from URLs
library(ggplot2) # grammar of graphics for all plots
library(dplyr) # data manipulation
# =============================================================================
# STEP 1: Load pre-computed sample means from coin toss experiments
# =============================================================================
# 400 samples of 30 coin tosses each — precomputed in the textbook dataset
url_coin <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_COINTOSSMEANS.DTA"
data_coin <- read_dta(url_coin)
cat("Coin toss experiment:", nrow(data_coin), "sample means (each from n=30 tosses)\n")
cat("Mean of sample means:", round(mean(data_coin$xbar), 4),
" (theoretical μ = 0.5)\n")
cat("SD of sample means: ", round(sd(data_coin$xbar), 4),
" (theoretical σ/√n =", round(sqrt(0.25/30), 4), ")\n")
# =============================================================================
# STEP 2: Visualize the sampling distribution with normal overlay
# =============================================================================
# The histogram of 400 sample means approximates the sampling distribution
theo_se <- sqrt(0.25 / 30)
ggplot(data_coin, aes(x = xbar)) +
geom_histogram(aes(y = after_stat(density)), bins = 25,
fill = "steelblue", color = "black", alpha = 0.7) +
stat_function(fun = dnorm, args = list(mean = 0.5, sd = theo_se),
color = "red", linewidth = 1.5) +
labs(x = "Sample Mean (proportion of heads)", y = "Density",
title = "Sampling Distribution of X-bar from 400 Coin Toss Experiments (n=30)") +
theme_minimal()
# =============================================================================
# STEP 3: Central Limit Theorem — non-normal population still gives normal X-bar
# =============================================================================
# 1880 U.S. Census ages: highly skewed population, yet sample means are normal
url_census <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_CENSUSAGEMEANS.DTA"
data_census <- read_dta(url_census)
# Identify the sample mean column
age_means <- if ("xmean" %in% names(data_census)) data_census$xmean else data_census[[1]]
cat("\n1880 Census:", length(age_means), "sample means\n")
cat("Mean of sample means:", round(mean(age_means), 2), "years\n")
cat("SD of sample means: ", round(sd(age_means), 2), "years\n")
ggplot(data.frame(xmean = age_means), aes(x = xmean)) +
geom_histogram(aes(y = after_stat(density)), bins = 20,
fill = "steelblue", color = "black", alpha = 0.7) +
stat_function(fun = dnorm, args = list(mean = 24.13, sd = 18.61/sqrt(25)),
color = "red", linewidth = 1.5) +
labs(x = "Sample Mean Age (years)", y = "Density",
title = "CLT in Action: Normal Sample Means from a Skewed Population") +
theme_minimal()
# =============================================================================
# STEP 4: Standard error — how sample size affects precision
# =============================================================================
# SE = σ/√n: to halve the SE, you must quadruple the sample size
sigma <- 0.5
se_table <- data.frame(
n = c(10, 30, 100, 400, 1000),
SE = sigma / sqrt(c(10, 30, 100, 400, 1000)),
Var_Xbar = sigma^2 / c(10, 30, 100, 400, 1000)
)
print(se_table, row.names = FALSE)
# =============================================================================
# STEP 5: Monte Carlo simulation — verify the theory computationally
# =============================================================================
# Simulate 1000 samples of 30 coin tosses to see the CLT converge
set.seed(10101)
n_sims <- 1000
sample_size <- 30
# replicate() runs an expression n_sims times and collects results
sim_means <- replicate(n_sims, mean(rbinom(sample_size, 1, 0.5)))
cat("\nMonte Carlo simulation (", n_sims, "samples, n =", sample_size, "):\n")
cat("Mean of simulated means:", round(mean(sim_means), 4), " (theoretical: 0.5)\n")
cat("SD of simulated means: ", round(sd(sim_means), 4),
" (theoretical:", round(sqrt(0.25/30), 4), ")\n")
ggplot(data.frame(x = sim_means), aes(x = x)) +
geom_histogram(aes(y = after_stat(density)), bins = 30,
fill = "steelblue", color = "white", alpha = 0.7) +
stat_function(fun = dnorm, args = list(mean = 0.5, sd = sqrt(0.25/30)),
color = "red", linewidth = 1.5) +
labs(x = "Sample Mean", y = "Density",
title = "Monte Carlo Simulation vs Theoretical Sampling Distribution") +
theme_minimal()
# =============================================================================
# STEP 6: Weighted means — correcting for nonrepresentative samples
# =============================================================================
# When inclusion probabilities differ, the unweighted mean is biased;
# inverse-probability weights w_i = 1/π_i recover the true population mean
set.seed(42)
income_men <- rnorm(50, mean = 60000, sd = 15000)
income_women <- rnorm(50, mean = 50000, sd = 15000)
true_pop_mean <- (mean(income_men) + mean(income_women)) / 2
# Biased sample: oversample women (70% women, 30% men)
sample_men <- sample(income_men, 15, replace = FALSE)
sample_women <- sample(income_women, 35, replace = FALSE)
sample_income <- c(sample_men, sample_women)
# Unweighted mean is biased toward the oversampled group
unweighted <- mean(sample_income)
# Weighted mean with IPW: w_i = 1/π_i corrects the imbalance
weights <- c(rep(1/0.3, 15), rep(1/0.7, 35))
weighted_mean <- weighted.mean(sample_income, weights)
cat("\nWeighted vs Unweighted Means:\n")
cat("True population mean: $", round(true_pop_mean, 0), "\n")
cat("Unweighted mean: $", round(unweighted, 0), "\n")
cat("Weighted mean (IPW): $", round(weighted_mean, 0), "\n")
Paste into your R console or RStudio