Code Summary
You've explored the key concepts interactively — now reproduce them in code. These self-contained blocks cover everything you practiced above. Pick your language, copy the code, and run it.
# =============================================================================
# CHAPTER 4 CHEAT SHEET: Statistical Inference for the Mean
# =============================================================================
# --- Libraries ---
import numpy as np # numerical operations (sqrt, linspace)
import pandas as pd # data loading and manipulation
import matplotlib.pyplot as plt # creating plots and visualizations
from scipy import stats # t-distribution and normal distribution
# =============================================================================
# STEP 1: Load data directly from a URL
# =============================================================================
# pd.read_stata() reads Stata .dta files — this sample has 171 female workers
url = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_EARNINGS.DTA"
data_earnings = pd.read_stata(url)
earnings = data_earnings['earnings']
print(f"Dataset: {len(earnings)} observations")
# =============================================================================
# STEP 2: Sample statistics and standard error
# =============================================================================
# The standard error measures the precision of the sample mean as an estimate
# of the population mean. It shrinks with sqrt(n), not n — so quadrupling the
# sample size is needed to halve the SE.
n = len(earnings)
mean_earnings = earnings.mean()
std_earnings = earnings.std(ddof=1) # ddof=1 for sample std dev
se_earnings = std_earnings / np.sqrt(n) # standard error = s / sqrt(n)
print(f"Sample mean: ${mean_earnings:,.2f}")
print(f"Standard deviation: ${std_earnings:,.2f}")
print(f"Standard error: ${se_earnings:,.2f}")
# =============================================================================
# STEP 3: t-distribution vs standard normal
# =============================================================================
# When sigma is unknown (estimated by s), we use the t-distribution instead of
# the normal. It has fatter tails — more probability in the extremes — but
# approaches the normal as n grows (the "n >= 30" rule of thumb).
x = np.linspace(-4, 4, 200)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x, stats.norm.pdf(x), '--', linewidth=2, label='Standard Normal')
ax.plot(x, stats.t.pdf(x, df=4), linewidth=2, label='t(4) — fatter tails')
ax.plot(x, stats.t.pdf(x, df=30), linewidth=2, label='t(30) — nearly normal')
ax.set_xlabel('t value')
ax.set_ylabel('Density')
ax.set_title('t-Distribution Approaches Normal as Degrees of Freedom Increase')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# =============================================================================
# STEP 4: Confidence interval — a range of plausible values for mu
# =============================================================================
# A 95% CI means: if we repeatedly sampled and built CIs, about 95% would
# contain the true mu. Formula: x-bar +/- t_crit * SE
alpha = 0.05
t_crit = stats.t.ppf(1 - alpha / 2, n - 1) # critical value from t(n-1)
margin = t_crit * se_earnings
ci_lower = mean_earnings - margin
ci_upper = mean_earnings + margin
print(f"Critical value (t_170, 0.025): {t_crit:.4f}")
print(f"Margin of error: ${margin:,.2f}")
print(f"95% CI: [${ci_lower:,.2f}, ${ci_upper:,.2f}]")
# Compare 90%, 95%, and 99% — higher confidence requires wider intervals
print(f"\n{'Level':<8} {'Lower':>12} {'Upper':>12} {'Width':>10}")
print("-" * 44)
for conf in [0.90, 0.95, 0.99]:
tc = stats.t.ppf(1 - (1 - conf) / 2, n - 1)
lo = mean_earnings - tc * se_earnings
hi = mean_earnings + tc * se_earnings
print(f"{conf*100:.0f}%{lo:>14,.2f}{hi:>14,.2f}{hi - lo:>12,.2f}")
# =============================================================================
# STEP 5: Two-sided hypothesis test — H0: mu = $40,000
# =============================================================================
# The t-statistic measures how many standard errors the estimate is from the
# hypothesized value. The p-value is the probability of a result at least as
# extreme as ours if H0 were true.
mu0 = 40000
t_stat = (mean_earnings - mu0) / se_earnings
p_val = 2 * (1 - stats.t.cdf(abs(t_stat), n - 1)) # two-sided p-value
print(f"H0: mu = ${mu0:,} vs Ha: mu != ${mu0:,}")
print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_val:.4f}")
print(f"Decision: {'Reject H0' if p_val < 0.05 else 'Do not reject H0'} at alpha = 0.05")
# One-sided (upper-tailed): H0: mu <= 40000 vs Ha: mu > 40000
p_val_one = 1 - stats.t.cdf(t_stat, n - 1) # upper tail only
t_crit_one = stats.t.ppf(0.95, n - 1) # one-sided critical value
print(f"\nOne-sided (upper): p-value = {p_val_one:.4f} (= two-sided / 2)")
print(f"One-sided critical value: {t_crit_one:.4f} (vs two-sided {t_crit:.4f})")
# =============================================================================
# STEP 6: Statistical vs practical significance — gasoline prices
# =============================================================================
# A $0.14 price gap looks small, but the SE is tiny — so the t-statistic is
# huge and p < 0.0001. Statistical significance says "the difference is real";
# practical significance asks "does it matter?"
url_gas = "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_GASPRICE.DTA"
data_gas = pd.read_stata(url_gas)
price = data_gas['price']
n_gas = len(price)
mean_gas = price.mean()
se_gas = price.std(ddof=1) / np.sqrt(n_gas)
mu0_gas = 3.81 # CA state average
t_gas = (mean_gas - mu0_gas) / se_gas
p_gas = 2 * (1 - stats.t.cdf(abs(t_gas), n_gas - 1))
print(f"Gas price test: H0: mu = ${mu0_gas}")
print(f"Sample mean: ${mean_gas:.4f} SE: ${se_gas:.4f}")
print(f"t = {t_gas:.4f}, p = {p_gas:.6f}")
print(f"Decision: {'Reject H0' if p_gas < 0.05 else 'Do not reject H0'}")
print(f"Practical: ${mu0_gas - mean_gas:.2f}/gallon * 15 gal = ${(mu0_gas - mean_gas)*15:.2f} per tank")
# =============================================================================
# STEP 7: Inference for proportions — binary (0/1) data
# =============================================================================
# Proportions are just means of 0/1 variables. The same CI and hypothesis-test
# logic applies — only the SE formula changes: sqrt(p_hat*(1-p_hat)/n).
n_voters = 921
n_dem = 480
p_hat = n_dem / n_voters
se_prop = np.sqrt(p_hat * (1 - p_hat) / n_voters)
# 95% confidence interval (use z for large-sample proportions)
ci_lo = p_hat - 1.96 * se_prop
ci_hi = p_hat + 1.96 * se_prop
# Hypothesis test: H0: p = 0.50
p0 = 0.50
se_h0 = np.sqrt(p0 * (1 - p0) / n_voters)
z_stat = (p_hat - p0) / se_h0
p_val_z = 2 * (1 - stats.norm.cdf(abs(z_stat)))
print(f"Sample proportion: {p_hat:.4f} ({p_hat*100:.1f}%)")
print(f"SE: {se_prop:.4f}")
print(f"95% CI: [{ci_lo:.4f}, {ci_hi:.4f}]")
print(f"\nH0: p = {p0} z = {z_stat:.4f} p-value = {p_val_z:.4f}")
print(f"Decision: {'Reject H0' if abs(z_stat) > 1.96 else 'Do not reject H0'}")
* =============================================================================
* CHAPTER 4 CHEAT SHEET: Statistical Inference for the Mean
* =============================================================================
* --- Setup ---
clear all // start with a clean workspace
set more off // do not pause output for long results
* =============================================================================
* STEP 1: Load data directly from a URL
* =============================================================================
* use loads a Stata .dta file; "clear" drops any data already in memory
* This sample has 171 female workers from the AED_EARNINGS dataset
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_EARNINGS.DTA", clear
describe // list all variables, types, and labels
display "Observations: " _N // _N is Stata's built-in observation count
* =============================================================================
* STEP 2: Sample statistics and standard error
* =============================================================================
* The standard error measures the precision of the sample mean as an estimate
* of the population mean. It shrinks with sqrt(n), not n — so quadrupling the
* sample size is needed to halve the SE.
summarize earnings
// After summarize, Stata stores results in r() that you can reference:
display "Sample mean: $" %10.2fc r(mean)
display "Standard deviation: $" %10.2fc r(sd)
display "Standard error: $" %10.2fc r(sd) / sqrt(r(N))
* =============================================================================
* STEP 3: t-distribution vs standard normal
* =============================================================================
* When sigma is unknown (estimated by s), we use the t-distribution instead of
* the normal. It has fatter tails — more probability in the extremes — but
* approaches the normal as n grows (the "n >= 30" rule of thumb).
// Generate a grid of t values and plot the densities
clear
set obs 200
gen t = -4 + (_n - 1) * 8 / 199
// tden(df, t) gives the t-density; normalden(t) gives the standard normal
gen pdf_normal = normalden(t)
gen pdf_t4 = tden(4, t)
gen pdf_t30 = tden(30, t)
twoway (line pdf_normal t, lpattern(dash) lwidth(medthick)) ///
(line pdf_t4 t, lwidth(medthick)) ///
(line pdf_t30 t, lwidth(medthick)), ///
xtitle("t value") ytitle("Density") ///
title("t-Distribution Approaches Normal as Degrees of Freedom Increase") ///
legend(order(1 "Standard Normal" 2 "t(4) — fatter tails" ///
3 "t(30) — nearly normal"))
* =============================================================================
* STEP 4: Confidence interval — a range of plausible values for mu
* =============================================================================
* A 95% CI means: if we repeatedly sampled and built CIs, about 95% would
* contain the true mu. Formula: x-bar +/- t_crit * SE
// Reload the earnings data
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_EARNINGS.DTA", clear
// ci means computes confidence intervals directly
ci means earnings, level(95)
// Compare 90%, 95%, and 99% — higher confidence requires wider intervals
ci means earnings, level(90)
ci means earnings, level(95)
ci means earnings, level(99)
// Manual calculation to see the components
summarize earnings
local n = r(N)
local xbar = r(mean)
local se = r(sd) / sqrt(r(N))
local tcrit = invttail(`n' - 1, 0.025) // two-tailed critical value
local margin = `tcrit' * `se'
display "Critical value (t_170, 0.025): " %8.4f `tcrit'
display "Margin of error: $" %10.2fc `margin'
display "95% CI: [$" %10.2fc `xbar' - `margin' ", $" %10.2fc `xbar' + `margin' "]"
* =============================================================================
* STEP 5: Two-sided hypothesis test — H0: mu = $40,000
* =============================================================================
* The t-statistic measures how many standard errors the estimate is from the
* hypothesized value. The p-value is the probability of a result at least as
* extreme as ours if H0 were true.
// ttest performs the hypothesis test in one command
ttest earnings == 40000
// Stata displays: t-statistic, degrees of freedom, and p-values for
// Ha: mu != 40000 (two-sided), Ha: mu < 40000, and Ha: mu > 40000
// The stored results can be accessed:
display "t-statistic: " %8.4f r(t)
display "p-value (two-sided): " %8.4f r(p)
// One-sided (upper-tailed): H0: mu <= 40000 vs Ha: mu > 40000
// The one-sided p-value is half the two-sided p-value (for the correct tail)
display "p-value (one-sided, upper): " %8.4f r(p) / 2
* =============================================================================
* STEP 6: Statistical vs practical significance — gasoline prices
* =============================================================================
* A $0.14 price gap looks small, but the SE is tiny — so the t-statistic is
* huge and p < 0.0001. Statistical significance says "the difference is real";
* practical significance asks "does it matter?"
use "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_GASPRICE.DTA", clear
summarize price
display "Sample mean: $" %6.4f r(mean) " SE: $" %6.4f r(sd) / sqrt(r(N))
// Test against the CA state average of $3.81
ttest price == 3.81
display "Decision: " cond(r(p) < 0.05, "Reject H0", "Do not reject H0")
// Practical significance: how much does the difference cost per tank?
local diff = 3.81 - r(mu_1)
display "Practical: $" %5.2f `diff' "/gallon * 15 gal = $" %5.2f `diff' * 15 " per tank"
* =============================================================================
* STEP 7: Inference for proportions — binary (0/1) data
* =============================================================================
* Proportions are just means of 0/1 variables. The same CI and hypothesis-test
* logic applies — only the SE formula changes: sqrt(p_hat*(1-p_hat)/n).
// Create a dataset with 921 voters: 480 Democrat (1), 441 other (0)
clear
set obs 921
gen democrat = (_n <= 480)
// Compute the sample proportion and confidence interval
ci proportions democrat
// Hypothesis test: H0: p = 0.50
prtest democrat == 0.50
// Manual calculation to see the components
summarize democrat
local p_hat = r(mean)
local n = r(N)
local se_p = sqrt(`p_hat' * (1 - `p_hat') / `n')
local se_h0 = sqrt(0.50 * 0.50 / `n')
local z = (`p_hat' - 0.50) / `se_h0'
local pval = 2 * (1 - normal(abs(`z')))
display "Sample proportion: " %6.4f `p_hat' " (" %4.1f `p_hat' * 100 "%)"
display "SE: " %6.4f `se_p'
display "95% CI: [" %6.4f `p_hat' - 1.96 * `se_p' ", " %6.4f `p_hat' + 1.96 * `se_p' "]"
display "H0: p = 0.50 z = " %6.4f `z' " p-value = " %6.4f `pval'
display "Decision: " cond(abs(`z') > 1.96, "Reject H0", "Do not reject H0")
Paste into your Stata do-file editor
# =============================================================================
# CHAPTER 4 CHEAT SHEET: Statistical Inference for the Mean
# =============================================================================
# --- Libraries ---
library(haven) # read Stata .dta files directly from URLs
library(ggplot2) # grammar of graphics for all plots
# =============================================================================
# STEP 1: Load data directly from a URL
# =============================================================================
# read_dta() reads Stata .dta files — this sample has 171 female workers
url <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_EARNINGS.DTA"
data_earnings <- read_dta(url)
earnings <- data_earnings$earnings
cat("Dataset:", length(earnings), "observations\n")
# =============================================================================
# STEP 2: Sample statistics and standard error
# =============================================================================
# The standard error measures the precision of the sample mean as an estimate
# of the population mean. It shrinks with sqrt(n), not n — so quadrupling the
# sample size is needed to halve the SE.
n <- length(earnings)
mean_earnings <- mean(earnings)
std_earnings <- sd(earnings) # sd() uses n-1 denominator (sample SD)
se_earnings <- std_earnings / sqrt(n) # standard error = s / sqrt(n)
cat("Sample mean: $", round(mean_earnings, 2), "\n")
cat("Standard deviation:$", round(std_earnings, 2), "\n")
cat("Standard error: $", round(se_earnings, 2), "\n")
# =============================================================================
# STEP 3: t-distribution vs standard normal
# =============================================================================
# When sigma is unknown (estimated by s), we use the t-distribution instead of
# the normal. It has fatter tails but approaches the normal as n grows.
x <- seq(-4, 4, length.out = 200)
ggplot(data.frame(x = x), aes(x = x)) +
stat_function(fun = dnorm, linetype = "dashed", linewidth = 1.2,
aes(color = "Standard Normal")) +
stat_function(fun = dt, args = list(df = 4), linewidth = 1.2,
aes(color = "t(4) — fatter tails")) +
stat_function(fun = dt, args = list(df = 30), linewidth = 1.2,
aes(color = "t(30) — nearly normal")) +
labs(x = "t value", y = "Density",
title = "t-Distribution Approaches Normal as Degrees of Freedom Increase",
color = NULL) +
theme_minimal() +
theme(legend.position = "bottom")
# =============================================================================
# STEP 4: Confidence interval — a range of plausible values for mu
# =============================================================================
# A 95% CI means: if we repeatedly sampled and built CIs, about 95% would
# contain the true mu. Formula: x-bar +/- t_crit * SE
alpha <- 0.05
t_crit <- qt(1 - alpha/2, df = n - 1) # critical value from t(n-1)
margin <- t_crit * se_earnings
ci_lower <- mean_earnings - margin
ci_upper <- mean_earnings + margin
cat("Critical t-value (t_170, 0.025):", round(t_crit, 4), "\n")
cat("Margin of error: $", round(margin, 2), "\n")
cat("95% CI: [$", round(ci_lower, 2), ", $", round(ci_upper, 2), "]\n")
# Compare 90%, 95%, and 99% — higher confidence requires wider intervals
for (conf in c(0.90, 0.95, 0.99)) {
tc <- qt(1 - (1 - conf)/2, df = n - 1)
lo <- mean_earnings - tc * se_earnings
hi <- mean_earnings + tc * se_earnings
cat(sprintf("%3.0f%% CI: [$%.2f, $%.2f] width = $%.2f\n",
conf*100, lo, hi, hi - lo))
}
# =============================================================================
# STEP 5: Two-sided hypothesis test — H0: mu = $40,000
# =============================================================================
# The t-statistic measures how many standard errors the estimate is from the
# hypothesized value. The p-value is the probability of a result at least as
# extreme as ours if H0 were true.
mu0 <- 40000
t_stat <- (mean_earnings - mu0) / se_earnings
p_val <- 2 * pt(-abs(t_stat), df = n - 1) # two-sided p-value
cat("\nH0: mu = $40,000 vs Ha: mu != $40,000\n")
cat("t-statistic:", round(t_stat, 4), "\n")
cat("p-value: ", round(p_val, 4), "\n")
cat("Decision: ", ifelse(p_val < 0.05, "Reject H0", "Do not reject H0"),
"at alpha = 0.05\n")
# t.test() does all of this in one call:
t.test(earnings, mu = 40000)
# =============================================================================
# STEP 6: Statistical vs practical significance — gasoline prices
# =============================================================================
url_gas <- "https://raw.githubusercontent.com/quarcs-lab/data-open/master/AED/AED_GASPRICE.DTA"
data_gas <- read_dta(url_gas)
price <- data_gas$price
# Test against the CA state average of $3.81
result_gas <- t.test(price, mu = 3.81)
cat("\nGas price test: t =", round(result_gas$statistic, 4),
", p =", round(result_gas$p.value, 6), "\n")
cat("Practical: $", round(3.81 - mean(price), 2), "/gallon x 15 gal = $",
round((3.81 - mean(price)) * 15, 2), "per tank\n")
# =============================================================================
# STEP 7: Inference for proportions — binary (0/1) data
# =============================================================================
# Proportions are just means of 0/1 variables. The same CI and hypothesis-test
# logic applies — only the SE formula changes: sqrt(p_hat*(1-p_hat)/n).
n_voters <- 921
n_dem <- 480
p_hat <- n_dem / n_voters
# prop.test() gives CI and hypothesis test in one call
result_prop <- prop.test(n_dem, n_voters, p = 0.50, correct = FALSE)
cat("\nSample proportion:", round(p_hat, 4), "(", round(p_hat*100, 1), "%)\n")
cat("95% CI: [", round(result_prop$conf.int[1], 4), ",",
round(result_prop$conf.int[2], 4), "]\n")
cat("H0: p = 0.50 p-value =", round(result_prop$p.value, 4), "\n")
cat("Decision:", ifelse(result_prop$p.value < 0.05,
"Reject H0", "Do not reject H0"), "\n")
Paste into your R console or RStudio