ECON 320 Lab Exercise: Week 3 — OLS & SLR Assumptions¶

  • Name : [Your Name]
  • Lab Section: [Your Lab Section]
  • Attendance Code (Week 3): [Enter code]

Submit this notebook as an HTML/PDF on Canvas.¶


🧪 Connection to Lecture¶

This lab mirrors Part C of the lecture:

  • First, you will recreate the baseline DGP (where all four assumptions SLR.1–SLR.4 hold).
  • Next, you will violate the assumptions one by one to see what happens.

👉 Only the experiment on SLR.1 (linearity in parameters) is required.
The other violations (SLR.2–SLR.4) are optional but strongly encouraged, since they show why the assumptions really matter.


🎯 Learning Goals¶

By the end of this lab, you will be able to:

  1. Understand the intuition behind assumptions SLR.1–SLR.4.
  2. Build a DGP that satisfies the assumptions and verify Theorem 2.1 (unbiasedness) numerically.
  3. Create targeted violations (nonlinearity, non-random sampling, no variation in $x$, omitted variable) and diagnose the impact on OLS.

🏁 Warm-up: Baseline DGP — when SLR.1–SLR.4 hold¶

Assumptions checklist:

  • SLR.1 Linear in parameters ✅
  • SLR.2 Random sampling ✅
  • SLR.3 Variation in $x$ ✅
  • SLR.4 Zero conditional mean ✅

What you should see:

  • The average of the estimates should be very close to the true $\beta_1$ (unbiasedness).

The following code cell simulates $B=1000$ datasets, each with $n=200$ observations, from the DGP: $$ y = \beta_0 + \beta_1 x + u $$ where $\beta_0 = 2$, $\beta_1 = 1.5$, and $u \sim N(0, \sigma^2)$ with $\sigma = 2$. Run the cell to verify Theorem 2.1 numerically.

In [16]:
# **************** You don't need to modify anything in this cell.*****************
# **************** Just understand what it does. It will be useful later.*****************

# Ka Yan defined the following function to calculate the OLS estimates
def ols_ky_manual(y, x):

    # Number of observations
    n = len(y)

    # Calculate means
    x_mean = x.mean()
    y_mean = y.mean()
    
    # Calculate OLS estimates
    beta1_hat = ((x - x_mean) * (y - y_mean)).sum() / ((x - x_mean)**2).sum()
    beta0_hat = y_mean - beta1_hat * x_mean
    
    return beta0_hat, beta1_hat
In [17]:
# **************** You don't need to modify anything in this cell.*****************
# **************** Just understand what it does. It will be useful later.*****************

# Q0. DGP for baseline linear regression model
import numpy as np
import numpy.random as rng
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt 

# Setting 1: number of random samples
B = 1000 

# Setting 2: sample size per dataset
n = 200 

# Setting 3: true parameters
beta0_true, beta1_true = 2.0, 1.5
sigma = 2.0

# Create an empty list to store slope estimates
b1_estimates = []

# Simulate B datasets
# Repeat B times: generate data, fit OLS, store slope estimate
for _ in range(B):

    # x is randomly sampled from uniform(-3, 3)
    x = rng.uniform(-3, 3, size=n)

    # u is randomly sampled from normal(0, sigma)
    u = rng.normal(0, sigma, size=n)

    # y is generated from the DGP
    y = beta0_true + beta1_true*x + u

    # Fit OLS and store the slope estimate
    b0, b1 = ols_ky_manual(y, x)

    # Append the slope estimate to the list
    b1_estimates.append(b1)

print(f"True β1 = {beta1_true:.2f}")
print(f"Mean of estimated slopes over {B} samples = {np.mean(b1_estimates):.3f}")
True β1 = 1.50
Mean of estimated slopes over 1000 samples = 1.504
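
If you want to see the whole sampling distribution of $\hat\beta_1$ rather than just its mean, the optional cell below is a minimal sketch (it assumes the simulation cell above has just been run, so b1_estimates and beta1_true are still in memory) that plots a histogram of the stored slope estimates:

In [ ]:
# Optional: look at the whole sampling distribution of the slope, not just its mean.
# (Assumes b1_estimates and beta1_true from the cell above are in memory.)
import numpy as np
import matplotlib.pyplot as plt

plt.figure(figsize=(6, 4))
plt.hist(b1_estimates, bins=30, alpha=0.7)
plt.axvline(beta1_true, color="red", linestyle="--", label=f"True β1 = {beta1_true}")
plt.axvline(np.mean(b1_estimates), color="black", label="Mean of estimates")
plt.xlabel("Estimated slope")
plt.ylabel("Count")
plt.title("Sampling distribution of the OLS slope (SLR.1–SLR.4 hold)")
plt.legend()
plt.show()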

Tasks (2 required questions in total)¶

❓ Q1. (Required) Define a function to calculate the OLS estimate¶

Write your own function ols_manual_week3lab(x, y) to compute the OLS estimates $$ \hat\beta_1 = \frac{\sum (x_i - \bar x)(y_i - \bar y)}{\sum (x_i - \bar x)^2}, \qquad \hat\beta_0 = \bar y - \hat\beta_1 \bar x. $$

  1. Apply your function to the study-hours dataset
    $$x=(2,3,5,4,6),\quad y=(50,60,80,70,90).$$
  2. Verify your result with statsmodels.
In [18]:
# Q1. Put your answer here

# Step 1. Define your own function to calculate the OLS estimate

# Step 2. Apply your function to the study hour and test score data

# Step 3. Compare your results with statsmodels' results
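
Hint for Step 3 (optional): the cell below sketches only the statsmodels side of the comparison, using the same sm.add_constant / sm.OLS pattern that appears later in this lab. Writing ols_manual_week3lab and comparing the two sets of estimates is still up to you.

In [ ]:
# Optional hint for Step 3 only: the statsmodels side of the check.
import numpy as np
import statsmodels.api as sm

# Study-hours data from the question
x_hours = np.array([2, 3, 5, 4, 6])
y_scores = np.array([50, 60, 80, 70, 90])

# statsmodels needs an explicit intercept column
X_sm = sm.add_constant(x_hours)
results = sm.OLS(y_scores, X_sm).fit()
print("statsmodels (beta0_hat, beta1_hat):", results.params)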

1️⃣ Violate SLR.1 — Nonlinearity (wrong functional form)¶

SLR.1 — Linear in parameters
Model is linear in the unknown coefficients:
$$ y_i = \beta_0 + \beta_1 x_i + u_i $$
Meaning: We’re fitting a model that is linear in the parameters; even if we later add regressors like $x^2$, the model stays linear in the betas (see the short sketch below).
If it fails: The line you fit is the wrong shape → systematic patterns left in residuals.
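
To make "linear in parameters" concrete, the cell below is a short optional sketch (not part of the required tasks; the coefficient values are arbitrary) showing that a model containing an $x^2$ regressor is still linear in $\beta_0, \beta_1, \beta_2$, so plain OLS estimates it without any trouble:

In [ ]:
# Optional sketch: adding x^2 as a regressor keeps the model linear in the betas.
import numpy as np
import statsmodels.api as sm

np.random.seed(0)
n_demo = 500
x_demo = np.random.uniform(-3, 3, size=n_demo)
u_demo = np.random.normal(0, 1, size=n_demo)
y_demo = 1.0 + 2.0*x_demo - 0.5*x_demo**2 + u_demo   # linear in (beta0, beta1, beta2)

# Regress y on x and x^2: this is still ordinary OLS
X_demo = sm.add_constant(np.column_stack([x_demo, x_demo**2]))
print(sm.OLS(y_demo, X_demo).fit().params)   # should be close to (1.0, 2.0, -0.5)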

How we'll break it: For example, the true DGP follows a quadratic relation, $$ y = \beta_0 + \beta_1 x + \beta_2 x^2 + u $$ but we estimate a linear model: $$ y = \beta_0 + \beta_1 x + u $$

Expect: The OLS estimate of $\beta_1$ will be biased.


❓ Q2. (Required) Simulate data and estimate OLS under nonlinearity¶

  1. Simulate $B=1000$ datasets, each with $n=200$ observations, from the nonlinear DGP: $$ y = \beta_0 + \beta_1 x + \beta_2 x^2 + u $$ where $\beta_0 = 2$, $\beta_1 = 1.5$, $\beta_2 = 0.5$, and $u \sim N(0, \sigma^2)$ with $\sigma = 2$.

  2. For each dataset, use the ols_manual_week3lab(x, y) function you defined in Q1 to estimate the slope coefficient $\hat{\beta}_1$ from the misspecified linear model:

    $$ y = \beta_0 + \beta_1 x + u $$

  3. Calculate and print the average of the estimated slope coefficients $\hat{\beta}_1$ across all datasets.

In [19]:
# Q2. Put your answer here


# Step 1. Simulate B=1000 datasets, each with n=200 observations, from the nonlinear DGP
# y = β0 + β1*x + β2*x^2 + u
# with β0=2, β1=1.5, β2=0.5, and u~N(0, 2^2)

# Step 2. Create an empty list to store the slope estimates from each dataset

# Step 3. For each dataset, fit a linear regression of y on x (ignoring the x^2 term) and store the slope estimate

# Step 4. Calculate and report the mean of the slope estimates across the 1000 datasets

# Step 5. Compare the mean slope estimate to the true β1=1.5

(Optional) Demonstrations of other violations of SLR assumptions (SLR.2–SLR.4)¶

You can choose to do any or all of the following experiments. They are completely optional but strongly encouraged, since they show why the assumptions really matter.


2️⃣ Violate SLR.2 — Non-random sampling¶

SLR.2 — Random sampling
The pairs ($x_i, y_i$) are a random sample from the population.

Meaning: Observations are collected the same way and don’t depend on each other in a systematic way.

Ways it can fail: selection/oversampling, dependence (e.g., clustering).

How we'll break it: Selection on low $x$ values (e.g., only non-smokers report their smoking habits in a health survey).

Expect: The OLS estimate of $\beta_1$ will be biased.

In [ ]:
# **************** You don't need to modify anything in this cell. *****************
# 🔎 Mini demo — Violate SLR.2 — Non-random sampling (select only low-x values)

np.random.seed(320)

# True DGP (all assumptions hold)
n = 500
beta0, beta1 = 1.0, 2.0
x = np.random.uniform(0, 10, n)
u = np.random.normal(0, 1, n)
y = beta0 + beta1*x + u

# OLS with full random sample
b0_full, b1_full = ols_ky_manual(y, x)

# --- Break SLR.2: select only low-x observations ---
mask = x < 1       # only people who smoke less than 1 pack/day report their smoking habits
x_biased = x[mask]
y_biased = y[mask]

b0_bias, b1_bias = ols_ky_manual(y_biased, x_biased)

print(f"True β1 = {beta1}")
print(f"OLS slope (full random sample): b1 = {b1_full:.3f}")
print(f"OLS slope (biased sample, only x<3): b1 = {b1_bias:.3f}")

# Visualize
plt.figure(figsize=(6,4))
plt.scatter(x, y, s=12, alpha=0.4, label="Original data")
plt.scatter(x_biased, y_biased, s=40, alpha=0.7, color="red", label="Selected sample (x<1)")
xs = np.linspace(0, 10, 100)
plt.plot(xs, b0_full + b1_full*xs, color="blue", label="Full-sample OLS")
plt.plot(xs, b0_bias + b1_bias*xs, color="red", linestyle="--", label="Biased-sample OLS")
plt.legend()
plt.title("SLR.2 violation: non-random (biased) sampling")
plt.xlabel("x"); plt.ylabel("y")
plt.show()
True β1 = 2.0
OLS slope (full random sample): b1 = 2.002
OLS slope (biased sample, only x<1): b1 = 1.707
[Figure: scatter of the full sample and the selected low-x subsample (red), with the full-sample OLS line and the dashed biased-sample OLS line]

3️⃣ Violate SLR.3 — No variation in $x$¶

Assumption (SLR.3): There is sample variation in $x$ (i.e., $x_i \neq x_j$ for some $i \neq j$).

Meaning: There is variation in $x$, so a slope can actually be estimated.

If it fails: The slope is undefined. Think of the slope formula $\hat{\beta}_1 = \frac{\text{Cov}(x,y)}{\text{Var}(x)}$: if $\text{Var}(x) = 0$, we would be dividing by zero.

How we'll break it: Make $x$ constant for everyone.

Expect: The OLS estimate of $\beta_1$ will be undefined (if perfectly constant) or very imprecise (if nearly constant).

In [31]:
# **************** You don't need to modify anything in this cell. *****************
# 🔎 Mini demo — violate SLR.3 with no variation in x

n = 100
x = np.ones(n)  # ❌ No variation in x (all values are the same, which is 1)
u = np.random.normal(0, 1.0, size=n)
y = 2.0 + 3.0*x + u

b0, b1 = ols_ky_manual(y, x)
print(f"Estimated coefficients: b0={b0}, b1={b1}")
print("********Notice: b1 is NaN (undefined) because Var(x)=0 → cannot divide by zero in slope formula.********")
Estimated coefficients: b0=nan, b1=nan
********Notice: b1 is NaN (undefined) because Var(x)=0 → cannot divide by zero in slope formula.********
/var/folders/c7/4b0dlrp54sj1y71v5yq8xcw00000gn/T/ipykernel_36343/3817906853.py:15: RuntimeWarning: invalid value encountered in scalar divide
  beta1_hat = ((x - x_mean) * (y - y_mean)).sum() / ((x - x_mean)**2).sum()
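
The demo above covers the perfectly constant case. The "very imprecise (if nearly constant)" part of the Expect note is easy to check as well; the cell below is a minimal optional sketch (the numbers are arbitrary, and it reuses ols_ky_manual from above):

In [ ]:
# Optional sketch: x is *nearly* constant, so Var(x) is tiny and the slope is
# defined but extremely imprecise. (Uses ols_ky_manual defined earlier.)
import numpy as np

n = 100
x_near = 1 + np.random.normal(0, 0.01, size=n)   # almost no variation in x

for _ in range(3):
    u = np.random.normal(0, 1.0, size=n)
    y = 2.0 + 3.0*x_near + u
    b0_near, b1_near = ols_ky_manual(y, x_near)
    print(f"Estimated slope b1 = {b1_near:8.2f}   (true slope is 3.0)")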

4️⃣ Violate SLR.4 — Exogeneity fails (omitted variable)¶

Assumption (SLR.4): $E[u \mid x] = 0$
The error term has zero mean, even after conditioning on $x$.

How we'll break it: Add an omitted variable $z$ that:

  • Affects $y$ directly, and
  • Is correlated with $x$.

Intuitive example:
Suppose we want to estimate the effect of education ($x$) on wages ($y$).
But ability ($z$) also matters: higher-ability people earn more and tend to get more education.
If we regress wages on education without controlling for ability, OLS attributes too much of the wage gap to education.
👉 That makes $\hat\beta_1$ biased upward.

Expect: Even with large samples, the OLS slope on $x$ will not converge to the true $\beta_1$ — it will be systematically off.

We’ll revisit omitted variable bias more formally in a later lecture, where we’ll derive the bias formula and see how it depends on $\text{Cov}(x,z)$.

In [ ]:
# **************** You don't need to modify anything in this cell. *****************
# 🔎 Mini demo — violate SLR.4 with omitted variable (education & wages example)

np.random.seed(320)

n = 500
beta0, beta1, beta2 = 10.0, 2.0, 5.0   # true model: wage = 10 + 2*educ + 5*ability + u

# Simulate data
ability = np.random.normal(0, 1, n)                # latent ability z
education = 12 + 2*ability + np.random.normal(0, 1, n)  # education correlated with ability
u = np.random.normal(0, 2, n)
wage = beta0 + beta1*education + beta2*ability + u

# Case 1: Full model (correct specification, includes ability)
X_full = sm.add_constant(np.column_stack([education, ability]))
model_full = sm.OLS(wage, X_full).fit()

# Case 2: Omitted variable model (only education, omit ability)
X_omit = sm.add_constant(education)
model_omit = sm.OLS(wage, X_omit).fit()

print("True β1 (education effect) =", beta1)
print(f"Estimated β1 (full model, controls for ability): {model_full.params[1]:.3f}")
print(f"Estimated β1 (omit ability): {model_omit.params[1]:.3f}")

print("\nNotice: When ability is omitted, β1 is biased upward, "
      "since ability is positively correlated with education.")

✅ Wrap-up¶

  • When SLR.1–SLR.4 hold, OLS is unbiased (Theorem 2.1).
  • Each assumption is important; violating any one of them can lead to biased or unreliable estimates.

End of Lab Exercise.