Lecture 3: OLS Estimator for Simple Linear Regression (SLR)

(Completed version)

Overview

  • Review the OLS estimator for the simple linear regression (SLR) model.

  • OLS as the "best-fit" line (minimizes squared errors).

  • Manual OLS vs. package OLS.

  • Numerical evidence for unbiasedness of OLS via simulation.


📝 Model vs. Estimator: SLR vs. OLS

  • Model (SLR):
    The theoretical relationship we assume between variables:
    $$ y_i = \beta_0 + \beta_1 x_i + u_i,\quad E[u_i \mid x_i] = 0. $$
    where $(\beta_0, \beta_1)$ are unknown population parameters.

  • Example: $y$ = test score, $x$ = hours studied, $u$ = other factors (sleep, talent, luck).

  • How do we know if this is true? We don't know for certain, but we choose and defend a model as a useful way to describe how $y$ and $x$ are related.

  • Model specification matters: deciding which variables to include and in what form relies on economic reasoning, prior research, and patterns in the data. The goal is to capture the key relationship as closely as possible.

    • ⚠️ A poorly specified model can lead to biased or misleading estimates, even if OLS is correctly applied. We will discuss this more in future lectures.
  • ⚠️ Even with a thoughtful specification, OLS only delivers good results if the SLR assumptions also hold; we'll turn to these later in the lecture.


  • Estimator & OLS Estimator:
    An estimator is a rule (or formula) that turns sample data into guesses for the unknown parameters.

    • The OLS estimator chooses $(\hat\beta_0, \hat\beta_1)$ to minimize the average squared gap between the observed $y_i$ and the fitted line.
    • ๐Ÿ” In plain words: among all possible lines, OLS picks the one that โ€œfits the data bestโ€ in squared-error terms.
    • Many other estimators exist, but in this class we focus on OLS (which has nice properties under some stated SLR assumptions).

  • Key distinction:
    • 📊 The model (SLR) is the assumption about how the world works: what we believe links $x$ and $y$. [Target]
    • 🛠️ The estimator (OLS) is the rule we apply to data to come up with numbers for those unknown parameters. It's our [Attempt / Guess] at the target.

📦 Required libraries

  • numpy : for numerical operations.

    • numpy.random : for random number generation.
  • pandas : for data manipulation.

  • statsmodels : for statistical modeling.

  • matplotlib : for plotting.

In [1]:
# Let's install and import the required libraries together!
!pip install numpy pandas statsmodels matplotlib --quiet

import numpy as np
import numpy.random as rng
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt 

Part A - OLS Refresher

  • Model $$ y_i = \beta_0 + \beta_1 x_i + u_i,\quad E[u_i \mid x_i] = 0. $$

  • OLS idea:
    Given the data, think of every possible straight line you could draw โ€” each corresponds to a pair $(b_0, b_1)$.
    OLS then chooses $(\hat\beta_0, \hat\beta_1)$, the line that makes the average squared errors as small as possible: $$ (\hat\beta_0, \hat\beta_1) = \arg\min_{b_0, b_1} \; \frac{1}{n}\sum_{i=1}^n \big(y_i - b_0 - b_1 x_i\big)^2. $$

  • Closed-form solution (simple linear regression):

    • Slope: $$ \hat\beta_1 \;=\; \frac{\sum (x_i-\bar x)(y_i-\bar y)}{\sum (x_i-\bar x)^2} \;=\; \frac{\text{Cov}(x,y)}{\text{Var}(x)}. $$

    • Intercept: $$ \hat\beta_0 \;=\; \bar y - \hat\beta_1 \,\bar x. $$

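A quick numerical check of the Cov/Var form of the slope (a minimal sketch, not one of the original cells; the arrays are the five-student example from the next section, and note that np.cov uses the n-1 denominator by default, so np.var needs ddof=1 for the ratio to match):

In [ ]:
# Sketch: the slope formula equals sample Cov(x, y) / Var(x)
import numpy as np

x = np.array([2.0, 3.0, 5.0, 4.0, 6.0])      # illustrative data (table below)
y = np.array([50.0, 60.0, 80.0, 70.0, 90.0])

# Slope via the sums-of-deviations formula
slope_sums = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean()) ** 2)

# Slope via sample covariance / sample variance (ddof=1 to match np.cov)
slope_cov = np.cov(x, y)[0, 1] / np.var(x, ddof=1)

print(slope_sums, slope_cov)  # both equal 10.0 for this example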

Test it out: Manual OLS vs. Package OLS

The best way to see how these functions work is to test them on a super easy example.

  • Suppose we have data on 5 students' study hours ($x_i$) and test scores ($y_i$):

$$ \begin{array}{c|c|c} \text{Student (i)} & \text{Hours Studied }(x_i) & \text{Test Score }(y_i)\\ \hline 1 & 2 & 50\\ 2 & 3 & 60\\ 3 & 5 & 80\\ 4 & 4 & 70\\ 5 & 6 & 90\\ \end{array} $$

  • That means in this case we have: $$ x = \begin{bmatrix} 2 \\ 3 \\ 5 \\ 4 \\ 6 \end{bmatrix}, \qquad y = \begin{bmatrix} 50 \\ 60 \\ 80 \\ 70 \\ 90 \end{bmatrix}. $$

Manual OLS by hand (simple linear regression): $$ \hat\beta_1=\frac{\sum (x_i-\bar x)(y_i-\bar y)}{\sum (x_i-\bar x)^2},\qquad \hat\beta_0=\bar y-\hat\beta_1\bar x. $$

Compute: $$ \bar x=\frac{2+3+5+4+6}{5}=4,\quad \bar y=\frac{50+60+80+70+90}{5}=70. $$ $$ \hat\beta_1=\frac{\sum (x_i-\bar x)(y_i-\bar y)}{\sum (x_i-\bar x)^2}=\frac{100}{10}=10,\qquad \hat\beta_0=70-10\cdot 4=30. $$

Result: $\hat y = 30 + 10x$. (Here the points lie exactly on a line, so MSE = 0.)

In [2]:
# Manual OLS calculation
def ols_manual(y, x):

    # Calculate means
    xbar = np.mean(x) # or sum(x)/len(x)
    ybar = np.mean(y) # or sum(y)/len(y)

    # Calculate slope (b1) and intercept (b0)
    b1 = np.sum((x - xbar) * (y - ybar))/np.sum((x - xbar) ** 2) # or covariance(x,y)/variance(x) with np.cov and np.var
    b0 = ybar - b1 * xbar
    return b0, b1

# Package OLS calculation
def ols_sm(y, X):
    X = sm.add_constant(X)
    return sm.OLS(y, X).fit()

# Assign our Sample data
x = np.array([2, 3, 4, 5, 6])
y = np.array([50, 60, 80, 70, 90])

# Fit models on our sample data
b0_manual, b1_manual = ols_manual(y, x)
model_sm = ols_sm(y, x)
b0_sm, b1_sm = model_sm.params
# Display results
print(f"Manual OLS: Intercept = {b0_manual}, Slope = {b1_manual}")
print(f"Package OLS: Intercept = {b0_sm}, Slope = {b1_sm}")

# Oops, it doesn't look right. Let's debug it together!
Manual OLS: Intercept = 34.0, Slope = 9.0
Package OLS: Intercept = 33.99999999999998, Slope = 9.000000000000009
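One way to track down the discrepancy with the hand calculation (a debugging sketch, not one of the original cells): line up the (x, y) pairs we actually typed in against the table. Here x was entered in increasing order, so two pairs no longer match the table.

In [ ]:
# Debugging sketch: compare the entered pairs with the table
entered = pd.DataFrame({"x_entered": x, "y_entered": y})
table = pd.DataFrame({"x_table": [2, 3, 5, 4, 6], "y_table": [50, 60, 80, 70, 90]})
print(pd.concat([entered, table], axis=1))
# Rows 2 and 3 reveal the problem: x was typed as 4, 5 instead of 5, 4,
# so the pairs (4, 80) and (5, 70) do not match the table's (5, 80) and (4, 70).

The cells below keep the data exactly as entered, so their OLS line is $\hat y = 34 + 9x$ with MSE = 38; re-entering x as np.array([2, 3, 5, 4, 6]) would reproduce the hand-calculated line $\hat y = 30 + 10x$ with MSE = 0.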
In [3]:
# Manual OLS fit (already computed)
yhat_manual = b0_manual + b1_manual * x
resid_manual = y - yhat_manual
mse_manual = np.mean(resid_manual**2)
print(f"MSE (Manual OLS) = {mse_manual:.3f}")

# Plot with residuals
plt.scatter(x, y, color='blue', label='Data points')
plt.plot(x, yhat_manual, color='red', label='Manual OLS')

# draw residual lines
for xi, yi, yhat in zip(x, y, yhat_manual):
    plt.vlines(xi, yi, yhat, color='gray', linestyle=':', alpha=0.6)

plt.legend()
plt.title("Manual OLS fit with residuals")
plt.show()
MSE (Manual OLS) = 38.000
[Figure: Manual OLS fit with residuals]
In [4]:
# --- Try an alternative line ---

# Alternative line parameters
b0_alt = 4 
b1_alt = 12

# Calculate predictions, residuals, and MSE for alternative line
yhat_alt = b0_alt + b1_alt * x
resid_alt = y - yhat_alt
mse_alt = np.mean(resid_alt**2)
print(f"MSE (Alternative line) = {mse_alt:.3f}")

# Plot alternative line with residuals
plt.scatter(x, y, color='blue', label='Data points')
plt.plot(x, yhat_alt, color='orange', label='Alternative line')

# draw residual lines for alternative
for xi, yi, yhat in zip(x, y, yhat_alt):
    plt.vlines(xi, yi, yhat, color='gray', linestyle=':', alpha=0.6)

plt.legend()
plt.title("Alternative line with residuals")
plt.show()
MSE (Alternative line) = 380.000
[Figure: Alternative line with residuals]

📝 Try it yourself!

  • You can experiment by plugging in different values of $b_0$ and $b_1$ and checking how the mean squared error (MSE) changes.

  • The OLS estimator is simply the choice of $(b_0, b_1)$ that makes the MSE as small as possible.

  • Because of Assumption SLR.3 (sample variation in $x$), this solution is unique: there is exactly one line that minimizes the MSE.

You can try to challenge OLS 💪 by picking different lines and comparing their MSE 📉 to that of the OLS line.

No matter what numbers you try, OLS will always give the line with the ✨ smallest possible MSE.
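A small sketch of what this challenge looks like in code (not one of the original cells; it assumes x, y, b0_manual, and b1_manual from the cells above are still in memory, and the grid ranges are arbitrary): score a grid of candidate lines by their MSE and confirm that none beats the OLS line.

In [ ]:
# Sketch: brute-force comparison of candidate lines against the OLS line
def mse_line(b0, b1, x, y):
    # Mean squared error of the line b0 + b1*x on the data
    return np.mean((y - (b0 + b1 * x)) ** 2)

mse_ols = mse_line(b0_manual, b1_manual, x, y)

# Evaluate candidate intercepts and slopes on a grid (ranges chosen arbitrarily)
candidates = [(b0, b1, mse_line(b0, b1, x, y))
              for b0 in np.linspace(20, 50, 61)
              for b1 in np.linspace(5, 15, 41)]
b0_best, b1_best, mse_best = min(candidates, key=lambda c: c[2])

print(f"OLS line:       MSE = {mse_ols:.3f}")
print(f"Best grid line: b0 = {b0_best:.1f}, b1 = {b1_best:.2f}, MSE = {mse_best:.3f}")
# The best grid candidate can at most tie the OLS MSE, and only when the grid
# happens to contain the OLS coefficients exactly.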


Part B - Data Generating Process (DGP) & Numerical Evidence for Theorems

🤔 How do we know OLS is any good?

  • In real life, we never know the true model that generates the data
    (e.g., the exact way study hours relate to test scores).

  • So how can we check whether OLS is actually doing a good job
    (having good properties such as recovering the true parameter)?


🔢 Numerical evidence via simulation

We can prove properties (like unbiasedness) in theory class, but we can also show them numerically using a Data Generating Process (DGP):

  • 🎯 Design a simple "true" model (DGP) with known parameters:

    Pick the form and the numbers you want to be true, e.g. $$ y = \beta_0 + \beta_1 x + u,\quad u \sim \mathcal{N}(0,\sigma^2), $$ with $\beta_0=2$, $\beta_1=1.5$, $\sigma=2$.

  • 🔨 Simulate many datasets from that model.

  • 📊 Estimate OLS each time and examine the sampling distribution of $\hat\beta_1$.

💡 Intuition: simulation is like running a lab experiment in econometrics.

We control the world, we know the truth, and we can see how well OLS performs under different conditions.

This gives us a controlled world 🌍 where the truth is known: a benchmark for evaluating how well our estimator works.


Part C - Assumptions for Simple Linear Regression (SLR) and Unbiasedness of OLS

Today, we will show numerical evidence for Theorem 2.1 (Unbiasedness of OLS).


📖 Recall: Theorem 2.1 (Unbiasedness of OLS)

If assumptions (SLR.1–SLR.4) hold, then the OLS estimators are unbiased: $$ E[\hat\beta_0] = \beta_0, \qquad E[\hat\beta_1] = \beta_1. $$

  • ✅ On average, across many random samples, OLS recovers the true parameters.
  • ❌ If an assumption is violated (especially SLR.4, exogeneity), the expectation shifts → bias.

๐Ÿซ In today's lectureยถ

We will focus on the baseline DGP where all four assumptions hold.
This will let us see Theorem 2.1 in action: the distribution of $\hat\beta_1$ centers on the true $\beta_1$.


🧪 In the Lab Exercise (for you to try)

You will experiment with breaking the assumptions one by one to see what happens:

  • Nonlinearity → violates SLR.1
  • Non-random sampling → violates SLR.2
  • No variation in $x$ → violates SLR.3
  • Exogeneity failure (omitted variable) → violates SLR.4

Each violation changes how OLS behaves; you'll explore this hands-on in the lab exercise.


Step 1. Construct Baseline DGP (all assumptions hold)

  • We start with a simple DGP where all assumptions (SLR.1–SLR.4) hold.

📖 Assumptions for SLR (recap)

SLR.1: Linear in parameters
Model is linear in the unknown coefficients:
$$ y_i = \beta_0 + \beta_1 x_i + u_i $$

SLR.2: Random sampling
The pairs $(x_i, y_i)$ are a random sample from the population.

SLR.3: Sample variation in the explanatory variable
The regressor varies in the sample: not all $x_i$ are the same.

SLR.4: Zero conditional mean (exogeneity)
$$ E[u_i \mid x_i] = 0 $$

In [5]:
# Construct a Baseline DGP where all assumptions hold (SLR.1–SLR.4)

# Setting 1: sample size
n = 200

# Setting 2: true parameters
beta0_true, beta1_true = 2.0, 1.5
sigma = 2

# Setting 3: ****** define DGP *******
x = np.random.uniform(-3, 3, size=n) #(this ensures SLR.3 and SLR.2 hold)
u = np.random.normal(0, sigma, size=n) # (this ensures SLR.4 holds)
y = beta0_true + beta1_true*x + u   # (this ensures SLR.1 and SLR.2 hold)

# this gives **one** sample set of n observations from the DGP

# let's plot it
b0_m, b1_m = ols_manual(y, x)                    # fit OLS on this one simulated sample
plt.scatter(x, y, alpha=0.6, label="Data points")
xg = np.linspace(x.min(), x.max(), 100)
plt.plot(xg, b0_m + b1_m*xg, color='red', label="OLS fit")
plt.plot(xg, beta0_true + beta1_true*xg, color='black', linestyle='--', label="True model")  # dashed line: the true model (see note below)
plt.title(f"Baseline OLS fit: b0={b0_m:.2f}, b1={b1_m:.2f}") # compare to true values beta0_true, beta1_true
plt.xlabel("x"); plt.ylabel("y")
plt.legend()
plt.show()
[Figure: Baseline OLS fit on one simulated sample]

Note: This is one random sample from the DGP.

  • Blue dots = simulated data.
  • Solid line = OLS fit from this sample.
  • Dashed line = true model.

👉 Even when OLS is unbiased on average, any single sample may not exactly match the true slope.

Step 2. Simulation check

๐Ÿ” From one sample to many samplesยถ

In the previous plot, we only saw one random sample.
But OLS is unbiased on average across many random samples.

👉 Let's simulate many datasets (e.g., 1000) from the same DGP,
estimate $\hat\beta_1$ each time, and look at the distribution.

If Theorem 2.1 holds, the average of the estimates should be very close to the true $\beta_1$.

In [6]:
# Monte Carlo: check mean of many slope estimates

# Setting 1: number of random samples (by the LLN, the average estimate converges to the truth, letting us see unbiasedness in action)
B = 1000

# Setting 2: sample size per dataset (affects the precision, i.e., the variance, of each estimate)
n = 200

# Setting 3: true parameters
beta0_true, beta1_true = 2.0, 1.5
sigma = 2.0

# Create an empty list to store slope estimates
b1_estimates = []

# Simulate B datasets
# Repeat B times: generate data, fit OLS, store slope estimate
for _ in range(B):

    # x is randomly sampled from uniform(-3, 3)
    x = rng.uniform(-3, 3, size=n)

    # u is randomly sampled from normal(0, sigma)
    u = rng.normal(0, sigma, size=n)

    # y is generated from the DGP
    y = beta0_true + beta1_true*x + u

    # Fit OLS and store the slope estimate
    b0, b1 = ols_manual(y, x)

    # Append the slope estimate to the list
    b1_estimates.append(b1)

print(f"True β1 = {beta1_true:.2f}")
print(f"Mean of estimated slopes over {B} samples = {np.mean(b1_estimates):.3f}")
True β1 = 1.50
Mean of estimated slopes over 1000 samples = 1.507

✅ The mean of estimated slopes across different sets of samples is very close to the true value of $\beta_1$.

✅ This is numerical evidence for Theorem 2.1 (Unbiasedness of OLS).
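The discussion above is about the whole sampling distribution of $\hat\beta_1$, not just its mean. A quick visual check (a sketch, not one of the original cells; it reuses b1_estimates and beta1_true from the Monte Carlo cell) is to histogram the stored slope estimates and mark the true value:

In [ ]:
# Sketch: sampling distribution of the slope estimates across the B samples
plt.hist(b1_estimates, bins=40, alpha=0.7, label="Estimated slopes")
plt.axvline(beta1_true, color="red", linestyle="--", label="True slope")
plt.axvline(np.mean(b1_estimates), color="black", linestyle=":", label="Mean of estimates")
plt.title("Sampling distribution of slope estimates")
plt.xlabel("Slope estimate"); plt.ylabel("Frequency")
plt.legend()
plt.show()
# The histogram should be centered on the true slope, in line with Theorem 2.1.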


🔄 Quick Recap

  • We built a baseline DGP that satisfies all four assumptions (SLR.1–SLR.4).
  • From a single sample, OLS can look off, but across many samples,
    the average estimate matches the true slope → Theorem 2.1 (Unbiasedness of OLS).
  • Each assumption has a clear role:
    • 📏 SLR.1 Linear in parameters → we can fit a straight-line model in betas.
    • 🎲 SLR.2 Random sampling → ensures estimates generalize to the population.
    • 📊 SLR.3 Variation in x → slope is identifiable (unique).
    • 🔍 SLR.4 Zero conditional mean → key for unbiasedness.
  • 💡 If an assumption breaks, OLS can become biased, unstable, or misleading.

👉 Next: In lab, you'll break each assumption one by one and see how OLS behaves in practice.


References & Acknowledgments

  • J.M. Wooldridge (2019) Introductory Econometrics: A Modern Approach, Cengage Learning, 6th edition.

  • This teaching material was prepared with the assistance of OpenAI's ChatGPT (GPT-5).


End of lecture notebook.