Lecture 6: Multicollinearity (Completed version)
In the coming two lectures, we will focus on model misspecification issues (how to design a good regression model?).
Today, we start with multicollinearity.
Overview
Understand what multicollinearity is (perfect vs near)
How to diagnose it (correlations, VIF, condition number)
What it does to OLS (unbiased but imprecise; large SEs; unstable signs)
How to address it
Key idea:
- Multicollinearity means some regressors carry (almost) the same information.
- This makes it hard to separate out their individual effects.
- OLS can still predict well, but individual coefficients can be noisy/fragile.
Warm-up: Connecting to last week
Last week, we learned how to incorporate qualitative data into regression using dummy variables.
For example, coding Yes = 1 and No = 0 for a survey response or treatment indicator (e.g., whether someone received a new drug).
We also mentioned the dummy variable trap, a special case of multicollinearity.
Step-by-step: where multicollinearity appears
1. Set up a model that triggers the trap by including both dummies and an intercept: $$ y_i = \beta_0 + \beta_1\,\text{Yes}_i + \beta_2\,\text{No}_i + u_i $$
2. But every observation is either Yes or No, so for each individual $i$ it must be true that $$ \text{Yes}_i + \text{No}_i = 1 $$
One regressor is an exact linear combination of the others, i.e., we can write one as a linear function of the rest:
- In this case, $\text{No}_i = 1 - \text{Yes}_i$.
That's perfect multicollinearity.
3. Rewrite the fitted value using $\text{No}_i = 1 - \text{Yes}_i$: $$ \hat{y}_i = \hat{\beta}_0 + \hat{\beta}_1\,\text{Yes}_i + \hat{\beta}_2\,(1-\text{Yes}_i) = (\hat{\beta}_0 + \hat{\beta}_2) + (\hat{\beta}_1 - \hat{\beta}_2)\,\text{Yes}_i $$
4. Key consequence: no unique coefficients.
Only the two combinations matter:
$$
A = \hat{\beta}_0 + \hat{\beta}_2, \qquad
B = \hat{\beta}_1 - \hat{\beta}_2.
$$
Any triple $(\hat{\beta}_0,\hat{\beta}_1,\hat{\beta}_2)$ that keeps $A$ and $B$ the same gives identical $\hat y$.
5. Concrete numeric example.
If one valid set is
$$
(\hat{\beta}_0, \hat{\beta}_1, \hat{\beta}_2) = (10, 2, 4),
$$
then all of the following produce the same predictions:
- $(9, 3, 5)$
- $(8, 4, 6)$
- $(7, 5, 7)$
- $(6, 6, 8)$
because each keeps
$$
A = \hat{\beta}_0+\hat{\beta}_2 = 14, \qquad B = \hat{\beta}_1-\hat{\beta}_2 = -2.
$$
Identification fails: OLS cannot uniquely determine $\beta_0,\beta_1,\beta_2$ here.
All those parameter sets give the same fitted values.
Fix: Drop one dummy (say, No) so the remaining dummy measures the effect relative to the omitted group.
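To make the trap and the fix concrete, here is a minimal sketch with a small hypothetical Yes/No dataset: two different coefficient triples give identical fitted values, and dropping one dummy restores a unique solution.
# Minimal sketch of the dummy variable trap (hypothetical Yes/No data)
import numpy as np
import pandas as pd
import statsmodels.api as sm

yes = np.array([1, 0, 1, 1, 0, 0])
no = 1 - yes                                         # No = 1 - Yes: perfectly collinear with Yes and the intercept
y = np.array([16.0, 13.5, 15.8, 16.2, 14.1, 13.9])

# Two different triples (b0, b1, b2) with the same A = b0 + b2 and B = b1 - b2
for b0, b1, b2 in [(10, 2, 4), (8, 4, 6)]:
    print((b0, b1, b2), "->", b0 + b1*yes + b2*no)   # identical fitted values

# The fix: drop one dummy (here No), so the coefficients are uniquely determined
fix = sm.OLS(y, sm.add_constant(pd.DataFrame({"yes": yes}))).fit()
print(fix.params)                                    # const = mean of the No group, yes = Yes-vs-No difference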
Two types of multicollinearity
Perfect multicollinearity:
An exact linear relationship among regressors (e.g., $x_3 = 2x_1 - x_2$).
→ The model has no unique OLS solution: as we saw in the dummy variable example, many sets of coefficients fit the data equally well.
Near multicollinearity:
Variables are highly, but not perfectly, correlated.
→ OLS can still compute estimates, but they become unstable:
small changes in the data can lead to large swings in estimated coefficients or even sign reversals.
"Unstable" here means the model still runs, but its coefficient estimates are very sensitive:
if we slightly change the sample or add one more observation, the signs or magnitudes of $\hat{\beta}_j$ may change dramatically.
Required libraries
# Install quietly if needed
!pip install numpy pandas statsmodels matplotlib --quiet
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
rng = np.random.default_rng(2024)
Tiny Toy Dataset (used in all sections)
| Variable | Symbol | Description |
|---|---|---|
| Education | $x_1$ | Years of education |
| Experience | $x_2$ | Years of experience |
| Noisy education | $x_3$ | A slightly noisy copy of education: $x_3 = x_1 + \text{small noise}$, highly collinear with $x_1$ |
Real-world analogs for $x_3$:
⢠āHighest degree measured in yearsā (e.g., BA=16, MA=18)
⢠āTotal credits earnedā (highly correlated with years of education)
Data-generating process (DGP):
$$ y_i = 5 + 1.2\,x_{1i} + 0.8\,x_{2i} + \varepsilon_i, \qquad \varepsilon_i \sim \mathcal{N}(0,\,2^2) $$
Expectation:
- Adding $x_3$ (a near-duplicate of $x_1$) introduces near multicollinearity.
- OLS will still run, but:
- SEs for $\hat\beta_1$ and $\hat\beta_3$ inflate; coefficients become unstable.
- Overall $R^2$ is almost unchanged (the same signal is being split across $x_1$ and $x_3$).
# Simulate/Generate the tiny toy dataset according to the DGP (put it in a function for easy re-use)
def generate_df(n, sigma):
educ = rng.integers(10, 25, size=n) # x1
exper = rng.integers(0, 50, size=n) # x2
educ_noisy = educ + rng.normal(0, 0.3, size=n) # x3 ~ educ + small noise (random number from N(0, 0.3^2))
eps = rng.normal(0, sigma, size=n)
y = 5 + 1.2*educ + 0.8*exper + eps
df = pd.DataFrame({"y": y, "educ": educ, "exper": exper, "educ_noisy": educ_noisy})
return df
# Generate a dataset
df = generate_df(n=120, sigma=2.0)
# Show the first few rows
df.head()
| | y | educ | exper | educ_noisy |
|---|---|---|---|---|
| 0 | 40.679646 | 13 | 24 | 13.055783 |
| 1 | 52.964632 | 20 | 29 | 20.053208 |
| 2 | 39.096500 | 11 | 27 | 11.121521 |
| 3 | 35.202449 | 13 | 21 | 13.007568 |
| 4 | 46.015203 | 14 | 27 | 13.465139 |
1) First look: correlations and a scatter plot between variables
Pairwise correlations are a quick way to spot potential multicollinearity.
The correlation coefficient between two variables is defined as:
$$ \text{Corr}(X, Y) = \frac{\text{Cov}(X, Y)}{\sigma_X \sigma_Y} $$
where: $$ \text{Cov}(X, Y) = \frac{1}{n-1}\sum_i (X_i - \bar{X})(Y_i - \bar{Y}), $$ and $\sigma_X$, $\sigma_Y$ are the standard deviations of $X$ and $Y$.
If $|\text{Corr}(X, Y)| \approx 1$, the two variables move in almost the same way: they rise and fall together.
In regression terms, they carry almost the same information, making it hard for OLS to tell their effects apart.
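As a quick sanity check of the formula, the sketch below (with two small hypothetical arrays) computes the covariance and correlation by hand and compares them with numpy's built-in np.corrcoef.
# Compute the sample covariance and correlation by hand (hypothetical arrays)
import numpy as np

x = np.array([10.0, 12.0, 14.0, 16.0, 18.0])
z = np.array([3.0, 4.5, 5.0, 6.5, 7.0])

n = len(x)
cov_xz = ((x - x.mean()) * (z - z.mean())).sum() / (n - 1)   # sample covariance
corr_xz = cov_xz / (x.std(ddof=1) * z.std(ddof=1))           # divide by the standard deviations

print("Manual correlation:", round(corr_xz, 4))
print("np.corrcoef:       ", round(np.corrcoef(x, z)[0, 1], 4))   # should match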
# Compute the correlation matrix between the independent variables
corr_matrix = df[["educ", "exper", "educ_noisy"]].corr()
print("Correlation Matrix:")
print(corr_matrix)
Correlation Matrix:
educ exper educ_noisy
educ 1.000000 -0.077422 0.997485
exper -0.077422 1.000000 -0.064218
educ_noisy 0.997485 -0.064218 1.000000
# Visualize the results in a heatmap
plt.figure(figsize=(5,4)) # set figure size
plt.imshow(corr_matrix, cmap="coolwarm", vmin=-1, vmax=1)
plt.xticks(range(corr_matrix.shape[1]), corr_matrix.columns) # x-axis labels
plt.yticks(range(corr_matrix.shape[0]), corr_matrix.index) # y-axis labels
for i in range(corr_matrix.shape[0]): # loop over rows
for j in range(corr_matrix.shape[1]): # loop over columns
plt.text(j, i, f"{corr_matrix.values[i,j]:.3f}", ha="center", va="center", fontsize=9)
plt.title("Correlation matrix")
plt.colorbar()
plt.tight_layout()
plt.show()
Scatter plot of $x_1$ vs $x_3$:
A near-perfect straight line confirms that $x_3$ is essentially (almost) a duplicate of $x_1$.
In that case, adding both to OLS gives the model redundant information.
Intuition: When two regressors move almost identically, OLS struggles to tell their separate effects apart; the "information content" overlaps.
# Plot the scatter of educ vs educ_noisy
plt.figure(figsize=(5,4))
plt.scatter(df["educ"], df["educ_noisy"], alpha=0.7)
plt.xlabel("educ (x1)")
plt.ylabel("educ_noisy (x3)")
plt.title("Near collinearity: x3 ≈ x1 + small noise")
plt.show()
2) OLS under Multicollinearity: Same Fit, Less Precision (Larger Standard Errors)
We compare two models:
Model 1 (no multicollinearity): $$ y_i = \beta_0 + \beta_1\,\text{educ}_i + \beta_2\,\text{exper}_i + u_i $$
Model 2 (with multicollinearity): $$ y_i = \beta_0 + \beta_1\,\text{educ}_i + \beta_2\,\text{exper}_i + \beta_3\,\text{educ\_noisy}_i + u_i $$
Expectation:
⢠Model 2 will have a similar $R^2$ (because the new variable adds little new information),
⢠but the standard errors for $\hat\beta_1$ and $\hat\beta_3$ will inflate,
⢠and their individual estimates will become unstable ā small data changes can flip signs or magnitudes.
Reminder: What does $R^2$ measure?
The $R^2$ statistic measures how much of the variation in $y$ is explained by the regression model:
$$ R^2 = 1 - \frac{\text{SSR}}{\text{SST}} $$
where
$$ \text{SSR} = \sum_i (y_i - \hat{y}_i)^2, \qquad \text{SST} = \sum_i (y_i - \bar{y})^2 $$
- SST (Total Sum of Squares): total variation in $y$ around its mean
- SSR (Sum of Squared Residuals): unexplained variation after fitting the model
So $R^2$ measures overall fit, not how precisely each coefficient is estimated.
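As a quick check of this formula (a sketch that reuses the df and imports defined above), we can recompute $R^2$ from SSR and SST by hand and compare it with the value statsmodels reports:
# Recompute R^2 from SSR and SST by hand (reuses df and the imports from above)
fit_demo = sm.OLS(df["y"], sm.add_constant(df[["educ", "exper"]])).fit()

ssr = ((df["y"] - fit_demo.fittedvalues) ** 2).sum()   # unexplained variation
sst = ((df["y"] - df["y"].mean()) ** 2).sum()          # total variation around the mean

print("Manual R^2:     ", round(1 - ssr / sst, 4))
print("statsmodels R^2:", round(fit_demo.rsquared, 4))   # should match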
Why multicollinearity is subtle
Even when variables are highly correlated:
- $R^2$ may remain high: the model still fits well overall.
- But OLS struggles to separate the effects of correlated regressors, so
$$ \text{Var}(\hat\beta_j) \text{ becomes large.} $$
This leads to:
- big standard errors
- wide confidence intervals
- sometimes even wrong signs
In short: multicollinearity doesn't hurt fit; it hurts interpretability.
# Model 1
X1 = sm.add_constant(df[["educ","exper"]])
m1 = sm.OLS(df["y"], X1).fit()
# Model 2 (adding near-duplicate x3: educ_noisy)
X2 = sm.add_constant(df[["educ","exper","educ_noisy"]])
m2 = sm.OLS(df["y"], X2).fit()
# Display the results
print("Model 1 (no duplicate):")
print(m1.summary().tables[1])
print("\nModel 2 (with near-duplicate x3):")
print(m2.summary().tables[1])
# Compare R^2 values
print("\nR^2 comparison: M1 =", round(m1.rsquared,4), " M2 =", round(m2.rsquared,4))
Model 1 (no duplicate):
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 4.8864 0.799 6.112 0.000 3.303 6.470
educ 1.2061 0.042 28.482 0.000 1.122 1.290
exper 0.7963 0.013 60.337 0.000 0.770 0.822
==============================================================================
Model 2 (with near-duplicate x3):
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 4.7616 0.818 5.822 0.000 3.142 6.382
educ 1.6628 0.608 2.736 0.007 0.459 2.866
exper 0.7981 0.013 59.335 0.000 0.771 0.825
educ_noisy -0.4520 0.600 -0.753 0.453 -1.640 0.736
==============================================================================
R^2 comparison: M1 = 0.973 M2 = 0.9731
Reading the tables:
In Model 2, the standard errors on educ and educ_noisy inflate sharply (from about 0.04 to about 0.6), and the individual estimates become unstable: educ jumps to about 1.66 while educ_noisy even turns negative, because the model struggles to separate their individual effects.
The overall $R^2$ hardly changes, meaning the model's fit to the data is almost the same.
We didn't add new information; we just split the same signal across two highly correlated variables.
Instability: small changes → big swings
What does unstable estimation mean?
With near collinearity, tiny data perturbations can cause large coefficient swings while predictions barely change.
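Here is a minimal sketch of that idea (it assumes the generate_df helper and imports defined above): refit Model 2 on several freshly simulated samples from the same DGP and watch the educ and educ_noisy coefficients swing while $R^2$ barely moves.
# Refit Model 2 on several fresh samples from the same DGP (assumes generate_df and imports above)
rows = []
for rep in range(5):
    df_rep = generate_df(n=120, sigma=2.0)
    fit_rep = sm.OLS(df_rep["y"], sm.add_constant(df_rep[["educ", "exper", "educ_noisy"]])).fit()
    rows.append({"rep": rep,
                 "educ": round(fit_rep.params["educ"], 3),
                 "educ_noisy": round(fit_rep.params["educ_noisy"], 3),
                 "R2": round(fit_rep.rsquared, 4)})
print(pd.DataFrame(rows))   # educ / educ_noisy jump around; R^2 stays essentially constant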
3) Formal Diagnostics: $R_j^2$ and Variance Inflation Factor (VIF)
Now that we've seen what collinearity looks like, let's check how we can formally diagnose it.
Pairwise correlation (recap)
We can start by computing correlation coefficients between regressors.
- High pairwise correlation (say, $|r| > 0.8$) suggests possible overlap.
- But sometimes a variable may not be highly correlated with any single regressor, yet still be almost fully explained by a combination of them.
That's why we move beyond pairwise correlations.
The idea of $R_j^2$
For each explanatory variable $x_j$, run an auxiliary regression on all the others:
$$ x_j = \alpha_0 + \alpha_1 x_1 + \dots + \alpha_{j-1} x_{j-1} + \alpha_{j+1} x_{j+1} + \dots + \alpha_k x_k + u_j $$
Let $R_j^2$ be the R-squared from this regression: the share of variation in $x_j$ that can be explained by the other regressors.
- If $R_j^2$ is close to 1, $x_j$ is almost a linear combination of the others → severe multicollinearity.
- If $R_j^2$ is small, $x_j$ adds unique information.
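As a sketch of this auxiliary regression (reusing df and the imports from above), regress educ on the other regressors and read off $R_j^2$; the quantity $1/(1 - R_j^2)$ previews the VIF defined next.
# Auxiliary regression for x_j = educ: regress it on the other regressors (reuses df and imports above)
aux_fit = sm.OLS(df["educ"], sm.add_constant(df[["exper", "educ_noisy"]])).fit()

r2_j = aux_fit.rsquared
print("R_j^2 for educ:", round(r2_j, 4))              # close to 1: educ is nearly a combination of the others
print("1/(1 - R_j^2): ", round(1 / (1 - r2_j), 2))    # by definition, this is the VIF for educ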
The link to VIF
The Variance Inflation Factor (VIF) for regressor $x_j$ is defined as:
$$ VIF_j = \frac{1}{1 - R_j^2} $$
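Under the standard homoskedastic OLS assumptions, the sampling variance of $\hat\beta_j$ can be written with $VIF_j$ as a multiplicative factor, which is why it is called a variance inflation factor:
$$ \text{Var}(\hat\beta_j) = \frac{\sigma^2}{\text{SST}_j\,(1 - R_j^2)} = \frac{\sigma^2}{\text{SST}_j}\cdot VIF_j, \qquad \text{SST}_j = \sum_i (x_{ij} - \bar{x}_j)^2 $$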
Interpretation & Suggested principle:
- $VIF_j = 1$ → no multicollinearity.
- $VIF_j > 10$ → worrisome degree of collinearity.
# VIFs
X_vif = X2.values
vifs = [variance_inflation_factor(X_vif, i) for i in range(X_vif.shape[1])]
pd.DataFrame({
"Variable": X2.columns[1:], # skip const
"VIF": vifs[1:] # skip const
})
| | Variable | VIF |
|---|---|---|
| 0 | educ | 206.388682 |
| 1 | exper | 1.041315 |
| 2 | educ_noisy | 206.001095 |
Heuristics: VIFs in the double digits suggest multicollinearity worth addressing.
5) Some suggested remedies in practice
- Re-specify features: remove near-duplicates; combine them (e.g., an average or a domain index), as sketched below.
- Collect more data along under-represented directions.
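A minimal sketch of the first remedy (reusing df and the imports from above): replace the two near-duplicate education measures with their average and refit; the education effect should again be estimated precisely, as in Model 1.
# Remedy sketch: combine the two collinear education measures into one feature (reuses df and imports above)
df_fix = df.copy()
df_fix["educ_avg"] = (df_fix["educ"] + df_fix["educ_noisy"]) / 2   # simple average of the near-duplicates

m_fix = sm.OLS(df_fix["y"], sm.add_constant(df_fix[["educ_avg", "exper"]])).fit()
print(m_fix.summary().tables[1])   # the education coefficient is estimated precisely again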
Reminder: Multicollinearity mainly hurts inference on individual coefficients. Predictions can still be good, especially if regressors move together out-of-sample.
Wrap-up
Multicollinearity = overlapping information across regressors.
It inflates SEs and makes coefficients unstable; OLS remains unbiased (provided the other OLS assumptions hold).
Diagnose it with correlations and VIFs.
Remedies: re-specify features, collect more data.
References & Acknowledgments
- This teaching material was prepared with the assistance of OpenAI's ChatGPT (GPT-5).
End of lecture notebook.