import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat

from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

data = loadmat("Data_Files/peanuts.mat")
X = data["X"].reshape(-1, 1)
Y = data["Y"].reshape(-1, 1)

# Scatterplot
plt.figure(figsize=(8, 4))
plt.scatter(X, Y, color='black', facecolors='none', label='Data')
plt.xlabel("Aflatoxin (X)")
plt.ylabel("Percentage of noncontaminated peanuts (Y)")
plt.title("Peanuts Data Scatterplot")
plt.legend()
plt.grid(True)
plt.show()

# Cross-validation 
kf = KFold(n_splits=5, shuffle=True, random_state=1905)

degrees = [1, 2, 3, 4, 5]

plt.figure(figsize=(8, 4))
plt.scatter(X, Y, color='black', facecolors='none', label='Data')
X_seq = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
best_mse = float('inf')
best_degree = 0

for d in degrees:
    model = Pipeline([
        ("poly", PolynomialFeatures(degree=d, include_bias=False)),
        ("linreg", LinearRegression())
    ])
    
    # convert negative MSE values to positive
    mse = -cross_val_score(
        model, X, Y,
        scoring="neg_mean_squared_error",
        cv=kf
    ).mean()

    print(f"Degree {d}: MSE = {mse:.6f}")

    if mse < best_mse:
        best_mse = mse
        best_degree = d
    
    # fit on whole dataset for plotting
    model.fit(X, Y)
    plt.plot(X_seq, model.predict(X_seq), label=f'Degree {d}')


print(f"\n Best Model: Polynomial Degree {best_degree}")
plt.title('Polynomial Model Fits')
plt.xlabel('Alfatoxin (X)')
plt.ylabel('Percentage of noncontaminated peanuts (Y)')
plt.legend()
plt.grid(True)
plt.show()

Degree 1: MSE = 0.001687
Degree 2: MSE = 0.001672
Degree 3: MSE = 0.001845
Degree 4: MSE = 0.002937
Degree 5: MSE = 0.006057

 Best Model: Polynomial Degree 2

# Parameters
MC_trials = 500          # number of Monte Carlo trials
n = 100                  # sample size
B = 1000                 # bootstrap resamples

boot_bias = []
boot_se = []
jack_bias = []
jack_se = []

# true second central moment of N(0,1)
true_var = 1.0

for _ in range(MC_trials):
    # generate data
    x = np.random.randn(n)
    
    # sample second central moment
    theta_hat = np.mean((x - np.mean(x))**2)
    
    # ---------- Bootstrap ----------
    boot_thetas = []
    for _ in range(B):
        xb = np.random.choice(x, size=n, replace=True)
        boot_thetas.append(np.mean((xb - np.mean(xb))**2))
    boot_thetas = np.array(boot_thetas)
    
    # Bootstrap estimates
    boot_bias.append(np.mean(boot_thetas) - theta_hat)
    boot_se.append(np.std(boot_thetas, ddof=1))
    
    # ---------- Jackknife ----------
    jack_thetas = []
    for i in range(n):
        x_leave = np.delete(x, i)
        jack_thetas.append(np.mean((x_leave - np.mean(x_leave))**2))
    jack_thetas = np.array(jack_thetas)
    
    theta_jack_mean = np.mean(jack_thetas)
    
    # Jackknife estimates
    jack_bias.append((n - 1) * (theta_jack_mean - theta_hat))
    jack_se.append(
        np.sqrt((n - 1) / n * np.sum((jack_thetas - theta_jack_mean) ** 2))
    )

boot_bias = np.array(boot_bias)
boot_se = np.array(boot_se)
jack_bias = np.array(jack_bias)
jack_se = np.array(jack_se)

# ---------- Plots ----------
plt.figure(figsize=(14, 10))

# Histograms of Bias
plt.subplot(2, 2, 1)
plt.hist(boot_bias, bins=30, alpha=0.5, label="Bootstrap Bias", color='orange', edgecolor='black')
plt.hist(jack_bias, bins=30, alpha=0.5, label="Jackknife Bias", color='blue', edgecolor='black')
plt.title("Distribution of Bias Estimates")
plt.xlabel('Estimated Bias')
plt.legend()

# Histograms of SE
plt.subplot(2, 2, 2)
plt.hist(boot_se, bins=30, alpha=0.5, label="Bootstrap SE", color='orange', edgecolor='black')
plt.hist(jack_se, bins=30, alpha=0.5, label="Jackknife SE", color='blue', edgecolor='black')
plt.title("Distribution of SE Estimates")
plt.xlabel('Estimated SE')
plt.legend()

# Boxplots of Bias
plt.subplot(2, 2, 3)
plt.boxplot([jack_bias, boot_bias], labels=['Jackknife', 'Bootstrap'])
plt.title('Comparison of Bias Estimates')
plt.ylabel('Bias')
plt.grid(True, linestyle='--', alpha=0.3)

# Boxplots of SE
plt.subplot(2, 2, 4)
plt.boxplot([jack_se, boot_se], labels=['Jackknife', 'Bootstrap'])
plt.title('Comparison of SE Estimates')
plt.ylabel('Standard Error')
plt.grid(True, linestyle='--', alpha=0.3)

plt.tight_layout()
plt.show()

# ---------- Numerical summaries ----------
print(f"--- Summary Statistics (M={MC_trials} trials) ---")
print(f"Jackknife Mean Bias: {np.mean(jack_bias):.5f} (SD: {np.std(jack_bias):.5f})")
print(f"Bootstrap Mean Bias: {np.mean(boot_bias):.5f} (SD: {np.std(boot_bias):.5f})")
print("-" * 40)
print(f"Jackknife Mean SE:   {np.mean(jack_se):.5f} (SD: {np.std(jack_se):.5f})")
print(f"Bootstrap Mean SE:   {np.mean(boot_se):.5f} (SD: {np.std(boot_se):.5f})")

/var/folders/tm/0v_xzs6s5vzfw60jkby5vd980000gn/T/ipykernel_15810/2986901424.py:73: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
  plt.boxplot([jack_bias, boot_bias], labels=['Jackknife', 'Bootstrap'])
/var/folders/tm/0v_xzs6s5vzfw60jkby5vd980000gn/T/ipykernel_15810/2986901424.py:80: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
  plt.boxplot([jack_se, boot_se], labels=['Jackknife', 'Bootstrap'])

--- Summary Statistics (M=500 trials) ---
Jackknife Mean Bias: -0.00994 (SD: 0.00151)
Bootstrap Mean Bias: -0.00967 (SD: 0.00469)
----------------------------------------
Jackknife Mean SE:   0.13870 (SD: 0.02681)
Bootstrap Mean SE:   0.13584 (SD: 0.02625)

data = loadmat("Data_Files/law.mat")

X = data['law']  # Shape is (15, 2)

def analyze_variable(data_vector, label):
    n = len(data_vector)

    # ---------- Jackknife ----------
    jack_medians = []

    for i in range(n):
        # create dataset with i-th observation removed
        x_leave = np.delete(x, i)
        jack_medians.append(np.median(x_leave))

    jack_medians = np.array(jack_medians)

    # num of distinct jackknife medians
    num_distinct = len(np.unique(jack_medians))

    # Jackknife SE
    jack_mean = np.mean(jack_medians)
    jack_se = np.sqrt((n - 1) / n * np.sum((jack_medians - jack_mean) ** 2))

    # ---------- Bootstrap ----------
    B = 1000
    boot_medians = []

    for _ in range(B):
        # resample with replacement
        xb = np.random.choice(x, size=n, replace=True)
        boot_medians.append(np.median(xb))

    boot_medians = np.array(boot_medians)
    boot_se = np.std(boot_medians, ddof=1)

    # ---------- Plots ----------
    plt.figure()
    plt.hist(jack_medians, bins=15)
    plt.title(f"Jackknife Replicates of the Median for {label}")
    plt.xlabel("Median")
    plt.ylabel("Frequency")
    plt.show()

    plt.figure()
    plt.hist(boot_medians, bins=30)
    plt.title(f"Bootstrap Replicates of the Median for {label}")
    plt.xlabel("Median")
    plt.ylabel("Frequency")
    plt.show()

    # ---------- Output ----------
    print("Number of observations:", n)
    print("Number of distinct jackknife medians:", num_distinct)
    print(f"Jackknife SE of median: {jack_se:.5f}")
    print(f"Bootstrap SE of median: {boot_se:.5f}")

# run for both variables (assuming Col 1 is LSAT, Col 2 is GPA)
analyze_variable(X[:, 0], "Variable 1 (e.g., LSAT)")
analyze_variable(X[:, 1], "Variable 2 (e.g., GPA)")

Number of observations: 15
Number of distinct jackknife medians: 2
Jackknife SE of median: 0.02439
Bootstrap SE of median: 0.26446

Number of observations: 15
Number of distinct jackknife medians: 2
Jackknife SE of median: 0.02439
Bootstrap SE of median: 0.25740

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

np.random.seed(1905)

# generate predictor
n = 200
x = np.linspace(-2, 2, n)
X = x.reshape(-1, 1)

# generate response
epsilon = np.random.normal(0, 5, size=n)
y = 4*x**3 + 6*x**2 - 1 + epsilon

# degrees to fit
degrees = [1, 2, 3]

# create subplots: 3 rows (degrees) x 3 columns (plots)
fig, axes = plt.subplots(3, 3, figsize=(15, 12))
plt.subplots_adjust(hspace=0.4, wspace=0.3)

for i, d in enumerate(degrees):
    # Fit polynomial model
    model = Pipeline([
        ("poly", PolynomialFeatures(degree=d, include_bias=False)),
        ("linreg", LinearRegression())
    ])
    model.fit(X, y)
    
    # predict and calculate residuals
    y_hat = model.predict(X)
    residuals = y - y_hat

    # row for the current degree
    ax_scatter = axes[i, 0]
    ax_hist = axes[i, 1]
    ax_box = axes[i, 2]

    # ---------- Residual dependence plot ----------
    ax_scatter.scatter(x, residuals, color='blue', alpha=0.6)
    ax_scatter.axhline(0, color='red', linestyle='--')
    ax_scatter.set_title(f'Degree {d}: Residuals vs x')
    ax_scatter.set_xlabel('x')
    ax_scatter.set_ylabel('Residuals')

    # ---------- Histogram ----------
    # checks for normality
    ax_hist.hist(residuals, bins=30, color='green', edgecolor='black', alpha=0.7)
    ax_hist.set_title(f'Degree {d}: Histogram of Residuals')
    ax_hist.set_xlabel('Residuals')
    
    # ---------- Boxplot ----------
    # Checks for symmetry and outliers
    ax_box.boxplot(residuals, vert=False)
    ax_box.set_title(f'Degree {d}: Box Plot')
    ax_box.set_xlabel('Residuals')

plt.suptitle('Residual Analysis for Polynomial Models (d=1, 2, 3)', fontsize=16)
plt.show()

import statsmodels.api as sm
import statsmodels.genmod.families.links as links

data = loadmat('Data_Files/wais.mat')
wais_matrix = data['wais']
df = pd.DataFrame(wais_matrix, columns=['WAIS_Score', 'Senile_Status'])

# define X and y
y = df['Senile_Status']
X = df['WAIS_Score']
X = sm.add_constant(X)  # add intercept term

# -------- Probit Model ------------
model_probit = sm.GLM(y, X, family=sm.families.Binomial(link=sm.families.links.Probit()))
res_probit = model_probit.fit()

# -------- Complementary Log-Log Model -----------
model_cloglog = sm.GLM(y, X, family=sm.families.Binomial(link=sm.families.links.cloglog()))
res_cloglog = model_cloglog.fit()

# -------- Logistic Regression (for comparison) -----------
model_logit = sm.GLM(y, X, family=sm.families.Binomial(link=sm.families.links.Logit()))
res_logit = model_logit.fit()

# compare Results
print(f"{'Model':<15} | {'AIC':<10} | {'BIC':<10} | {'Log-Likelihood':<15}")
print("-" * 60)
print(f"{'Probit':<15} | {res_probit.aic:<10.4f} | {res_probit.bic_llf:<10.4f} | {res_probit.llf:<15.4f}")
print(f"{'CLogLog':<15} | {res_cloglog.aic:<10.4f} | {res_cloglog.bic_llf:<10.4f} | {res_cloglog.llf:<15.4f}")
print(f"{'Logistic':<15} | {res_logit.aic:<10.4f} | {res_logit.bic_llf:<10.4f} | {res_logit.llf:<15.4f}")

print("\n------------------------ Probit Model Summary ------------------------\n")
print(res_probit.summary())

Model           | AIC        | BIC        | Log-Likelihood 
------------------------------------------------------------
Probit          | 54.9836    | 58.9615    | -25.4918       
CLogLog         | 55.3107    | 59.2887    | -25.6554       
Logistic        | 55.0174    | 58.9953    | -25.5087       

------------------------ Probit Model Summary ------------------------

                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:          Senile_Status   No. Observations:                   54
Model:                            GLM   Df Residuals:                       52
Model Family:                Binomial   Df Model:                            1
Link Function:                 Probit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -25.492
Date:                Wed, 17 Dec 2025   Deviance:                       50.984
Time:                        16:45:03   Pearson chi2:                     51.2
No. Iterations:                     5   Pseudo R-squ. (CS):             0.1816
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.3862      0.685      2.023      0.043       0.043       2.729
WAIS_Score    -0.1880      0.063     -2.984      0.003      -0.312      -0.065
==============================================================================

/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/statsmodels/genmod/families/links.py:13: FutureWarning: The cloglog link alias is deprecated. Use CLogLog instead. The cloglog link alias will be removed after the 0.15.0 release.
  warnings.warn(

import statsmodels.api as sm

data = loadmat('Data_Files/crab.mat')
X_mat = data['X']               
sat_counts = data['satellites'] 

width = X_mat[:, 0]
# create binary response: 1 if satellites > 0, else 0
y = (sat_counts > 0).astype(int).flatten()

# prepare design matrix (add intercept)
X = sm.add_constant(width)

# fit models
# compare Logit, Probit, and CLogLog
models = {
    'Logit': sm.families.links.Logit(),
    'Probit': sm.families.links.Probit(),
    'CLogLog': sm.families.links.cloglog()
}

results = {}
colors = ['blue', 'red', 'green']
styles = ['-', '--', '-.']

print("--- Model Assessment Results ---")
print(f"{'Model':<10} | {'AIC':<10} | {'BIC':<10} | {'Log-Likelihood':<15}")
print("-" * 55)

plt.figure(figsize=(10, 6))
# plot observed data
plt.scatter(width, y + np.random.normal(0, 0.02, len(y)), 
            color='black', alpha=0.3, marker='o', label='Observed (Jittered)')

# generate prediction grid
x_plot = np.linspace(width.min(), width.max(), 100)
x_plot_const = sm.add_constant(x_plot)

for i, (name, link) in enumerate(models.items()):
    # Fit GLM
    model = sm.GLM(y, X, family=sm.families.Binomial(link=link))
    res = model.fit()
    results[name] = res
    
    print(f"{name:<10} | {res.aic:<10.4f} | {res.bic_llf:<10.4f} | {res.llf:<15.4f}")
    
    y_pred = res.predict(x_plot_const)
    plt.plot(x_plot, y_pred, label=f'{name} (AIC: {res.aic:.1f})', 
             color=colors[i], linestyle=styles[i], linewidth=2)

plt.title('Probability of Satellites vs Carapace Width')
plt.xlabel('Carapace Width (cm)')
plt.ylabel('Probability of having Satellites (Y=1)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# best model 
best_model_name = min(results, key=lambda k: results[k].aic)
print(f"\nBest Model based on AIC: {best_model_name}")

--- Model Assessment Results ---
Model      | AIC        | BIC        | Log-Likelihood 
-------------------------------------------------------
Logit      | 198.4527   | 204.7592   | -97.2263       
Probit     | 198.0357   | 204.3423   | -97.0179       
CLogLog    | 197.2753   | 203.5818   | -96.6376

/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/statsmodels/genmod/families/links.py:13: FutureWarning: The cloglog link alias is deprecated. Use CLogLog instead. The cloglog link alias will be removed after the 0.15.0 release.
  warnings.warn(

Best Model based on AIC: CLogLog

import statsmodels.api as sm

data = loadmat('Data_Files/cpunish.mat')
y = data['y']  # response: # of executions
X = data['X']  # predictors

X_const = sm.add_constant(X)

feature_names = [
    'Intercept', 
    'Median Income', 
    'Poverty %', 
    'African-Amer %', 
    'Log(Violent Crime)', 
    'South', 
    'College Degree %'
]

# fit GLM 
model = sm.GLM(y, X_const, family=sm.families.Poisson(link=sm.families.links.log()))
result = model.fit()

deviance = result.deviance
params = result.params        # estimated coefficients
bse = result.bse              # standard errors
conf_int = result.conf_int()  # 95% Wald confidence intervals

print(f"Deviance: {deviance:.4f}\n")
print(f"{'Variable':<20} | {'Coeff':<10} | {'Std. Err':<10} | {'95% CI Lower':<12} | {'95% CI Upper':<12}")
print("-" * 75)

for i, name in enumerate(feature_names):
    lower, upper = conf_int[i]
    print(f"{name:<20} | {params[i]:<10.4f} | {bse[i]:<10.4f} | {lower:<12.4f} | {upper:<12.4f}")

Deviance: 18.5916

Variable             | Coeff      | Std. Err   | 95% CI Lower | 95% CI Upper
---------------------------------------------------------------------------
Intercept            | -6.8015    | 4.1469     | -14.9292     | 1.3262      
Median Income        | 0.0003     | 0.0001     | 0.0002       | 0.0004      
Poverty %            | 0.0778     | 0.0794     | -0.0778      | 0.2334      
African-Amer %       | -0.0949    | 0.0229     | -0.1399      | -0.0500     
Log(Violent Crime)   | 0.2969     | 0.4375     | -0.5606      | 1.1545      
South                | 2.3012     | 0.4284     | 1.4616       | 3.1408      
College Degree %     | -18.7221   | 4.2840     | -27.1185     | -10.3256

/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/statsmodels/genmod/families/links.py:13: FutureWarning: The log link alias is deprecated. Use Log instead. The log link alias will be removed after the 0.15.0 release.
  warnings.warn(

from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import itertools

data = loadmat('Data_Files/airfoil.mat')

# predictors: Freq, Angle, Chord, Veloc, Suction
# response: Sound
df = pd.DataFrame({
    'Freq': data['Freq'].flatten(),
    'Angle': data['Angle'].flatten(),
    'Chord': data['Chord'].flatten(),
    'Veloc': data['Veloc'].flatten(),
    'Suction': data['Suction'].flatten(),
    'Sound': data['Sound'].flatten()
})

X = df.drop(columns=['Sound'])
y = df['Sound']

print(f"Dataset Shape: {df.shape}")

# preprocessing
# scale features for penalized regression 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# split data (70% Train, 30% Test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, y, test_size=0.3, random_state=42)
X_train_orig, X_test_orig, _, _ = train_test_split(X, y, test_size=0.3, random_state=42)

# --- Ridge Regression ---
alphas_ridge = np.logspace(-4, 4, 100)
ridge = RidgeCV(alphas=alphas_ridge, cv=5)
ridge.fit(X_train, y_train)
mse_ridge = mean_squared_error(y_test, ridge.predict(X_test))

print(f"\nRidge Regression:")
print(f"  Best Alpha: {ridge.alpha_:.4f}")
print(f"  MSE: {mse_ridge:.4f}")
print(f"  Coefficients: {ridge.coef_}")

# --- Lasso Regression ---
lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X_train, y_train)
mse_lasso = mean_squared_error(y_test, lasso.predict(X_test))

print(f"\nLasso Regression:")
print(f"  Best Alpha: {lasso.alpha_:.4f}")
print(f"  MSE: {mse_lasso:.4f}")
print(f"  Coefficients: {lasso.coef_}")

# --- Elastic Net Regression ---
enet = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], cv=5, random_state=42)
enet.fit(X_train, y_train)
mse_enet = mean_squared_error(y_test, enet.predict(X_test))

print(f"\nElastic Net Regression:")
print(f"  Best Alpha: {enet.alpha_:.4f}")
print(f"  MSE: {mse_enet:.4f}")
print(f"  Coefficients: {enet.coef_}")

# --- Best Subset Selection ---
best_mse_subset = float('inf')
best_subset_features = []

for k in range(1, len(X.columns) + 1):
    for features in itertools.combinations(X.columns, k):
        features = list(features)
        
        model = LinearRegression()
        model.fit(X_train_orig[features], y_train)
        
        y_pred = model.predict(X_test_orig[features])
        mse = mean_squared_error(y_test, y_pred)
        
        if mse < best_mse_subset:
            best_mse_subset = mse
            best_subset_features = features

print(f"\nBest Subset Selection:")
print(f"  Best Size: {len(best_subset_features)}")
print(f"  Selected Features: {best_subset_features}")
print(f"  MSE: {best_mse_subset:.4f}")

print("\n----- Final Comparison -----")
print(f"{'Method':<15} | {'MSE':<10}")
print("-" * 30)
print(f"{'Ridge':<15} | {mse_ridge:.4f}")
print(f"{'Lasso':<15} | {mse_lasso:.4f}")
print(f"{'ElasticNet':<15} | {mse_enet:.4f}")
print(f"{'Best Subset':<15} | {best_mse_subset:.4f}")

Dataset Shape: (1503, 6)

Ridge Regression:
  Best Alpha: 7.0548
  MSE: 23.7296
  Coefficients: [-3.97042665 -2.10186543 -3.18850678  1.53761334 -2.07640231]

Lasso Regression:
  Best Alpha: 0.0030
  MSE: 23.6948
  Coefficients: [-4.00496626 -2.15110265 -3.22942077  1.5523164  -2.06786733]

Elastic Net Regression:
  Best Alpha: 0.0069
  MSE: 23.7175
  Coefficients: [-3.98330377 -2.11877692 -3.2030425   1.54179897 -2.07336023]

Best Subset Selection:
  Best Size: 5
  Selected Features: ['Freq', 'Angle', 'Chord', 'Veloc', 'Suction']
  MSE: 23.6879

----- Final Comparison -----
Method          | MSE       
------------------------------
Ridge           | 23.7296
Lasso           | 23.6948
ElasticNet      | 23.7175
Best Subset     | 23.6879

ASSIGNMENT 6¶

Question 1¶

Scatterplot:¶

Model comparison via cross-validation:¶

Question 2¶

Question 3¶

Question 4¶

Results:¶

1) Degree 1 (Linear Model):¶

2) Degree 2 (Quadratic Model):¶

3) Degree 3 (Cubic Model):¶

Question 5¶

Data Analysis:¶

Question 6¶

Results:¶

Question 7¶

Results:¶

Question 8¶

Results Analysis:¶