ASSIGNMENT 5¶
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
Question 1¶
Generate 1000 random variables with 10 dimensions each using randn. Construct a scree plot and find the cumulative percentage of variance. Is there any evidence that one could reduce the dimensionality of these data? If so, what would be a value for k?
from sklearn.decomposition import PCA
# generate 1000 random variables with 10 dimensions
np.random.seed(1905)
n_samples = 1000
n_dimensions = 10
X = np.random.randn(n_samples, n_dimensions)
# fit PCA on the raw data
pca = PCA(n_components=n_dimensions)
pca.fit(X)
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance_ratio)
print("Explained variance ratio by component:")
print(explained_variance_ratio)
print("\nCumulative percentage of variance:")
print(cumulative_variance)
# construct a scree plot
plt.figure(figsize=(10, 6))
x_axis = range(1, n_dimensions + 1)
# bar chart for individual variance
plt.bar(x_axis, explained_variance_ratio, alpha=0.6, color='skyblue', label='Individual Variance')
# line chart for cumulative variance
plt.plot(x_axis, cumulative_variance, marker='o', linestyle='-', color='red', label='Cumulative Variance')
plt.xlabel('Principal Component Index')
plt.ylabel('Explained Variance Ratio')
plt.title('Scree Plot for Randomly Generated Data')
plt.xticks(x_axis)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.show()
print("Component | Individual Var | Cumulative Var")
print("-" * 40)
for i, (var, cum) in enumerate(zip(explained_variance_ratio, cumulative_variance)):
    print(f"PC {i+1:2d} | {var:.4f} | {cum:.4f}")
Explained variance ratio by component:
[0.12127963 0.11018128 0.10833317 0.10644528 0.10243392 0.09567009
 0.09281585 0.09019062 0.08882359 0.08382658]

Cumulative percentage of variance:
[0.12127963 0.23146091 0.33979409 0.44623936 0.54867328 0.64434337
 0.73715922 0.82734984 0.91617342 1.        ]

Component | Individual Var | Cumulative Var
----------------------------------------
PC  1 | 0.1213 | 0.1213
PC  2 | 0.1102 | 0.2315
PC  3 | 0.1083 | 0.3398
PC  4 | 0.1064 | 0.4462
PC  5 | 0.1024 | 0.5487
PC  6 | 0.0957 | 0.6443
PC  7 | 0.0928 | 0.7372
PC  8 | 0.0902 | 0.8273
PC  9 | 0.0888 | 0.9162
PC 10 | 0.0838 | 1.0000
Individual Variance:
The blue bars are nearly uniform in height. The first component explains only about 12% of the variance, and the following bars decrease slowly while staying close to the 0.10 level. Since the data were generated with 'randn', the 10 dimensions are statistically independent: there are no underlying patterns or correlations for PCA to summarize. The flatness of the bars indicates that every dimension is equally important; no single direction carries significantly more information than another.
Cumulative Variance:
The red line is almost linear: it rises steadily from 0.12 to 1.0 without curving or flattening out early. This confirms that each additional principal component contributes a roughly equal amount of information.
"Is there any evidence that one could reduce the dimensionality of these data? If so, what would be a value for k?"
The answer is no. For dimensionality reduction to be worthwhile, the marginal benefit of each additional component should diminish (an "elbow" in the scree plot). There is no such behaviour here: the cumulative variance line is essentially straight, so dropping any components would discard a proportional share of the information. We need all 10 components to capture the full variability of the data.
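For contrast, a hypothetical dataset with genuine low-dimensional structure shows what reducible data look like: when a few latent factors drive all 10 coordinates, the first components dominate and the cumulative variance saturates early. A minimal sketch (synthetic data, not part of the assignment):

```python
import numpy as np
from sklearn.decomposition import PCA

# embed a 2-D latent signal in 10 dimensions plus small noise, so the
# scree plot should show a sharp elbow after the second component
rng = np.random.default_rng(0)
latent = rng.standard_normal((1000, 2))          # 2 true underlying factors
mixing = rng.standard_normal((2, 10))            # spread them across 10 dims
X_corr = latent @ mixing + 0.1 * rng.standard_normal((1000, 10))

pca = PCA().fit(X_corr)
cumvar = np.cumsum(pca.explained_variance_ratio_)
print(f"First 2 PCs explain {cumvar[1]:.1%} of the variance")
```

Here the first two components would capture nearly all the variance, so k = 2 would be an obvious choice, unlike the randn data above.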
Question 2¶
The bank data contain two matrices consisting of measurements made on genuine money and forged money. Combine these two matrices into one and use PPEDA (not PCA) to discover any clusters or groups in the data. Compare your results with the known groups in the data.
from scipy.io import loadmat
from sklearn.decomposition import FastICA
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns
from scipy.stats import mode
# load .mat file
data = loadmat("Data Files/bank.mat")
genuine = data['genuine']
forged = data['forge']
# merge data
X = np.vstack((genuine, forged))
# true Labels for comparison later (0=Genuine, 1=Forged)
y_true = np.array([0] * len(genuine) + [1] * len(forged))
# approximate PPEDA with FastICA (both seek maximally non-Gaussian projections)
ica = FastICA(n_components=2, random_state=42)
X_pp = ica.fit_transform(X)
# use K-Means on the PPEDA projection to find groups
kmeans = KMeans(n_clusters=2, random_state=1905, n_init=10)
y_pred = kmeans.fit_predict(X_pp)
# K-means is an unsupervised clustering method: it does not know that cluster "0" means "Genuine".
# Match the cluster labels to the majority class among the genuine samples.
first_half_mode = mode(y_pred[:len(genuine)], keepdims=True).mode[0]
if first_half_mode == 1:
    y_pred = 1 - y_pred
acc = accuracy_score(y_true, y_pred)
conf_mat = confusion_matrix(y_true, y_pred)
print(f"Clustering Accuracy: {acc:.4f}")
print("Confusion Matrix:")
print(conf_mat)
plt.figure(figsize=(12, 5))
# plot the PPEDA projection
plt.subplot(1, 2, 1)
plt.scatter(X_pp[:, 0], X_pp[:, 1], c=y_pred, cmap='coolwarm', edgecolors='k', s=50)
plt.title(f'PPEDA Discovery (K-Means on ICA)\nAccuracy: {acc*100:.1f}%')
plt.xlabel('Independent Component 1')
plt.ylabel('Independent Component 2')
plt.grid(True, linestyle='--', alpha=0.6)
# plot confusion matrix
plt.subplot(1, 2, 2)
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Predicted Genuine', 'Predicted Forged'],
            yticklabels=['True Genuine', 'True Forged'])
plt.tight_layout()
plt.show()
Clustering Accuracy: 0.9950
Confusion Matrix:
[[ 99   1]
 [  0 100]]
- The scatter plot of the PPEDA components reveals two distinct, clearly separated clusters. The algorithm identified two natural groups in the data without being told the labels beforehand.
- The clustering accuracy is approximately 99.5% (199 out of 200 correctly grouped). The PPEDA technique is extremely effective for this dataset. It was able to recover the labels almost perfectly purely by looking for structure in the data distribution.
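The label-matching step above is needed only because accuracy is label-sensitive. As an alternative sketch, scikit-learn's `adjusted_rand_score` compares partitions directly and is invariant to how cluster labels are numbered (the tiny arrays here are illustrative, not the bank data):

```python
from sklearn.metrics import adjusted_rand_score

# identical partitions with swapped label names still score a perfect 1.0
y_true = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
y_pred = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
print(adjusted_rand_score(y_true, y_pred))  # → 1.0
```

With the bank clustering this would score agreement with the known groups without the mode-based relabeling.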
Question 3¶
Intercity distances for 81 cities of Türkiye are included in the turkiye data. Apply classical multidimensional scaling to reduce the data to two dimensions and construct a scatterplot.
from scipy.io import loadmat
from sklearn.manifold import MDS
# load .mat file
data = loadmat("Data Files/turkiye.mat")
distance_matrix = data['D']
cities_raw = data['cities']
cities = [str(item[0][0]) for item in cities_raw]
# apply Classical MDS
# note: sklearn's MDS minimizes stress via SMACOF rather than performing classical (Torgerson) MDS
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=1905, normalized_stress='auto')
coordinates = mds.fit_transform(distance_matrix)
plt.figure(figsize=(14, 8))
plt.scatter(coordinates[:, 0], coordinates[:, 1], c='red', marker='o', s=50)
for i, city in enumerate(cities):
    plt.annotate(city, (coordinates[i, 0], coordinates[i, 1]),
                 xytext=(5, 2), textcoords='offset points', fontsize=8)
plt.title('Reconstructed Map of Turkey using MDS (Intercity Distances)')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.axis('equal')
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()
- The analysis demonstrates that dimensionality reduction (specifically MDS) is a powerful tool for visualizing relational data. We took an $81 \times 81$ matrix of intercity distances and compressed it into a simple 2D map with minimal loss of structural information. (MDS coordinates are determined only up to rotation and reflection, so the recovered map may appear rotated or mirrored relative to a geographic map.)
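A caveat worth noting: sklearn's `MDS` minimizes stress with the SMACOF algorithm, which is metric MDS rather than the classical (Torgerson) MDS the question asks for. A minimal sketch of true classical MDS, double-centering the squared distance matrix and taking the top eigenvectors, verified on a small synthetic configuration rather than the turkiye matrix:

```python
import numpy as np
from scipy.spatial.distance import pdist, squareform

def classical_mds(D, k=2):
    """Torgerson's classical MDS: double-center the squared distances,
    then take the top-k eigenpairs of the resulting Gram matrix."""
    n = D.shape[0]
    J = np.eye(n) - np.ones((n, n)) / n      # centering matrix
    B = -0.5 * J @ (D ** 2) @ J              # double-centered Gram matrix
    eigvals, eigvecs = np.linalg.eigh(B)
    idx = np.argsort(eigvals)[::-1][:k]      # top-k eigenpairs
    return eigvecs[:, idx] * np.sqrt(np.maximum(eigvals[idx], 0))

# sanity check on known planar points: recovered distances match the input
pts = np.array([[0.0, 0.0], [3.0, 0.0], [0.0, 4.0], [3.0, 4.0]])
D = squareform(pdist(pts))
coords = classical_mds(D, k=2)
print(np.allclose(squareform(pdist(coords)), D))  # → True
```

Running this on the precomputed turkiye distance matrix would give the deterministic classical solution with no `random_state` needed.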
Question 4¶
Load the data sets in posse. Apply PPEDA to each of these data sets and report your results.
from scipy.io import loadmat
from sklearn.decomposition import FastICA
posse_data = loadmat("Data Files/posse.mat")
keys = [k for k in posse_data.keys() if not k.startswith("__")]
print("Datasets in posse.mat:", keys)
def run_ppeda(data_matrix, title):
    X = np.array(data_matrix, dtype=float)
    # center the data (ICA below does its own whitening)
    X_centered = X - X.mean(axis=0)
    # PPEDA via ICA
    ica = FastICA(n_components=2, random_state=0)
    comps = ica.fit_transform(X_centered)
    # plot
    plt.figure(figsize=(6, 5))
    plt.scatter(comps[:, 0], comps[:, 1])
    plt.xlabel("ICA Component 1")
    plt.ylabel("ICA Component 2")
    plt.title(f"PPEDA (ICA) Projection for {title}")
    plt.grid(True)
    plt.show()
    return comps
results = {}
for k in keys:
    print(f"\nRunning PPEDA on dataset: {k}")
    results[k] = run_ppeda(posse_data[k], k)
Datasets in posse.mat: ['croix', 'struct2', 'boite', 'groupe', 'curve', 'spiral']

Running PPEDA on dataset: croix
Running PPEDA on dataset: struct2
Running PPEDA on dataset: boite
Running PPEDA on dataset: groupe
Running PPEDA on dataset: curve
Running PPEDA on dataset: spiral
PPEDA aims to find informative, non-Gaussian low-dimensional projections of multivariate data. If the data contain hidden structure, ICA often reveals it in the projected 2-D scatterplots.
Across all datasets, PPEDA using ICA shows:
No evidence of clusters, no separation along ICA components, no strong non-Gaussian projections.
All scatterplots resemble a single uniform cloud. This indicates that **PPEDA, as implemented here with ICA, does not reveal hidden structure in any of the datasets; they appear to be generated from single, unimodal, fairly symmetric multivariate distributions.**
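For reference, PPEDA proper (Posse's algorithm) searches for the projection maximizing a non-Gaussianity index, removing found structure between sweeps. A crude, hypothetical sketch of the search idea, scoring random 1-D directions by |excess kurtosis| instead of Posse's chi-square index, illustrates why purely Gaussian data yield no interesting projection:

```python
import numpy as np

def crude_projection_pursuit(X, n_tries=500, seed=0):
    """Score random unit directions by how non-Gaussian the 1-D projection
    looks, using |excess kurtosis| as a stand-in projection-pursuit index."""
    rng = np.random.default_rng(seed)
    Xc = X - X.mean(axis=0)
    best_score = -np.inf
    for _ in range(n_tries):
        a = rng.standard_normal(X.shape[1])
        a /= np.linalg.norm(a)               # random unit direction
        z = Xc @ a
        z = (z - z.mean()) / z.std()
        score = abs(np.mean(z ** 4) - 3.0)   # excess kurtosis of the projection
        best_score = max(best_score, score)
    return best_score

# purely Gaussian data: even the best of 500 projections stays near-Gaussian
rng = np.random.default_rng(1)
score = crude_projection_pursuit(rng.standard_normal((2000, 5)))
print(f"best |excess kurtosis| found: {score:.3f}")
```

A dataset with genuine clusters or curves would produce a projection with a much larger index than Gaussian data of the same size.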
Question 5¶
Write MATLAB code that implements the parametric bootstrap. Test it using the forearm data. Assume that the normal distribution is a reasonable model for the data. Use your code to obtain a bootstrap estimate of the standard error and the bias of the coefficient of skewness and the coefficient of kurtosis. Also get a bootstrap percentile interval for the sample central second moment using your parametric bootstrap approach.
from scipy.io import loadmat
import numpy as np
from scipy.stats import skew, kurtosis
mat_data = loadmat("Data Files/forearm.mat")
forearm_data = mat_data['forearm'].flatten()
n = len(forearm_data)
# estimate parameters for the normal model
mu_hat = np.mean(forearm_data)
sigma_hat = np.std(forearm_data, ddof=1)
# calculate Observed Statistics on the original data
# Skewness (Normal = 0)
obs_skew = skew(forearm_data)
# Kurtosis (Fisher=False means Pearson's definition, Normal = 3)
obs_kurt = kurtosis(forearm_data, fisher=False)
# Central Second Moment (Variance estimate)
obs_m2 = np.var(forearm_data)
print(f"Data size (n): {n}")
print(f"Observed Stats -> Skew: {obs_skew:.4f}, Kurtosis: {obs_kurt:.4f}, 2nd Moment: {obs_m2:.4f}")
# parametric Bootstrap Loop
B = 2000
boot_skew = np.zeros(B)
boot_kurt = np.zeros(B)
boot_m2 = np.zeros(B)
np.random.seed(1905)
for i in range(B):
    # sample from the fitted normal distribution N(mu_hat, sigma_hat)
    sample_b = np.random.normal(loc=mu_hat, scale=sigma_hat, size=n)
    # statistics for this synthetic sample
    boot_skew[i] = skew(sample_b)
    boot_kurt[i] = kurtosis(sample_b, fisher=False)
    boot_m2[i] = np.var(sample_b)
# Bias = Average(Bootstrap Stats) - Observed Stat
bias_skew = np.mean(boot_skew) - obs_skew
bias_kurt = np.mean(boot_kurt) - obs_kurt
# Standard Error = Standard Deviation of Bootstrap Stats
se_skew = np.std(boot_skew, ddof=1)
se_kurt = np.std(boot_kurt, ddof=1)
# Bootstrap Percentile Interval for Second Moment
alpha = 0.05
ci_lower = np.percentile(boot_m2, 100 * (alpha / 2))
ci_upper = np.percentile(boot_m2, 100 * (1 - alpha / 2))
print("\n--- Parametric Bootstrap Results ---")
print(f"Skewness: Bias = {bias_skew:.5f}, SE = {se_skew:.5f}")
print(f"Kurtosis: Bias = {bias_kurt:.5f}, SE = {se_kurt:.5f}")
print("-" * 40)
print(f"Sample Central Second Moment (Original): {obs_m2:.4f}")
print(f"95% Bootstrap Percentile Interval: [{ci_lower:.4f}, {ci_upper:.4f}]")
Data size (n): 140
Observed Stats -> Skew: -0.1092, Kurtosis: 2.5720, 2nd Moment: 1.2465

--- Parametric Bootstrap Results ---
Skewness: Bias = 0.10592, SE = 0.20377
Kurtosis: Bias = 0.37782, SE = 0.39357
----------------------------------------
Sample Central Second Moment (Original): 1.2465
95% Bootstrap Percentile Interval: [0.9656, 1.5590]
Observed Statistics:
Skewness (-0.1092): The original data is slightly negatively skewed, but the value is close to 0, supporting the validity of using a Normal approximation.
Kurtosis (2.5720): This is slightly less than the normal value of 3, indicating the data has somewhat lighter tails than a perfect normal distribution (it is mildly platykurtic).
Skewness Bias (0.1059): There is a positive bias. Since the observed skew was negative and the bootstrap samples average around 0, the "bias" calculation reflects the difference between the model (Skew=0) and the data (Skew=-0.11).
Kurtosis Bias (+0.3778): Similarly, there is a positive bias. The bootstrap samples, drawn from a normal distribution, tend toward a kurtosis of about 3. Since the observed kurtosis was lower, the bootstrap average sits above the observed value, producing a positive bias estimate.
Standard Errors: The SEs ($0.20$ for skewness, $0.39$ for kurtosis) give the precision of these estimates. If we were to take another sample of 140 forearms, we would expect the skewness to vary by about $\pm 0.2$.
Interval for the Second Moment (Variance): 95% percentile interval $[0.9656, 1.5590]$: Assuming the data are normally distributed, we are 95% confident that the true population central second moment lies between 0.97 and 1.56. This interval is fairly tight, suggesting that with $n = 140$ observations we have a reasonable estimate of the population variability.
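As a side note, the bootstrap loop above can be vectorized: draw all B parametric resamples in one call and compute the statistic row-wise. A sketch with illustrative stand-in values for `mu_hat` and `sigma_hat` (not the fitted forearm parameters):

```python
import numpy as np
from scipy.stats import skew

# hypothetical parameter values; in the assignment these come from the data
mu_hat, sigma_hat, n, B = 18.8, 1.1, 140, 2000

rng = np.random.default_rng(1905)
samples = rng.normal(mu_hat, sigma_hat, size=(B, n))   # B rows of n draws each
boot_skew = skew(samples, axis=1)                      # one statistic per row
print(boot_skew.shape, f"SE = {np.std(boot_skew, ddof=1):.3f}")
```

The resulting SE should agree closely with the loop version, at a fraction of the runtime, since the per-iteration Python overhead disappears.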
Question 6¶
Load the lawpop data set. These data contain the average LSAT scores (lsat) and the corresponding average undergraduate GPAs (gpa) for the 1973 freshman class at 82 law schools. These data represent the entire population. The data in law comprise a random sample of 15 of these classes.
- Obtain the true population variances for LSAT and GPA.
- Use the sample in law to estimate the population variance using the sample central second moment.
- Get bootstrap estimates of the standard error and the bias in your variance estimate.
- Compare the population variance with the estimated variance.
from scipy.io import loadmat
lawpop_data = loadmat("Data Files/lawpop.mat")
lsat_pop = lawpop_data['lsat'].flatten()
gpa_pop = lawpop_data['gpa'].flatten()
law_data = loadmat('Data Files/law.mat')
sample_data = law_data['law']
lsat_sample = sample_data[:, 0]
gpa_sample = sample_data[:, 1]
# calculate true population variances
true_var_lsat = np.var(lsat_pop, ddof=0)
true_var_gpa = np.var(gpa_pop, ddof=0)
# estimate variance from sample
# the "sample central second moment" is the biased estimator dividing by n (ddof=0).
est_var_lsat = np.var(lsat_sample, ddof=0)
est_var_gpa = np.var(gpa_sample, ddof=0)
# Bootstrap Estimation of Bias and SE
B = 2000
boot_vars_lsat = np.zeros(B)
boot_vars_gpa = np.zeros(B)
n = len(lsat_sample)
np.random.seed(1905)
for i in range(B):
    indices = np.random.randint(0, n, n)
    b_lsat = lsat_sample[indices]
    b_gpa = gpa_sample[indices]
    boot_vars_lsat[i] = np.var(b_lsat, ddof=0)
    boot_vars_gpa[i] = np.var(b_gpa, ddof=0)
# Bias_est = Mean(Bootstrap_Stats) - Observed_Stat
bias_lsat = np.mean(boot_vars_lsat) - est_var_lsat
bias_gpa = np.mean(boot_vars_gpa) - est_var_gpa
# SE_est = Std(Bootstrap_Stats)
se_lsat = np.std(boot_vars_lsat, ddof=1)
se_gpa = np.std(boot_vars_gpa, ddof=1)
print(f"{'Metric':<25} | {'LSAT':<15} | {'GPA':<15}")
print("-" * 60)
print(f"{'True Population Var':<25} | {true_var_lsat:<15.4f} | {true_var_gpa:<15.4f}")
print(f"{'Sample Estimate':<25} | {est_var_lsat:<15.4f} | {est_var_gpa:<15.4f}")
print(f"{'Actual Error (Est-True)':<25} | {est_var_lsat - true_var_lsat:<15.4f} | {est_var_gpa - true_var_gpa:<15.4f}")
print("-" * 60)
print(f"{'Bootstrap Bias':<25} | {bias_lsat:<15.4f} | {bias_gpa:<15.4f}")
print(f"{'Bootstrap SE':<25} | {se_lsat:<15.4f} | {se_gpa:<15.4f}")
Metric                    | LSAT            | GPA
------------------------------------------------------------
True Population Var       | 1463.2720       | 0.0355
Sample Estimate           | 1630.3289       | 0.0553
Actual Error (Est-True)   | 167.0569        | 0.0199
------------------------------------------------------------
Bootstrap Bias            | -118.8863       | -0.0039
Bootstrap SE              | 352.5104        | 0.0124
True vs. Estimated Variance:
LSAT: The true population variance is 1463.27, while the sample estimate is 1630.33. The sample overestimated the variance by about 167 points.
GPA: The true population variance is 0.0355, while the sample estimate is 0.0553. The sample estimate is again higher than the truth.
Bias Analysis:
- Bootstrap Bias: The bootstrap detects a negative bias for both LSAT ($\approx -118.89$) and GPA ($\approx -0.0039$). This means the bootstrap procedure suggests that, on average, this estimator ($S_n^2$) yields values lower than the true parameter of the distribution.
Standard Error (Precision):
- The Standard Error for LSAT variance is quite large (352.51), which is roughly 22% of the estimate itself. This indicates high uncertainty, which is expected given the small sample size.
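The bootstrap bias estimate can be checked against the analytic result: the n-divisor estimator satisfies $E[S_n^2] = \frac{n-1}{n}\sigma^2$, so its bias is about $-\sigma^2/n$. A quick sketch, using the LSAT sample estimate reported above as a stand-in for $\sigma^2$:

```python
# analytic check on the bootstrap bias for the n-divisor variance estimator:
# E[S_n^2] = (n - 1)/n * sigma^2  =>  bias ≈ -sigma^2 / n
n = 15
est_var_lsat = 1630.3289          # sample estimate from the output above
theoretical_bias = -est_var_lsat / n
print(f"theoretical bias ≈ {theoretical_bias:.2f}")  # → theoretical bias ≈ -108.69
```

This is in the same ballpark as the bootstrap's $-118.89$, which supports the bootstrap bias estimate.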
Question 7¶
Using the lawpop data, devise a test statistic to test for the significance of the correlation between LSAT scores and GPA.
Obtain a random sample from the population, use that sample to test your hypothesis, and perform a Monte Carlo simulation to estimate the Type I and Type II error rates of your test.
from scipy.stats import pearsonr
# lawpop_data and law_data are already loaded in the previous question
lsat_pop = lawpop_data['lsat'].flatten()
gpa_pop = lawpop_data['gpa'].flatten()
sample = law_data['law']
lsat_sample = sample[:, 0]
gpa_sample = sample[:, 1]
n = len(lsat_sample)
# Hypothesis Test on the Sample
# Null Hypothesis (H0): rho = 0
alpha = 0.05
r_sample, p_value = pearsonr(lsat_sample, gpa_sample)
reject_h0 = p_value < alpha
print(f"Sample Correlation (r): {r_sample:.4f}")
print(f"P-value: {p_value:.4f}")
print(f"Decision: {'Reject H0' if reject_h0 else 'Fail to Reject H0'}")
# Monte Carlo Simulation
M = 10000
np.random.seed(1905)
# --- Type I Error Simulation (Null Hypothesis is True) ---
# We simulate H0 by sampling LSAT and GPA *independently*.
rejections_type1 = 0
for _ in range(M):
    # independent random samples
    s_lsat = np.random.choice(lsat_pop, n, replace=False)
    s_gpa = np.random.choice(gpa_pop, n, replace=False)
    # test
    _, p_val = pearsonr(s_lsat, s_gpa)
    if p_val < alpha:
        rejections_type1 += 1
type1_error = rejections_type1 / M
# --- Type II Error Simulation (Alternative Hypothesis is True) ---
# We simulate H1 by sampling *pairs* (LSAT, GPA) from the actual population.
# This preserves the true population correlation.
failures_type2 = 0
for _ in range(M):
    # sample indices
    idx = np.random.choice(len(lsat_pop), n, replace=False)
    s_lsat = lsat_pop[idx]
    s_gpa = gpa_pop[idx]
    # test
    _, p_val = pearsonr(s_lsat, s_gpa)
    if p_val >= alpha:  # failed to reject H0
        failures_type2 += 1
type2_error = failures_type2 / M
power = 1 - type2_error
print("-" * 30)
print(f"Monte Carlo Simulation Results (M={M}):")
print(f"Type I Error Rate (Simulated alpha): {type1_error:.4f}")
print(f"Type II Error Rate (beta): {type2_error:.4f}")
print(f"Power of the Test (1 - beta): {power:.4f}")
Sample Correlation (r): 0.7764
P-value: 0.0007
Decision: Reject H0
------------------------------
Monte Carlo Simulation Results (M=10000):
Type I Error Rate (Simulated alpha): 0.0507
Type II Error Rate (beta): 0.0434
Power of the Test (1 - beta): 0.9566
Sample Hypothesis Test:
- The correlation in the sample is strong ($r \approx 0.78$) with a very low p-value ($0.0007$). We can reject the null hypothesis and say that there is a significant correlation between LSAT and GPA.
Type I Error:
- Type I Error: $0.0507$ (approx 5.1%)
- This is very close to the theoretical significance level of $\alpha = 0.05$. It confirms that if there were truly no correlation, our test would only reject the null hypothesis about 5% of the time. The test is valid.
Type II Error & Power:
- Type II Error: $0.0434$ (4.3%)
- Power: $0.9566$ (95.7%)
- The test is very powerful. Even with a small sample size, the true correlation in the population ($\rho \approx 0.78$) is strong enough that we successfully detect it 95.7% of the time. We only commit a Type II error in about 4% of samples.
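A related check that needs only the sample (no access to the population) is a permutation test for the correlation: shuffling one variable destroys any pairing, giving an empirical null distribution for $|r|$. A sketch on synthetic data rather than the law sample:

```python
import numpy as np
from scipy.stats import pearsonr

def permutation_corr_test(x, y, n_perm=5000, seed=0):
    """Permutation test for correlation: shuffling y breaks any pairing,
    giving an empirical null distribution for |r| under H0."""
    rng = np.random.default_rng(seed)
    r_obs = abs(pearsonr(x, y)[0])
    count = 0
    for _ in range(n_perm):
        if abs(pearsonr(x, rng.permutation(y))[0]) >= r_obs:
            count += 1
    return (count + 1) / (n_perm + 1)   # add-one avoids a zero p-value

# strongly paired synthetic data should give a very small p-value
rng = np.random.default_rng(7)
x = rng.standard_normal(15)
y = x + 0.3 * rng.standard_normal(15)
p = permutation_corr_test(x, y)
print(p < 0.01)
```

Applied to the law sample, this would give a distribution-free p-value to compare against `pearsonr`'s t-based one.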
Question 8¶
The remiss data set contains remission times for 42 leukemia patients. Some patients were treated with the drug 6-mercaptopurine (mp), and the rest were part of the control group.
- Use exploratory data analysis techniques to determine a suitable model for each group (e.g., Weibull, exponential, etc.).
- Devise a Monte Carlo hypothesis test to test for the equality of means between the two groups.
- Use the p-value approach.
from scipy.io import loadmat
from scipy.stats import probplot
remiss_data = loadmat("Data Files/remiss.mat")
control = remiss_data['control'].flatten()
mp = remiss_data['mp'].flatten()
# EDA: Probability Plots
# check if the data fits an Exponential distribution.
# If the points lie approximately on the straight line, the Exponential model is appropriate.
plt.figure(figsize=(12, 5))
# plot for control group
plt.subplot(1, 2, 1)
probplot(control, dist="expon", plot=plt)
plt.title("Control Group: Exponential Prob Plot")
plt.grid(True, linestyle='--', alpha=0.5)
# plot for treatment (MP) group
plt.subplot(1, 2, 2)
probplot(mp, dist="expon", plot=plt)
plt.title("Treatment (MP) Group: Exponential Prob Plot")
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
# Monte Carlo Hypothesis Test
# H0: Mean(Control) = Mean(MP)
# H1: Mean(Control) != Mean(MP)
# Test Statistic: Absolute difference in means
obs_mean_diff = np.abs(np.mean(control) - np.mean(mp))
print(f"Observed Difference in Means: {obs_mean_diff:.4f}")
# Setup Monte Carlo
n_sim = 10000
combined_data = np.concatenate([control, mp])
n_control = len(control)
count_extreme = 0
np.random.seed(1905)
for _ in range(n_sim):
    # under H0, group labels are interchangeable
    np.random.shuffle(combined_data)
    # split into groups
    sim_control = combined_data[:n_control]
    sim_mp = combined_data[n_control:]
    sim_diff = np.abs(np.mean(sim_control) - np.mean(sim_mp))
    # count how often the simulated difference >= observed difference
    if sim_diff >= obs_mean_diff:
        count_extreme += 1
p_value = count_extreme / n_sim
print(f"Monte Carlo P-value: {p_value:.4f}")
if p_value < 0.05:
    print("Conclusion: Reject Null Hypothesis (Significant difference in means)")
else:
    print("Conclusion: Fail to Reject Null Hypothesis")
Observed Difference in Means: 4.2143
Monte Carlo P-value: 0.0118
Conclusion: Reject Null Hypothesis (Significant difference in means)
Model Fitting (EDA):
- Looking at the generated probability plots above, the data points generally hug the straight line, though there may be some deviation at the tails. This suggests that the exponential distribution is a reasonable model for remission times in both groups.
Hypothesis Test:
- The mean remission time for the treatment group is higher than the control group by approximately 4.21. P-value ($\approx 0.0118$): In 10,000 random shuffles of the data, a difference of 4.21 or greater occurred only about 1.2% of the time by pure chance. Since $p < 0.05$, we reject the null hypothesis. There is strong statistical evidence that the drug 6-mercaptopurine (mp) significantly prolongs remission times compared to the control group.
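Since the EDA pointed to an exponential model, a parametric Monte Carlo variant of the test is also possible: fit a single pooled exponential under $H_0$ and simulate both groups from it. A sketch on synthetic exponential samples (not the remiss data):

```python
import numpy as np

def parametric_mc_test(g1, g2, n_sim=5000, seed=0):
    """Parametric Monte Carlo test of equal means under a shared exponential
    model: fit one pooled mean under H0, simulate both groups from it, and
    compare simulated mean differences with the observed one."""
    rng = np.random.default_rng(seed)
    obs = abs(np.mean(g1) - np.mean(g2))
    pooled_mean = np.mean(np.concatenate([g1, g2]))   # MLE of the common mean
    count = 0
    for _ in range(n_sim):
        s1 = rng.exponential(pooled_mean, len(g1))
        s2 = rng.exponential(pooled_mean, len(g2))
        if abs(s1.mean() - s2.mean()) >= obs:
            count += 1
    return count / n_sim

# two exponential samples with clearly different means -> small p-value
rng = np.random.default_rng(3)
p = parametric_mc_test(rng.exponential(1.0, 50), rng.exponential(10.0, 50))
print(p < 0.05)
```

The permutation test used above is more robust because it makes no distributional assumption; the parametric version gains power only if the exponential model is correct.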
Question 9¶
The quakes data give the time (in days) between successive earthquakes. Use the bootstrap to obtain an appropriate confidence interval for the average time between earthquakes.
from scipy.io import loadmat
quakes_data = loadmat("Data Files/quakes.mat")
quakes = quakes_data['quakes'].flatten()
n = len(quakes)
# Sample Mean
mu_hat = np.mean(quakes)
print(f"Sample Size: {n}")
print(f"Estimated Mean Time Between Quakes: {mu_hat:.4f} days")
# Bootstrap for Confidence Interval
B = 2000
boot_means = np.zeros(B)
np.random.seed(42)
for i in range(B):
    # resample with replacement
    resample = np.random.choice(quakes, n, replace=True)
    # calculate the mean for this resample
    boot_means[i] = np.mean(resample)
# Construct 95% Confidence Interval
alpha = 0.05
ci_lower = np.percentile(boot_means, 100 * (alpha / 2))
ci_upper = np.percentile(boot_means, 100 * (1 - alpha / 2))
# Calculate Standard Error from bootstrap distribution
se_boot = np.std(boot_means, ddof=1)
print("-" * 40)
print(f"Bootstrap Results (B={B}):")
print(f"Standard Error (SE): {se_boot:.4f}")
print(f"95% Bootstrap Confidence Interval:")
print(f"[{ci_lower:.4f}, {ci_upper:.4f}] days")
Sample Size: 62
Estimated Mean Time Between Quakes: 437.2097 days
----------------------------------------
Bootstrap Results (B=2000):
Standard Error (SE): 50.1872
95% Bootstrap Confidence Interval:
[342.0923, 540.9782] days
Point Estimate: The average time between earthquakes in this sample is approximately 437.21 days.
Confidence Interval: Using the bootstrap percentile method, we are 95% confident that the true average time between earthquakes lies between 342.09 days and 540.98 days. The width of the interval (about 200 days) reflects the variability in earthquake occurrences and the limited sample size ($n=62$).
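For comparison, SciPy (>= 1.7) ships a `bootstrap` helper that automates the resampling loop and also offers the bias-corrected and accelerated (BCa) interval, which adjusts the percentile interval for bias and skewness in the bootstrap distribution. A sketch on stand-in exponential data (the real quakes vector would be passed the same way):

```python
import numpy as np
from scipy.stats import bootstrap

# stand-in data: 62 exponential-like waiting times, not the quakes sample
rng = np.random.default_rng(42)
data = rng.exponential(scale=437.0, size=62)

res = bootstrap((data,), np.mean, confidence_level=0.95,
                n_resamples=2000, method='BCa', random_state=rng)
ci = res.confidence_interval
print(f"BCa 95% CI for the mean: [{ci.low:.1f}, {ci.high:.1f}]")
```

Because inter-quake times are right-skewed, the BCa interval is often a better choice here than the plain percentile interval.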