Least Absolute Shrinkage and Selection Operator (LASSO)¶

1. Big Data Challenge¶

A common challenge in modern predictive modeling involves high-dimensional datasets. This scenario is typically defined by:

  • A high-dimensional feature space ($p$): We have a vast number of potential predictors, far more than can be practically or reliably included in a single model. We often suspect many of these features are redundant or simply noise.
    • Example:
      • A Single Nucleotide Polymorphism, or SNP (pronounced "snip"), is the most common type of genetic variation found among people. Each SNP represents a difference in a single nucleotide, the basic building block of DNA. To clarify, at a specific location in the genome, most people might have the sequence: ...A A T G C T A G... But in a fraction of the population (typically, at least 1%), that single C might be a T: ...A A T G T T A G.... This single-base difference is a SNP.
      • Imagine a study aiming to predict a patient's risk for a complex, polygenic disease like Alzheimer's. The dataset might include:
        • 4-5 million SNPs.
        • Data on the expression levels of 20,000 different genes.
        • Thousands of proteomic or metabolic markers.
      • For each patient, we have millions of potential predictors. It is a biological certainty that the vast majority of these markers are not causally linked to this specific disease. The analytical goal is to sift through this high-dimensional noise to discover the handful of genetic features that are genuinely associated with the outcome.
  • A large sample size ($n$): We have sufficient data (a large $n$) to support the training of complex, high-capacity models without severe instability.
    • Example:
      • Netflix's recommendation engine.
        • Here $n$ is not just the 300+ million subscribers, but the trillions of interaction events collected. This data is a firehose of implicit and explicit user feedback, including:
        • viewing behavior,
        • abandonment data,
        • every search query, pause, rewind, fast-forward,
        • browsing data.
      • This massive $n$ allows Netflix to move beyond simple correlations (e.g., "people who like Stranger Things also like The OA") and model the subtle, non-linear interaction patterns (e.g., "users who watch sci-fi on weekdays but comedy on weekends respond well to this specific new release") that are invisible in smaller datasets.

In these scenarios, the objective is twofold: to maximize predictive accuracy and to achieve parsimony. We want to identify the most critical subset of features that are genuinely associated with the outcome.

This approach shifts the analytical focus from traditional hypothesis-testing (where a model is pre-specified based on domain theory) to a hypothesis-generating framework. We leverage statistical methods to perform data-driven discovery, allowing the algorithm to determine the most impactful variables.

2. Variable Selection Methods¶

2.1 The Best Subset¶

It's the "brute force" method. The idea is simple: just test every single combination of predictors, all $2^p$ of them, and pick the one that gives you the best R-squared, AIC,BIC or whatever metric is being monitored. The problem is that this approach is computationally intractable. The number of models explodes exponentially. Even for a tiny $p=25$, you'd have to test over a million (3,355,4432 to be exact) models. For $p=100$? Impractical.

2.2 Forward Selection¶

It starts with a null model (a model containing only the intercept, with no predictors). At each step, it tests all variables not yet in the model, one at a time. It selects and adds the single best variable, i.e., the one that provides the most statistically significant improvement to the model's fit (e.g., lowest p-value). This process is repeated, adding one variable at a time, until no remaining variable can improve the model's fit beyond a predefined stopping criterion (like a p-value threshold). This approach drastically reduces the computational burden from an exponential search to a quadratic one, requiring at most $1 + \frac{p(p+1)}{2}$ model comparisons. For example, with $p=25$ predictors, this method only needs to test 326 models.

2.3 Backward Selection¶

It works in reverse, starting with the full model, which includes every single predictor. At each step, it tests the impact of removing each variable currently in the model, one at a time. It then permanently removes the single worst predictor, i.e., the one whose removal least harms the model's fit (e.g., the one with the highest p-value). This elimination process is repeated until removing any more variables would significantly degrade the model's fit. This approach has the same quadratic computational cost as forward selection; a sketch of both procedures follows below.
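A minimal sketch of both greedy procedures using scikit-learn's SequentialFeatureSelector. Note that this selector scores candidate features by cross-validated fit rather than by p-values, so it illustrates the greedy search strategy rather than the classical p-value-based stopping rule; the dataset here is synthetic and purely illustrative.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

# Synthetic data: 25 candidate predictors, only 5 of them truly informative.
X, y = make_regression(n_samples=500, n_features=25, n_informative=5,
                       noise=10.0, random_state=42)

# Forward selection: start from the null model and greedily add features.
forward = SequentialFeatureSelector(
    LinearRegression(), n_features_to_select=5, direction='forward', cv=5
)
forward.fit(X, y)
print("Forward selection kept features:", np.flatnonzero(forward.get_support()))

# Backward elimination: start from the full model and greedily drop features.
backward = SequentialFeatureSelector(
    LinearRegression(), n_features_to_select=5, direction='backward', cv=5
)
backward.fit(X, y)
print("Backward elimination kept features:", np.flatnonzero(backward.get_support()))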

2.4 Issues¶

While relatively efficient, the core limitation of stepwise selection methods is that they are not guaranteed to find the optimal model. These methods are greedy algorithms: they make the single best, "short-sighted" decision at each step. This myopic approach can easily trap the algorithm in a local optimum, causing it to miss the true global optimum.

A classic example below illustrates this failure.

Imagine we are predicting $Y$ using four variables ($X_1, X_2, X_3, X_4$). The best single-variable model is $Y \sim X_2$. The true best two-variable model is $Y \sim X_1 + X_4$. In this scenario, Forward Selection will inevitably fail.

  • At Step 1, it compares the four single-variable models: $(X_1)$, $(X_2)$, $(X_3)$, and $(X_4)$. It must select $Y \sim X_2$, as it is the best single predictor.
  • At Step 2, it is now locked into this path. It will only test models that add to $X_2$: $(X_2, X_1)$, $(X_2, X_3)$, and $(X_2, X_4)$.
  • The algorithm will never have the opportunity to test the true best model ($Y \sim X_1 + X_4$). Because it made the "best" decision at Step 1, it is now only searching for models that include $X_2$, and it will never find the superior model that doesn't.

This "path-dependent" nature is the fundamental flaw of greedy selection.

3. Ordinary Least Squares (OLS)¶

The objective of OLS is to find the vector of coefficients, $\boldsymbol{\beta}$, that minimizes the Residual Sum of Squares (RSS). The RSS is the sum of the squared differences between the observed values ($y_i$) and the values predicted by the model ($\hat{y}_i$).

Define the vector of residuals (errors), $\boldsymbol{e}$:$$\boldsymbol{e} = \boldsymbol{y} - \hat{\boldsymbol{y}} = \boldsymbol{y} - \mathbf{X}\boldsymbol{\beta}$$

The RSS, which is our objective function $J(\boldsymbol{\beta})$, is the dot product of the residual vector with itself (which is equivalent to the sum of its squared elements):$$J(\boldsymbol{\beta}) = \boldsymbol{e}^T\boldsymbol{e} = (\boldsymbol{y} - \mathbf{X}\boldsymbol{\beta})^T (\boldsymbol{y} - \mathbf{X}\boldsymbol{\beta})$$ where:

  • $\boldsymbol{y}$ is the $n \times 1$ vector of observed outcomes.
  • $\mathbf{X}$ is the $n \times (p+1)$ design matrix of predictors (with an intercept term).
  • $\boldsymbol{\beta}$ is the $(p+1) \times 1$ vector of coefficients we are trying to find.

The goal is to find the vector $\hat{\boldsymbol{\beta}}$ that minimizes the objective function, the Residual Sum of Squares.

Step 1: Expand the Objective Function:

$$\begin{aligned} J(\boldsymbol{\beta}) &= (\boldsymbol{y}^T - (\mathbf{X}\boldsymbol{\beta})^T) (\boldsymbol{y} - \mathbf{X}\boldsymbol{\beta}) \\ J(\boldsymbol{\beta}) &= (\boldsymbol{y}^T - \boldsymbol{\beta}^T\mathbf{X}^T) (\boldsymbol{y} - \mathbf{X}\boldsymbol{\beta}) \\ J(\boldsymbol{\beta}) &= \boldsymbol{y}^T\boldsymbol{y} - \boldsymbol{y}^T\mathbf{X}\boldsymbol{\beta} - \boldsymbol{\beta}^T\mathbf{X}^T\boldsymbol{y} + \boldsymbol{\beta}^T\mathbf{X}^T\mathbf{X}\boldsymbol{\beta} = \boldsymbol{y}^T\boldsymbol{y} - 2\boldsymbol{\beta}^T\mathbf{X}^T\boldsymbol{y} + \boldsymbol{\beta}^T\mathbf{X}^T\mathbf{X}\boldsymbol{\beta} \end{aligned}$$ Note that $\boldsymbol{y}^T_{1 \times n}\mathbf{X}_{n \times (p+1)}\boldsymbol{\beta}_{(p+1) \times 1}$ and $\boldsymbol{\beta}^T_{1 \times (p+1)}\mathbf{X}^T_{(p+1) \times n}\boldsymbol{y}_{n \times 1}$ are both scalars (i.e., $1 \times 1$ matrices). The transpose of a scalar is just itself. Therefore, they are equal.

Step 2: Differentiate with Respect to $\boldsymbol{\beta}$:

Take the gradient of $J(\boldsymbol{\beta})$ with respect to the vector $\boldsymbol{\beta}$ using two standard rules of matrix calculus: $$ \nabla_{\boldsymbol{\beta}} (\boldsymbol{\beta}^T\boldsymbol{a}) = \boldsymbol{a}$$

$$\nabla_{\boldsymbol{\beta}}(\boldsymbol{\beta}^T\mathbf{A}\boldsymbol{\beta}) = 2\mathbf{A}\boldsymbol{\beta} \quad \text{(for symmetric } \mathbf{A}\text{)}$$

Let's apply these rules to each term in our expanded function:$$\nabla_{\boldsymbol{\beta}} J(\boldsymbol{\beta}) = \nabla_{\boldsymbol{\beta}}(\boldsymbol{y}^T\boldsymbol{y}) - \nabla_{\boldsymbol{\beta}}(2\boldsymbol{\beta}^T\mathbf{X}^T\boldsymbol{y}) + \nabla_{\boldsymbol{\beta}}(\boldsymbol{\beta}^T\mathbf{X}^T\mathbf{X}\boldsymbol{\beta}) =\mathbf{0} - 2\mathbf{X}^T\boldsymbol{y} + 2(\mathbf{X}^T\mathbf{X})\boldsymbol{\beta}$$

Step 3: Set Gradient to Zero and Solve

To find the minimum, set the gradient to the zero vector ($\mathbf{0}$) and solve for the coefficient vector, which we now call $\hat{\boldsymbol{\beta}}$ (the "estimator").

$$2(\mathbf{X}^T\mathbf{X})\hat{\boldsymbol{\beta}} - 2\mathbf{X}^T\boldsymbol{y} = \mathbf{0} \rightarrow (\mathbf{X}^T\mathbf{X})\hat{\boldsymbol{\beta}} = \mathbf{X}^T\boldsymbol{y}$$

Finally, to isolate $\hat{\boldsymbol{\beta}}$, pre-multiply both sides by the inverse of the $(\mathbf{X}^T\mathbf{X})$ matrix:$$(\mathbf{X}^T\mathbf{X})^{-1}(\mathbf{X}^T\mathbf{X})\hat{\boldsymbol{\beta}} = (\mathbf{X}^T\mathbf{X})^{-1}\mathbf{X}^T\boldsymbol{y}$$Since a matrix multiplied by its inverse is the identity matrix ($\mathbf{I}$), we get the final closed-form solution for OLS:$$\hat{\boldsymbol{\beta}} = (\mathbf{X}^T\mathbf{X})^{-1}\mathbf{X}^T\boldsymbol{y}$$
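A minimal numerical sketch of this closed form (a toy example; numerically it is preferable to solve the normal equations with np.linalg.solve, or to use a QR/SVD-based routine, rather than explicitly inverting $\mathbf{X}^T\mathbf{X}$):

import numpy as np

rng = np.random.default_rng(0)
n, p = 200, 3
X = np.column_stack([np.ones(n), rng.normal(size=(n, p))])  # design matrix with intercept column
true_beta = np.array([1.0, 2.0, -3.0, 0.5])
y = X @ true_beta + rng.normal(scale=0.1, size=n)

# Closed-form OLS: solve the normal equations (X^T X) beta = X^T y.
beta_hat = np.linalg.solve(X.T @ X, X.T @ y)
print(beta_hat)  # approximately [1.0, 2.0, -3.0, 0.5]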

While OLS is highly effective at finding an unbiased model, it often fails in modern data settings, particularly those with high dimensionality. OLS is prone to overfitting when $p$ is large, leading to models with high variance that do not generalize well to new data. It is also notoriously unstable in the presence of multicollinearity.

LASSO ($L1$ regression) directly addresses these problems by introducing a penalty term that regularizes the model. This penalty shrinks coefficients to reduce variance and, most importantly, performs automatic feature selection by forcing the coefficients of less-informative (or redundant, correlated) predictors to exactly zero.

4. Regularized Regression¶

4.1 LASSO ($L1$)¶

4.1.1 Objective¶

The goal of LASSO is to find the coefficient vector $\boldsymbol{\beta}$ that minimizes the following objective function:$$J_{\text{LASSO}}(\boldsymbol{\beta}) = \underbrace{\text{RSS}(\boldsymbol{\beta})}_{\text{Data Fit}} + \underbrace{\lambda \cdot \text{L1-Penalty}(\boldsymbol{\beta})}_{\text{Regularization}}$$

In the matrix form the objective function is:

$$J(\boldsymbol{\beta}) = \|\boldsymbol{y} - \mathbf{X}\boldsymbol{\beta}\|_2^2 + \lambda \|\boldsymbol{\beta}\|_1$$

The more intuitive scalar form shows the per-observation calculation:

$$J(\boldsymbol{\beta}) = \sum_{i=1}^{N} (y_i - (\beta_0 + \sum_{j=1}^{p} x_{ij}\beta_j))^2 + \lambda \sum_{j=1}^{p} |\beta_j| $$

  • $y_i$ is the observed value for the $i$-th sample.
  • $x_{ij}$ is the value of the $j$-th predictor for the $i$-th sample.
  • $\beta_j$ is the coefficient for the $j$-th predictor.
  • $\lambda$ is the non-negative tuning parameter that controls the strength of the penalty.
  • $\lambda \sum_{j=1}^{p} |\beta_j|$ is the L1-penalty.

Note that the sum starts from $j=1$, not $j=0$. The intercept term, $\beta_0$, is not penalized.

Also, note that the OLS closed-form solution $\hat{\boldsymbol{\beta}} = (\mathbf{X}^T\mathbf{X})^{-1}\mathbf{X}^T\boldsymbol{y}$ cannot be used for LASSO. The problem is the $L1$-norm, $\lambda \sum |\beta_j|$. The absolute value function $f(z) = |z|$ is non-differentiable at the point $z=0$. Because the gradient is undefined at zero, standard gradient-based optimization methods (e.g., gradient descent) cannot be used directly.

4.1.2 LASSO Fitting¶

4.1.2.1 Overview¶

The sharp "corners" of the L1 penalty (the $\lambda|\beta_j|$ term) are precisely what causes coefficients to be set to zero, but they also mean the objective function is not differentiable everywhere. This prevents us from using standard, smooth optimization methods like Gradient Descent. Instead, we need specialized algorithms. One of the most common and powerful choices is Coordinate Descent, which is used by default in libraries like scikit-learn.

The intuitive algorithm works as follows:

  • Initialize: Start with all coefficients $\beta_j$ at some value (e.g., initialize randomly or all zeros).
  • Cycle: Repeatedly cycle through the predictors, one at a time, from $j = 1$ to $p$.
    • For each predictor $j$:
      • Temporarily "freeze" all other coefficients ($\beta_{k \neq j}$) at their current values.
      • Find the single best value for $\beta_j$. This 1D problem has an exact, simple solution called the soft thresholding operator.
        • Each coefficient $\beta_j$ will be either snapped to zero or moved towards zero.
  • Repeat: The algorithm performs this full cycle updating $\beta_1, \beta_2, \dots, \beta_p$ in sequence, over and over again.
  • Convergence: The process stops when the coefficient vector $\boldsymbol{\beta}$ no longer changes meaningfully between full iterations.

Because the LASSO objective is convex (and its non-smooth $L1$ part is separable across coordinates), this simple iterative process is guaranteed to converge to a global minimum.

4.1.2.2 The Math Behind $L1$ Optimization¶

Objective function:

$$J(\boldsymbol{\beta}) = \frac{1}{2N} \sum_{i=1}^{N} \left(y_i - \sum_{k=1}^{p} x_{ik}\beta_k\right)^2 + \lambda \sum_{k=1}^{p} |\beta_k|$$ Note the $\frac{1}{2N}$ factor in front of the RSS. This scaling is common in machine learning (e.g., in sklearn): it turns the loss term into (half) the Mean Squared Error and simplifies the resulting gradient, since the factor of 2 from differentiating the square cancels the $\frac{1}{2}$.

GOAL: find the value of $\beta_j$ that minimizes the LASSO objective (this is done for each $j=1$ to $p$).

  1. Isolate $\beta_j$:

$$J(\beta_j) = \frac{1}{2N} \sum_{i=1}^{N} \left(y_i - \sum_{k \neq j} x_{ik}\beta_k - x_{ij}\beta_j\right)^2 + \lambda \sum_{k \neq j} |\beta_k| + \lambda |\beta_j|$$

Note that $\lambda \sum_{k \neq j} |\beta_k|$ is a constant and can be ignored for minimization.

  2. Define the Partial Residual:

$$r_{i,j} = y_i - \sum_{k \neq j} x_{ik}\beta_k$$

This $r_{i,j}$ represents the "error" of the model without the contribution of predictor $j$.

  3. Create the 1D Objective Function:

$$ \min_{\beta_j} \left( \underbrace{\frac{1}{2N} \sum_{i=1}^{N} (r_{i,j} - x_{ij}\beta_j)^2}_{\text{$L(\beta_j)$}} + \lambda |\beta_j| \right)$$

  4. Solve the 1D Problem using Subgradients. Assume the data is standardized, i.e., $\sum_{i=1}^{N} x_{ij}^2 = N$.

$$\frac{\partial L}{\partial \beta_j} = \frac{\partial}{\partial \beta_j} \left( \frac{1}{2N} \sum_{i=1}^{N} (r_{i,j} - x_{ij}\beta_j)^2 \right)$$$$ = \frac{1}{2N} \sum_{i=1}^{N} 2(r_{i,j} - x_{ij}\beta_j) \cdot (-x_{ij})$$$$= -\frac{1}{N} \sum_{i=1}^{N} x_{ij}(r_{i,j} - x_{ij}\beta_j)$$$$= -\frac{1}{N} \sum_{i=1}^{N} x_{ij}r_{i,j} + \frac{1}{N} \beta_j \sum_{i=1}^{N} x_{ij}^2$$

Now, apply the standardization, $\sum_{i=1}^{N} x_{ij}^2 = N$:$$\frac{\partial L}{\partial \beta_j} = -\frac{1}{N} \sum_{i=1}^{N} x_{ij}r_{i,j} + \frac{1}{N} \beta_j (N)$$$$\frac{\partial L}{\partial \beta_j} = -\frac{1}{N} \sum_{i=1}^{N} x_{ij}r_{i,j} + \beta_j$$

Let's define $\rho_j = \frac{1}{N} \sum_{i=1}^{N} x_{ij}r_{i,j}$.

This $\rho_j$ (rho) term now represents the sample covariance between predictor $j$ and the partial residuals. So, the gradient of the RSS part is simply: $\frac{\partial L}{\partial \beta_j} = -\rho_j + \beta_j$.

Set the full gradient to $0$:

$$-\rho_j + \beta_j + \underbrace{\partial(\lambda |\beta_j|)}_{\text{Subgradient}} = 0$$

There are, therefore, three cases.

  • Case 1: $\beta_j > 0$ and the subgradient is $\lambda$:

$$-\rho_j + \beta_j + \lambda = 0 \implies \beta_j = \rho_j - \lambda$$

This is only possible if $\rho_j > \lambda$.

  • Case 2: $\beta_j < 0$ and the subgradient is $-\lambda$:

$$-\rho_j + \beta_j - \lambda = 0 \implies \beta_j = \rho_j + \lambda$$

This is only possible if $\rho_j < -\lambda$.

  • Case 3: $\beta_j = 0$:

The subgradient is any value $v \in [-\lambda, +\lambda]$.

$$-\rho_j + (0) + v = 0 \implies v = \rho_j$$

For this to be a valid solution, $v$ must be in its allowed range:

$$\rho_j \in [-\lambda, +\lambda] \implies |\rho_j| \le \lambda$$

The coefficient snaps to zero if the sample covariance $\rho_j$ is smaller in magnitude than the penalty parameter $\lambda$.

The Update Rule:

$$ \hat{\beta}_j \leftarrow \begin{cases} \rho_j - \lambda & \text{if } \rho_j > \lambda \\ \rho_j + \lambda & \text{if } \rho_j < -\lambda \\ 0 & \text{if } |\rho_j| \le \lambda \end{cases} $$

This can be written compactly using the Soft-Thresholding Operator, $S_{\lambda}(\cdot)$:

$$\hat{\beta}_j \leftarrow S_{\lambda}(\rho_j)$$

where:

  • $S_{\lambda}(a) = \text{sign}(a) \cdot \max(0, |a| - \lambda)$

To summarize, if the magnitude of the sample covariance (between the predictor $j$ and the partial residuals) is less than or equal to the penalty $\lambda$, the coefficient is snapped to exactly zero. Otherwise, if the covariance is stronger than the penalty, the coefficient is set to this covariance value, which is then shrunk towards zero by the exact amount $\lambda$.

This is the standard form used in most textbooks and sklearn. The penalty parameter $\lambda$ from the objective function is exactly the threshold $\lambda$ used in the operator.
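The update rule above can be turned into a minimal coordinate-descent sketch. This is an illustration of the derivation, not sklearn's optimized solver; it assumes the columns of $\mathbf{X}$ are standardized so that $\sum_i x_{ij}^2 = N$ and that $\boldsymbol{y}$ is centered so no intercept is needed.

import numpy as np

def soft_threshold(a: float, lam: float) -> float:
    """S_lambda(a) = sign(a) * max(0, |a| - lambda)."""
    return np.sign(a) * max(0.0, abs(a) - lam)

def lasso_coordinate_descent(X: np.ndarray, y: np.ndarray, lam: float,
                             n_iter: int = 100) -> np.ndarray:
    """Minimize (1/2N) * RSS + lam * ||beta||_1 by cyclic coordinate descent.
    Assumes each column of X satisfies sum(x_j**2) == N and y is centered."""
    n, p = X.shape
    beta = np.zeros(p)
    for _ in range(n_iter):
        for j in range(p):
            # Partial residual: model error without the contribution of predictor j.
            r_j = y - X @ beta + X[:, j] * beta[j]
            rho_j = X[:, j] @ r_j / n   # sample covariance with the partial residual
            beta[j] = soft_threshold(rho_j, lam)
    return beta

# Toy example: standardize columns so that sum(x_j**2) == N, and center y.
rng = np.random.default_rng(1)
n, p = 200, 5
X = rng.normal(size=(n, p))
X = (X - X.mean(axis=0)) / X.std(axis=0)
y = X @ np.array([3.0, 0.0, -2.0, 0.0, 0.0]) + rng.normal(scale=0.5, size=n)
y = y - y.mean()

print(lasso_coordinate_descent(X, y, lam=0.1))  # uninformative columns are typically driven to 0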

4.2 RIDGE ($L2$)¶

The objective function $J(\boldsymbol{\beta})$ is:

$$J(\boldsymbol{\beta}) = \text{RSS} + \lambda \sum_{j=1}^{p} \beta_j^2$$

In matrix form, the RSS is $(\mathbf{y} - \mathbf{X}\boldsymbol{\beta})^T(\mathbf{y} - \mathbf{X}\boldsymbol{\beta})$ and the L2 penalty is $\lambda(\boldsymbol{\beta}^T\boldsymbol{\beta})$.

$$J(\boldsymbol{\beta}) = (\mathbf{y} - \mathbf{X}\boldsymbol{\beta})^T(\mathbf{y} - \mathbf{X}\boldsymbol{\beta}) + \lambda \boldsymbol{\beta}^T\boldsymbol{\beta} = \mathbf{y}^T\mathbf{y} - 2\boldsymbol{\beta}^T\mathbf{X}^T\mathbf{y} + \boldsymbol{\beta}^T\mathbf{X}^T\mathbf{X}\boldsymbol{\beta} + \lambda \boldsymbol{\beta}^T\boldsymbol{\beta}$$

Differentiate with Respect to $\boldsymbol{\beta}$:

$$\nabla_{\boldsymbol{\beta}} J(\boldsymbol{\beta}) = \nabla_{\boldsymbol{\beta}}(\mathbf{y}^T\mathbf{y}) - \nabla_{\boldsymbol{\beta}}(2\boldsymbol{\beta}^T\mathbf{X}^T\mathbf{y}) + \nabla_{\boldsymbol{\beta}}(\boldsymbol{\beta}^T\mathbf{X}^T\mathbf{X}\boldsymbol{\beta}) + \nabla_{\boldsymbol{\beta}}(\lambda \boldsymbol{\beta}^T\boldsymbol{\beta}) = \mathbf{0} - 2\mathbf{X}^T\mathbf{y} + 2(\mathbf{X}^T\mathbf{X})\boldsymbol{\beta} + 2\lambda\mathbf{I}\boldsymbol{\beta}$$

Set Gradient to Zero and Solve:

$$\mathbf{0} = -2\mathbf{X}^T\mathbf{y} + 2(\mathbf{X}^T\mathbf{X})\hat{\boldsymbol{\beta}}_{\text{ridge}} + 2\lambda\mathbf{I}\hat{\boldsymbol{\beta}}_{\text{ridge}}$$

$$\mathbf{X}^T\mathbf{y} = (\mathbf{X}^T\mathbf{X})\hat{\boldsymbol{\beta}}_{\text{ridge}} + \lambda\mathbf{I}\hat{\boldsymbol{\beta}}_{\text{ridge}}$$

$$\mathbf{X}^T\mathbf{y} = (\mathbf{X}^T\mathbf{X} + \lambda\mathbf{I})\hat{\boldsymbol{\beta}}_{\text{ridge}}$$

$$(\mathbf{X}^T\mathbf{X} + \lambda\mathbf{I})^{-1}\mathbf{X}^T\mathbf{y} = \underbrace{(\mathbf{X}^T\mathbf{X} + \lambda\mathbf{I})^{-1}(\mathbf{X}^T\mathbf{X} + \lambda\mathbf{I})}_{\text{$\mathbf{I}$}}\hat{\boldsymbol{\beta}}_{\text{ridge}}$$

This gives the final closed-form solution for Ridge Regression:$$\hat{\boldsymbol{\beta}}_{\text{ridge}} = (\mathbf{X}^T\mathbf{X} + \lambda\mathbf{I})^{-1}\mathbf{X}^T\mathbf{y}$$

Unlike LASSO, Ridge Regression has a closed-form solution because its objective function is smooth and differentiable everywhere. The $L2$ penalty $\lambda \boldsymbol{\beta}^T\boldsymbol{\beta}$ does not have the sharp "corners" of the $L1$ penalty. This solution directly addresses the OLS problem of multicollinearity. When predictors are highly correlated, the $\mathbf{X}^T\mathbf{X}$ matrix becomes singular (or near-singular), making its inverse $(\mathbf{X}^T\mathbf{X})^{-1}$ highly unstable or impossible to compute. By adding the $\lambda\mathbf{I}$ term, Ridge adds a positive value to the diagonal of $\mathbf{X}^T\mathbf{X}$, guaranteeing the resulting matrix is invertible. This "ridge" on the diagonal stabilizes the coefficients and, as a side effect, shrinks them toward zero, reducing model variance and preventing overfitting.
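A minimal numpy sketch of the ridge closed form, checked against sklearn's Ridge (no intercept for simplicity; sklearn's alpha plays the role of $\lambda$ here):

import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.default_rng(0)
n, p = 200, 5
X = rng.normal(size=(n, p))
y = X @ np.array([2.0, -1.0, 0.5, 0.0, 3.0]) + rng.normal(scale=0.5, size=n)

lam = 10.0
# Closed-form ridge: (X^T X + lambda * I)^-1 X^T y (no intercept term here).
beta_ridge = np.linalg.solve(X.T @ X + lam * np.eye(p), X.T @ y)

# sklearn's Ridge minimizes ||y - Xw||^2 + alpha * ||w||^2, so alpha = lambda.
sk_ridge = Ridge(alpha=lam, fit_intercept=False).fit(X, y)

print(beta_ridge)
print(sk_ridge.coef_)  # should match the closed-form coefficients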

5. Geometric Interpretation¶

5.1 The LASSO Solution: Intersection of RSS Contours and the L1 Constraint¶

5.1.1 Libraries¶

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso
from typing import Tuple, List, Dict

5.1.2 Configuration¶

In [2]:
# Data Generation
RANDOM_SEED: int = 42
N_SAMPLES_PER_CLUSTER: int = 10
BASE_POINTS: np.ndarray = np.array([
    [1.5, 3.5], [2.5, 1.5], [3.5, 4.5], [4.5, 2.5], [5.5, 5.0]
])
NOISE_STD_X: float = 0.5
NOISE_STD_Y: float = 2.0
TRUE_BETAS: np.ndarray = np.array([2.5, 1.0])

# Model Fitting
# We will fit OLS and LASSO for these three alpha/lambda values
LAMBDAS_TO_PLOT: List[float] = [10.0, 15.0, 20.0]
MODEL_TOLERANCE: float = 1e-8

# Plotting Grid
# Increased grid size for smoother contours over a larger area
GRID_SIZE: int = 900 
GRID_MARGIN: float = 25.0  # Default margin
NUM_CONTOUR_LEVELS: int = 5 # Number of contours to show
# Set to a tuple like (-5, 5) to override automatic plot limits
MANUAL_X_LIMITS: Tuple[float, float] | None = None
MANUAL_Y_LIMITS: Tuple[float, float] | None = None

5.1.3 Functions¶

In [3]:
def generate_data(
    n_per_cluster: int,
    base_pts: np.ndarray,
    true_coefs: np.ndarray,
    noise_x: float,
    noise_y: float,
    seed: int
) -> Tuple[np.ndarray, np.ndarray, int]:
    """
    Generates the clustered sample data.
    """
    np.random.seed(seed)
    
    n_clusters: int = base_pts.shape[0]
    n_samples: int = n_per_cluster * n_clusters
    
    # Repeat each base point n_per_cluster times
    x_base: np.ndarray = np.repeat(base_pts, n_per_cluster, axis=0)
    
    # Add noise to create X
    x: np.ndarray = x_base + np.random.randn(n_samples, 2) * noise_x
    
    # Create y using true coefficients and add noise
    y: np.ndarray = x.dot(true_coefs) + np.random.randn(n_samples) * noise_y
    
    return x, y, n_samples


def fit_models(
    x: np.ndarray, y: np.ndarray, lambdas: List[float], tol: float
) -> Tuple[np.ndarray, Dict[float, np.ndarray]]:
    """
    Fits OLS and LASSO models for each specified lambda.
    
    Returns:
        - OLS solution coefficients.
        - A dictionary mapping {lambda: lasso_solution_coefficients}.
    """
    # OLS solution
    ols: LinearRegression = LinearRegression(fit_intercept=False)
    ols.fit(x, y)
    ols_solution: np.ndarray = ols.coef_
    print(f"OLS Solution (b1, b2): {ols_solution}")
    
    # LASSO solutions
    lasso_solutions: Dict[float, np.ndarray] = {}
    for lambda_val in lambdas:
        lasso: Lasso = Lasso(alpha=lambda_val, fit_intercept=False, tol=tol)
        lasso.fit(x, y)
        lasso_solutions[lambda_val] = lasso.coef_
        print(f"Lasso (lambda={lambda_val}) Solution: {lasso_solutions[lambda_val]}")
        
    return ols_solution, lasso_solutions


def calculate_rss_grid(
    x: np.ndarray, y: np.ndarray, b1: np.ndarray, b2: np.ndarray
) -> np.ndarray:
    """
    Calculates the RSS for every (b1, b2) pair on the grid in a 
    vectorized way.
    """
    # Reshape grid and y for broadcasting
    # b_flat is (grid_size*grid_size, 2)
    b_flat: np.ndarray = np.stack([b1.ravel(), b2.ravel()], axis=1)
    
    # y_col is (n_samples, 1)
    y_col: np.ndarray = y[:, np.newaxis]
    
    # Calculate all predictions at once
    # x is (n_samples, 2), b_flat.T is (2, grid_size*grid_size)
    # all_preds is (n_samples, grid_size*grid_size)
    all_preds: np.ndarray = x @ b_flat.T
    
    # Calculate all residuals
    # y_col broadcasts against all_preds
    # all_residuals is (n_samples, grid_size*grid_size)
    all_residuals: np.ndarray = y_col - all_preds
    
    # Sum of squares for each (b1, b2) pair
    # rss_flat is (grid_size*grid_size,)
    rss_flat: np.ndarray = np.sum(all_residuals**2, axis=0)
    
    # Reshape back to grid shape
    return rss_flat.reshape(b1.shape)


def calculate_loss_grids(
    x: np.ndarray, 
    y: np.ndarray, 
    ols_solution: np.ndarray, 
    grid_size: int, 
    margin: float
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Generates the coordinate and RSS grids for plotting.
    """
    # Create a grid of b1 and b2 values
    b1_range: np.ndarray = np.linspace(
        ols_solution[0] - margin, ols_solution[0] + margin, grid_size
    )
    b2_range: np.ndarray = np.linspace(
        ols_solution[1] - margin, ols_solution[1] + margin, grid_size
    )
    b1_grid, b2_grid = np.meshgrid(b1_range, b2_range)
    
    # Calculate RSS (Residual Sum of Squares) for every point on the grid
    print(f"Calculating RSS grid ({grid_size}x{grid_size} points)...")
    rss_grid: np.ndarray = calculate_rss_grid(x, y, b1_grid, b2_grid)
    
    return b1_grid, b2_grid, rss_grid


def plot_classic_lasso_contour(
    ax: plt.Axes,
    b1_grid: np.ndarray,
    b2_grid: np.ndarray,
    rss_grid: np.ndarray,
    ols_sol: np.ndarray,
    lasso_sol: np.ndarray,
    lambda_val: float,
    lasso_rss_val: float
) -> None:
    """
    Plots the classic 2D LASSO contour graph onto a specific axis.
    """
    
    # 1. Define the specific RSS contour levels
    ols_rss: float = np.min(rss_grid)
    end_level: float = lasso_rss_val
    
    # Create levels from OLS (start) to LASSO (end)
    # We add +2 to get N levels *between* OLS and LASSO
    all_levels: np.ndarray = np.linspace(ols_rss, end_level, NUM_CONTOUR_LEVELS + 2)
    
    # Skip the 0-th level (the OLS solution itself)
    # and take unique values to handle edge cases where
    # ols_rss is very close to lasso_rss_val.
    all_levels = np.unique(all_levels[1:])

    # 2. Plot the RSS Contours (ellipses)
    ax.contour(b1_grid, b2_grid, rss_grid, 
               levels=all_levels, cmap='Blues_r', alpha=0.7)
    
    # Re-plot the "touching" contour with a thicker line
    ax.contour(b1_grid, b2_grid, rss_grid, 
               levels=[lasso_rss_val], cmap='Blues_r', linewidths=2)

    # 3. Plot the OLS Solution (center of ellipses)
    ax.scatter(ols_sol[0], ols_sol[1],
               marker='x', s=150, c='blue', label='OLS Solution')

    # 4. Create the L1 "Diamond" Constraint
    # The solution lies on the boundary t = |b1| + |b2|
    t: float = np.sum(np.abs(lasso_sol))
    
    diamond_x: List[float] = [t, 0, -t, 0, t]
    diamond_y: List[float] = [0, t, 0, -t, 0]

    ax.plot(diamond_x, diamond_y, 'r-',
             label=f'L1 Constraint: |b1|+|b2| <= {t:.2f}')

    # 5. Plot the LASSO Solution
    ax.scatter(lasso_sol[0], lasso_sol[1],
               marker='o', s=150, c='red', label='LASSO Solution')

    ax.set_xlabel(r'$\beta_1$', fontsize=12)
    ax.set_title(f'LASSO 2D Path (lambda = {lambda_val})', fontsize=16)

    ax.axhline(0, color='black', linestyle='--', linewidth=0.5)
    ax.axvline(0, color='black', linestyle='--', linewidth=0.5)
    ax.legend()
    ax.grid(True, linestyle=':', alpha=0.6)
    
    # Enforce equal aspect ratio
    ax.set_aspect('equal', adjustable='box')


def plot_lasso_comparison(
    x: np.ndarray,
    y: np.ndarray,
    b1_grid: np.ndarray,
    b2_grid: np.ndarray,
    rss_grid: np.ndarray,
    ols_solution: np.ndarray,
    lasso_solutions: Dict[float, np.ndarray],
    plot_limit_dist: float | None = None,
    manual_xlim: Tuple[float, float] | None = None,
    manual_ylim: Tuple[float, float] | None = None
) -> None:
    """
    Creates the 3-panel side-by-side comparison plot.
    """
    lambdas: List[float] = list(lasso_solutions.keys())
    
    fig, axes = plt.subplots(1, 3, figsize=(24, 8), sharex=True, sharey=True)
    
    fig.suptitle('LASSO Coefficient Path vs. OLS Solution', fontsize=20)
    
    for i, lambda_val in enumerate(lambdas):
        ax: plt.Axes = axes[i]
        lasso_sol: np.ndarray = lasso_solutions[lambda_val]
        
        # Calculate the specific RSS value at the LASSO solution
        predictions: np.ndarray = x @ lasso_sol
        residuals: np.ndarray = y - predictions
        lasso_rss: float = np.sum(residuals**2)
        
        plot_classic_lasso_contour(
            ax, b1_grid, b2_grid, rss_grid,
            ols_solution, lasso_sol, lambda_val, lasso_rss
        )
        if i == 0:
            ax.set_ylabel(r'$\beta_2$', fontsize=12)

    # Determine plot boundaries
    if manual_xlim is not None and manual_ylim is not None:
        print(f"Using manual plot limits: x={manual_xlim}, y={manual_ylim}")
        axes[0].set_xlim(manual_xlim)
        axes[0].set_ylim(manual_ylim)
    elif plot_limit_dist is not None:
        # Center the plot around 0,0
        x_center, y_center = 0.0, 0.0
        print(f"Using automatic plot limits: [+/-{plot_limit_dist:.2f}]")
        axes[0].set_xlim(x_center - plot_limit_dist, x_center + plot_limit_dist)
        axes[0].set_ylim(y_center - plot_limit_dist, y_center + plot_limit_dist)
    else:
        # Fallback, should not happen if main() logic is correct
        print("Warning: No plot limits set. Using matplotlib defaults.")
        pass

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

5.1.4 Main Script¶

In [4]:
def plot_lasso() -> None:
    """
    Main function to run the data generation, model fitting,
    and plotting.
    """
    # 1. Generate Data
    x, y, n_samples = generate_data(
        N_SAMPLES_PER_CLUSTER, BASE_POINTS, TRUE_BETAS, 
        NOISE_STD_X, NOISE_STD_Y, RANDOM_SEED
    )
    
    # 2. Fit Models
    ols_solution, lasso_solutions = fit_models(
        x, y, LAMBDAS_TO_PLOT, MODEL_TOLERANCE
    )
    
    # 3. Calculate Grids
    print("Calculating plot and grid boundaries...")
    
    plot_limit_dist: float | None = None
    x_center, y_center = 0.0, 0.0
    grid_margin_x: float = 0.0
    grid_margin_y: float = 0.0
    
    if MANUAL_X_LIMITS is None or MANUAL_Y_LIMITS is None:
        print("Calculating automatic plot boundaries...")
        # Find all points that define the plot limits
        all_points_for_limits: List[np.ndarray] = [ols_solution]
        all_points_for_limits.extend(lasso_solutions.values())
        for sol in lasso_solutions.values():
            t: float = np.sum(np.abs(sol))
            all_points_for_limits.extend([
                np.array([t, 0]), np.array([-t, 0]),
                np.array([0, t]), np.array([0, -t])
            ])
        all_points_np: np.ndarray = np.array(all_points_for_limits)
        
        # 1. Determine the PLOT limit (the viewing window)
        max_dist_x: float = np.max(np.abs(all_points_np[:, 0] - x_center))
        max_dist_y: float = np.max(np.abs(all_points_np[:, 1] - y_center))
        # This is the final plot limit (half-width)
        plot_limit_dist = max(max_dist_x, max_dist_y) * 1.2 # 20% pad
        print(f"Automatic plot limits calculated: [+/-{plot_limit_dist:.2f}]")
        
        # 2. Determine the GRID margin (the calculation area)
        grid_margin_x = abs(ols_solution[0] - x_center) + plot_limit_dist
        grid_margin_y = abs(ols_solution[1] - y_center) + plot_limit_dist
    
    else:
        print("Using manual plot limits for grid calculation.")
        x_min, x_max = MANUAL_X_LIMITS
        y_min, y_max = MANUAL_Y_LIMITS
        # Find the distance from OLS center to the furthest corner of the manual box
        grid_margin_x = max(abs(ols_solution[0] - x_min), abs(ols_solution[0] - x_max))
        grid_margin_y = max(abs(ols_solution[1] - y_min), abs(ols_solution[1] - y_max))

    # Add a small buffer so contours don't end exactly at the plot edge
    grid_margin_dynamic: float = max(grid_margin_x, grid_margin_y) * 1.1 
    
    print(f"Calculating grid with margin: {grid_margin_dynamic:.2f}")

    b1_grid, b2_grid, rss_grid = calculate_loss_grids(
        x, y, ols_solution, GRID_SIZE, grid_margin_dynamic
    )
    
    # 4. Generate the final 3-panel plot
    print(f"\nDisplaying 3-panel comparison plot...")
    plot_lasso_comparison(
        x, y, b1_grid, b2_grid, rss_grid, ols_solution, lasso_solutions,
        plot_limit_dist=plot_limit_dist,
        manual_xlim=(-6.5,6.5),
        manual_ylim=(-6.5,6.5)
    )
    
    print("All plots complete.")

5.1.5 The LASSO Intersection Plots¶

In [5]:
plot_lasso()
OLS Solution (b1, b2): [2.39792215 1.07149962]
Lasso (lambda=10.0) Solution: [2.13372055 0.56668381]
Lasso (lambda=15.0) Solution: [2.00161967 0.31427598]
Lasso (lambda=20.0) Solution: [1.86951881 0.06186813]
Calculating plot and grid boundaries...
Calculating automatic plot boundaries...
Automatic plot limits calculated: [+/-3.24]
Calculating grid with margin: 6.20
Calculating RSS grid (900x900 points)...

Displaying 3-panel comparison plot...
Using manual plot limits: x=(-6.5, 6.5), y=(-6.5, 6.5)
[Figure: "LASSO Coefficient Path vs. OLS Solution" - three panels showing the RSS contours, the L1 diamond constraint, and the OLS and LASSO solutions for lambda = 10, 15, and 20]
All plots complete.

The blue ellipses represent the contour plot for the RSS. Every point along a single ellipse corresponds to a combination of coefficients that produces the exact same RSS value. A larger value of $\lambda$ corresponds to a smaller $L1$-norm constraint (a smaller 'diamond'), which induces greater shrinkage and can force coefficients to exactly zero.

5.2 From LASSO to Elastic Net¶

5.2.1 Overview¶

Elastic Net is a regularized linear regression model that linearly combines the $L1$ (LASSO) and $L2$ (Ridge) penalties.

The most common formulation, used by libraries like scikit-learn, defines the objective using two parameters:

  • $\alpha$: The overall strength of the regularization (identical to $\lambda$ in LASSO/Ridge)
  • $\rho$ (rho): The "L1 ratio," which controls the mix between $L1$ and $L2$.
    • If $\rho = 1$, it is a pure LASSO regression.
    • If $\rho = 0$, it is a pure Ridge regression.

The objective function $J(\boldsymbol{\beta})$ to minimize is:

$$J(\boldsymbol{\beta}) = \underbrace{ \frac{1}{2N} \sum_{i=1}^{N} (y_i - \mathbf{x}_i^T\boldsymbol{\beta})^2 }_{\text{RSS (Loss)}} + \underbrace{ \alpha \rho \sum_{j=1}^{p} |\beta_j| }_{\text{L1 Penalty (LASSO)}} + \underbrace{ \frac{1}{2} \alpha (1-\rho) \sum_{j=1}^{p} \beta_j^2 }_{\text{L2 Penalty (Ridge)}}$$

Changing the $L1$ ratio ($\rho$) morphs the shape of the penalty contour from a perfect circle (Ridge, $\rho=0$) to a perfect diamond (LASSO, $\rho=1$).
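A minimal usage sketch with scikit-learn's ElasticNet, where alpha is the overall strength and l1_ratio plays the role of $\rho$ (the data here is synthetic and unrelated to the dataset used elsewhere in this notebook):

import numpy as np
from sklearn.linear_model import ElasticNet, Lasso

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 10))
true_beta = np.concatenate([[3.0, -2.0, 1.5], np.zeros(7)])
y = X @ true_beta + rng.normal(scale=0.5, size=200)

# l1_ratio = rho: 1.0 recovers LASSO, 0.0 recovers Ridge, values in between mix the two penalties.
enet = ElasticNet(alpha=0.1, l1_ratio=0.5).fit(X, y)
lasso = Lasso(alpha=0.1).fit(X, y)

print("Elastic Net coefficients:", np.round(enet.coef_, 3))
print("LASSO coefficients:     ", np.round(lasso.coef_, 3))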

5.2.2 Visualizing the Elastic Net Constraint Region¶

5.2.2.1 Libraries¶

In [6]:
from typing import Optional

5.2.2.2 Functions¶

In [7]:
def plot_lp_norm(
    ax: plt.Axes, 
    p: float, 
    radius: float = 1.0, 
    color: str = 'blue', 
    title: Optional[str] = None
) -> None:
    """
    Plots the contour for the L_p norm (|x|^p + |y|^p)^(1/p) = radius.
    For p=1 (L1 norm), p=2 (L2 norm).
    """
    # Generate points for the positive quadrant and then reflect for other quadrants
    angles: np.ndarray = np.linspace(0, np.pi / 2, 100)
    
    # Use np.power for safe element-wise power
    x: np.ndarray = radius * np.power(np.cos(angles), 2/p)
    y: np.ndarray = radius * np.power(np.sin(angles), 2/p)

    # Reflect to get all four quadrants
    x_full: np.ndarray = np.concatenate([x, -x[::-1], -x, x[::-1]])
    y_full: np.ndarray = np.concatenate([y, y[::-1], -y, -y[::-1]])

    ax.plot(x_full, y_full, color=color, linewidth=2.5)
    ax.set_aspect('equal', adjustable='box')
    ax.axhline(0, color='gray', linestyle='--', linewidth=0.5)
    ax.axvline(0, color='gray', linestyle='--', linewidth=0.5)
    ax.set_xlim(-radius * 1.3, radius * 1.3)
    ax.set_ylim(-radius * 1.3, radius * 1.3)
    ax.set_xticks([])
    ax.set_yticks([])
    if title:
        ax.set_title(title, fontsize=14)

def plot_elastic_net_contour(
    ax: plt.Axes, 
    l1_ratio: float, 
    radius: float = 1.0, 
    color: str = 'purple', 
    title: Optional[str] = None
) -> None:
    """
    Plots the contour for the Elastic Net penalty: 
    l1_ratio * (|b1| + |b2|) + (1 - l1_ratio) * (b1^2 + b2^2) = C.
    
    The constant C is chosen such that the contour's max coordinate
    (the L-infinity norm) is approximately equal to 'radius'.
    """
    # Create a dense grid of b1 and b2 values
    grid_res: int = 300
    b_max: float = radius * 1.5 # Extend grid slightly beyond target radius
    b1_vals: np.ndarray = np.linspace(-b_max, b_max, grid_res)
    b2_vals: np.ndarray = np.linspace(-b_max, b_max, grid_res)
    B1, B2 = np.meshgrid(b1_vals, b2_vals)

    # Calculate the Elastic Net penalty for each point on the grid
    # Note: sklearn's ElasticNet uses 0.5 * (1-l1_ratio) for L2
    # We'll stick to the simpler (1-l1_ratio) for visualization
    l2_ratio: float = 1.0 - l1_ratio
    penalty_grid: np.ndarray = l1_ratio * (np.abs(B1) + np.abs(B2)) + \
                               l2_ratio * (B1**2 + B2**2)

    # We set the contour level 'C' such that the point (radius, 0)
    # lies on the boundary.
    # C = l1_ratio * |radius| + l2_ratio * (radius^2)
    c_target: float = l1_ratio * radius + l2_ratio * (radius**2)

    # Plot the contour at the target C value
    ax.contour(
        B1, B2, penalty_grid, 
        levels=[c_target], 
        colors=[color], 
        linewidths=2.5, 
        linestyles='solid'
    )

    ax.set_aspect('equal', adjustable='box')
    ax.axhline(0, color='gray', linestyle='--', linewidth=0.5)
    ax.axvline(0, color='gray', linestyle='--', linewidth=0.5)
    ax.set_xlim(-radius * 1.3, radius * 1.3)
    ax.set_ylim(-radius * 1.3, radius * 1.3)
    ax.set_xticks([])
    ax.set_yticks([])
    if title:
        ax.set_title(title, fontsize=14)

5.2.2.3 Main Script¶

In [8]:
# --- Main Plotting Script ---
def plot_elastic_net():
    """
    Generates and displays the 1x5 plot of L1, L2, and Elastic Net contours.
    """
    # Create a 1x5 figure
    fig, axes = plt.subplots(1, 5, figsize=(20, 5)) 

    radius: float = 1.0 # Standard radius for all shapes

    # 1. L1 Norm (Diamond)
    plot_lp_norm(
        axes[0], p=1.0, radius=radius, color='darkblue', 
        title='L1 Norm (LASSO)'
    )

    # 2. L2 Norm (Circle)
    plot_lp_norm(
        axes[1], p=2.0, radius=radius, color='darkgreen', 
        title='L2 Norm (Ridge)'
    )

    # 3. Elastic Net (L1 Ratio = 0.25) - More circular
    plot_elastic_net_contour(
        axes[2], l1_ratio=0.25, radius=radius, color='darkred', 
        title='Elastic Net (L1 Ratio=0.25)'
    )

    # 4. Elastic Net (L1 Ratio = 0.5) - Mid-point
    plot_elastic_net_contour(
        axes[3], l1_ratio=0.5, radius=radius, color='purple', 
        title='Elastic Net (L1 Ratio=0.5)'
    )
    
    # 5. Elastic Net (L1 Ratio = 0.75) - More square-like
    plot_elastic_net_contour(
        axes[4], l1_ratio=0.75, radius=radius, color='darkgoldenrod', 
        title='Elastic Net (L1 Ratio=0.75)'
    )
    
    fig.suptitle('Visualization of Regularization Penalty Contours', fontsize=20, y=1.05)
    
    # Adjust layout to prevent title overlap
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

5.2.2.4 Elastic Net Constraint Region¶

In [9]:
plot_elastic_net()
[Figure: "Visualization of Regularization Penalty Contours" - five panels showing the L1 diamond, the L2 circle, and Elastic Net contours for L1 ratios 0.25, 0.5, and 0.75]

Note the following about the difference between $L1$ (LASSO) and $L2$ (RIDGE) regularization techniques.

  1. For LASSO, the subgradient of the $L1$ penalty is constant along each flat 'face' of the diamond constraint (e.g., the vector $[\lambda, \lambda]$ in the positive quadrant). This is fundamentally different from RIDGE, where the gradient of the $L2$ penalty ($2\lambda\boldsymbol{\beta}$) is not constant as it changes continuously and always points radially outward from the origin.

  2. The Ridge solution is the point of tangency between the elliptical RSS contour and the smooth, circular $L2$ constraint. Because a circle has no 'corners,' this tangency point is unlikely to be on an axis, so coefficients are shrunk towards zero but rarely to zero. In contrast, the LASSO solution occurs where the RSS contour hits the $L1$ 'diamond' constraint. This diamond has sharp corners located on the axes. It is geometrically far more probable for the expanding ellipse to hit one of these corners than to be perfectly tangent to a flat 'face.' Because these corners lie on the axes (e.g., where $\beta_1=0$), this property is precisely what enables LASSO to produce sparse solutions by forcing coefficients to exactly zero.

6. Advanced Topics¶

6.1 Confidence Intervals for LASSO Regression¶

Part 1: The Problem: Why Traditional Confidence Intervals Fail for LASSO

Traditional confidence intervals are not valid for LASSO because the very act of selecting variables with LASSO invalidates the statistical theory on which those intervals are built. This is a problem known as post-selection inference, and it is one of the most significant challenges in modern statistics.

A traditional confidence interval, like the one computed for OLS, is built on a crucial assumption:

  • the model is fixed and was specified in advance.

When OLS is run on predictors $X_1, X_2, X_3$, the analyst is pre-committing to that model. The formula for the confidence interval, $\hat{\beta} \pm t_{\alpha/2} \times \text{se}(\hat{\beta})$, and the t-distribution (or normal distribution) it relies on, are all derived assuming this model was the only one ever considered.

LASSO, by its very design, breaks this assumption. It performs two tasks at once using the same set of data:

  • model selection: It decides which predictors are important (i.e., which $\beta_j$ are not zero).
  • coefficient estimation: It simultaneously estimates the values for those non-zero $\beta_j$.

This "data-snooping" results in "bias.", i.e. the data that was used to select the model is being reused to fit the model.

Part 2: A Solution: The Polyhedral Lemma

To construct a valid CI, one must formally account for the selection "bias." The field of post-selection inference does this by conditioning on the selection event. This is where the Polyhedral Lemma comes in.

  1. The "Polyhedral Lemma" is the name for the insight derived from the LASSO's optimality conditions (specifically, the Karush-Kuhn-Tucker or KKT conditions). These optimality conditions are just a set of mathematical rules. For example, they state that for every non-zero coefficient $\hat{\beta}_j$, the "pull" from the data (the gradient of the RSS) must be exactly balanced by the "pull" from the $\lambda$ penalty. For every zero coefficient $\hat{\beta}_k$, the "pull" from the data must be weaker than the $\lambda$ penalty. This set of rules, when applied to the data matrix $\mathbf{X}$, defines a geometric shape in the high-dimensional space of possible outcomes ($\mathbf{y}$). This shape is a polyhedron ($\mathcal{P}$).

  2. How This Builds a Confidence Interval. Now that a precise, geometric description of the selection event ($\mathbf{y} \in \mathcal{P}$) exists, it can be used to correct the biased statistic. The analysis is no longer working with a simple bell curve distribution. It is now working with a truncated distribution. All the properties of this new, truncated distribution can be calculated exactly because the exact geometry of the "cuts" (based on the Polyhedral Lemma) is known. Confidence intervals and p-values are then constructed based on this valid, conditional, truncated distribution. These "polyhedral" or "post-selection" CIs will be wider than the naive, incorrect ones, but they will have the correct coverage. They are statistically honest because they have properly accounted for the fact that the model itself was a product of the data.

Part 3: Coding Implementation

Luckily, the authors of LASSO (Tibshirani et al.) have released an R package to perform selective inference called selectiveInference, which is available on CRAN.

There is another package called RegrCoeffsExplorer which makes plotting CIs for LASSO hassle-free. It is available on GitHub and CRAN. In-depth examples and detailed discussions are included in the package documentation.

6.2 Automated $\lambda$ selection¶

6.2.1 Overview¶

This Python script demonstrates a complete machine learning workflow for finding the optimal hyperparameters for a LASSO regression model using Bayesian Optimization.

The script performs the following steps.

  1. Generates a Complex Dataset:

    • It creates a synthetic dataset of 1,000 samples and 20 features (x0–x19). All 20 features are "true" predictors, meaning they all have a non-zero effect on the target variable.
  2. Injects Multicollinearity:

    • To make the problem challenging, it introduces two types of multicollinearity:
      • Linear: x1 is made correlated with x0.
      • Non-linear: x3 is made correlated with the cube of x2.
  3. Adds Significant Noise: A high level of random noise is added to the target variable y. This is crucial as it causes a standard OLS model to overfit, making regularization (like LASSO) necessary to find a good predictive model.

  4. Finds Optimal alpha: It uses Bayesian Optimization (from the scikit-optimize library) to intelligently search for the best alpha (regularization strength). The optimizer's goal is to find the alpha that minimizes the 5-fold cross-validated Mean Squared Error on the training data.

  5. Trains & Evaluates: It trains a final LASSO model on the full training set using the best alpha found during optimization.

  6. Reconstructs Coefficients: It un-scales the coefficients fitted on standardized features and compares them to the true coefficients used to generate the data.

6.2.2.1 Libraries¶

In [2]:
# pip install scikit-optimize
In [7]:
import numpy as np
import pandas as pd
from typing import Tuple, Dict, List

from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

from skopt import gp_minimize
import skopt.space  # Import the whole module
from skopt.utils import use_named_args

6.2.2.2 Functions¶

In [8]:
def create_correlated_dataset(n_samples: int = 1000, n_features: int = 20) -> \
                               Tuple[pd.DataFrame, pd.Series, np.ndarray]:
    """
    Generates a dataset with 20 features and specific correlations.

    - Features 0 & 1: Linearly correlated
    - Features 2 & 3: Correlated at the cubic level
    - Features 4-19: Also related to y (no pure noise)
    - y: A linear combination of ALL 20 features, plus significant noise.
    """
    print(f"Generating dataset with {n_samples} samples and {n_features} features...")
    # The global np.random.seed(42) call at the top of the script
    # already ensures this function's operations are reproducible.
    
    # 1. Start with a base of random data
    X_data = np.random.rand(n_samples, n_features)
    
    # 2. Create the linear correlation (x1 = 0.9*x0 + noise)
    # Increased to 0.9 for stronger multicollinearity
    X_data[:, 1] = 0.9 * X_data[:, 0] + np.random.rand(n_samples) * 0.1
    
    # 3. Create the cubic correlation (x3 = 0.7*x2^3 + noise)
    X_data[:, 3] = 0.7 * (X_data[:, 2]**3) + np.random.rand(n_samples) * 0.05
    
    # 4. Define the true coefficients
    # All 20 features will have an effect.
    true_coefficients = np.zeros(n_features)
    true_coefficients[0] = 5.0   # Coeff for x0
    true_coefficients[1] = -3.5  # Coeff for x1 (linear)
    true_coefficients[2] = 2.0   # Coeff for x2
    true_coefficients[3] = -1.5  # Coeff for x3 (cubic)
    
    # Assign non-zero coefficients to the remaining 16 features
    # Use a reproducible random draw for these coefficients
    true_coefficients[4:] = np.random.uniform(-2.5, 2.5, size=n_features - 4)
    
    # 5. Create the target variable 'y'
    # y is a linear combination of the features, using the true coefficients.
    # Increasing this noise level makes an unregularized fit overfit more,
    # giving the LASSO alpha a reason to increase.
    y_noise_std = 2.0
    y_data = X_data @ true_coefficients + np.random.randn(n_samples) * y_noise_std
    
    # 6. Create a DataFrame for clarity
    feature_names = [f'x{i}' for i in range(n_features)]
    X_df = pd.DataFrame(X_data, columns=feature_names)
    y_series = pd.Series(y_data, name='y')
    
    return X_df, y_series, true_coefficients


def run_bayesian_optimization(X_train: pd.DataFrame, y_train: pd.Series) -> float:
    """
    Runs Bayesian Optimization to find the best 'alpha' for LASSO.
    
    The optimizer works by minimizing an objective function. Our objective
    function will be the cross-validated Mean Squared Error (MSE).
    """
    print("\nStarting Bayesian Optimization for LASSO 'alpha'...")
    
    # 1. Define the search space for 'alpha'
    # We search on a log scale, as alpha's effect is multiplicative.
    search_space = [
        # Use the fully qualified name to avoid NameError
        skopt.space.Real(1e-6, 100.0, prior='log-uniform', name='alpha')
    ]
    
    # 2. Define the objective function
    # This function will be called by the optimizer.
    # It takes 'alpha' as input and returns the error score to minimize.
    @use_named_args(search_space)
    def objective(alpha: float) -> float:
        """
        Objective function for the optimizer.
        Returns the average cross-validated Mean Squared Error.
        """
        # We MUST use a pipeline to scale data correctly inside
        # each cross-validation fold. This prevents data leakage.
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('lasso', Lasso(alpha=alpha, tol=0.01, max_iter=2000, random_state=42))
        ])
        
        # Get cross-val scores. We use 'neg_mean_squared_error'
        # which returns negative MSE (e.g., -0.25)
        scores = cross_val_score(
            pipeline,
            X_train,
            y_train,
            cv=5,  # 5-fold cross-validation
            scoring='neg_mean_squared_error'
        )
        
        # We want to MINIMIZE the error. 'scores' are negative.
        # So we take the mean (e.g., -0.25) and return its
        # absolute value (0.25). The optimizer will minimize this.
        return np.abs(np.mean(scores))

    # 3. Run the optimization
    # n_calls = number of times to call the 'objective' function
    # n_initial_points = how many times to sample randomly before building the model
    result = gp_minimize(
        objective,
        search_space,
        n_calls=30,
        n_initial_points=10,
        random_state=42,
        verbose=False
    )
    
    best_alpha = result.x[0]
    best_mse = result.fun
    
    print(f"Optimization complete.")
    print(f"Best 'alpha' found: {best_alpha:.6f}")
    print(f"Corresponding Cross-Validated MSE: {best_mse:.6f}")
    
    return best_alpha

6.2.2.3 Bayesian Optimization¶

In [12]:
def bayes_opt_lasso():
    """
    Main function to run the full workflow.
    """
    # 1. Generate the complex dataset
    X, y, true_coefficients = create_correlated_dataset(n_samples=1000, n_features=20)
    
    # 2. Split data into training and a final, held-out test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    # 3. Find the best hyperparameter (alpha) using only the training data
    best_alpha = run_bayesian_optimization(X_train, y_train)
    
    # 4. Train the final model
    # We build the final pipeline and train it on the *entire*
    # training set, using the optimal alpha we just found.
    print("\nTraining final model with best 'alpha'...")
    final_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('lasso', Lasso(alpha=best_alpha, random_state=42))
    ])
    
    final_pipeline.fit(X_train, y_train)
    
    # 5. Evaluate the final model on the held-out test set
    # This gives an unbiased estimate of its performance.
    y_pred = final_pipeline.predict(X_test)
    test_mse = mean_squared_error(y_test, y_pred)
    print(f"Final Model MSE on (unseen) test data: {test_mse:.6f}")
    
    # 6. Reconstruct and compare coefficients
    print("\n--- Coefficient Reconstruction ---")
    
    # Get components from the trained pipeline
    scaler = final_pipeline.named_steps['scaler']
    lasso_model = final_pipeline.named_steps['lasso']
    
    # The model's coefficients are for the *scaled* data.
    recovered_coeffs_scaled = lasso_model.coef_
    
    # To compare them to our true coefficients, we must un-scale them.
    # The formula is: unscaled_coeff = scaled_coeff / scaler.scale_
    # We must handle division by zero if a feature had zero variance (unlikely)
    scaler_scale = scaler.scale_.copy()  # copy so the fitted scaler is not modified
    scaler_scale[scaler_scale == 0] = 1.0  # Avoid division by zero
    
    recovered_coeffs_unscaled = recovered_coeffs_scaled / scaler_scale
    
    # 7. Display results in a clear table
    results_df = pd.DataFrame({
        'Feature': X.columns,
        'True Coefficient': true_coefficients,
        'Recovered Coefficient (LASSO)': recovered_coeffs_unscaled
    })
    
    # Round for clarity
    results_df['True Coefficient'] = \
        results_df['True Coefficient'].round(8)
        
    results_df['Recovered Coefficient (LASSO)'] = \
        results_df['Recovered Coefficient (LASSO)'].round(8)
        
    print(results_df.to_string())

    # --- Analysis of Results ---
    # We have 20 true predictors. We expect LASSO to shrink
    # all coefficients, but ideally, it should not zero out
    # any *true* predictors. With correlated features, it will be
    # interesting to see how it distributes the coefficients.

6.2.2.4 Automated $\lambda$ Selection¶

In [13]:
# Set a random seed for reproducibility
np.random.seed(42)

# Run the optimization
bayes_opt_lasso()
Generating dataset with 1000 samples and 20 features...

Starting Bayesian Optimization for LASSO 'alpha'...
Optimization complete.
Best 'alpha' found: 0.026551
Corresponding Cross-Validated MSE: 4.210436

Training final model with best 'alpha'...
Final Model MSE on (unseen) test data: 4.216186

--- Coefficient Reconstruction ---
   Feature  True Coefficient  Recovered Coefficient (LASSO)
0       x0          5.000000                       1.401596
1       x1         -3.500000                       0.000000
2       x2          2.000000                       1.042879
3       x3         -1.500000                      -0.000000
4       x4          1.323180                       1.340618
5       x5         -2.467418                      -2.326836
6       x6         -0.114750                      -0.243877
7       x7          1.469522                       1.427191
8       x8          0.650224                       0.956561
9       x9         -0.129684                      -0.245890
10     x10          0.723210                       0.088863
11     x11          2.034730                       1.719815
12     x12          0.331909                       0.222191
13     x13          1.498931                       1.210466
14     x14          2.403714                       1.961393
15     x15          1.027841                       0.861661
16     x16          0.366629                       0.263217
17     x17         -2.312570                      -1.943411
18     x18         -0.627611                      -0.933246
19     x19          0.327214                       0.501218

To minimize the $L1$ penalty, LASSO discarded the redundant features x1 and x3 from the correlated pairs, demonstrating its ability to handle multicollinearity.