Source code for linreg_ally.models

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import get_scorer, get_scorer_names


[docs]
def run_linear_regression(dataframe, target_column, numeric_feats, categorical_feats, drop_feats=None, test_size=0.2, random_state=None, scoring_metrics=['r2', 'neg_mean_squared_error']):
    """
    Performs linear regression with preprocessing using sklearn and outputs evaluation scoring metrics.
    
    Parameters
    ----------
    dataframe: `pandas.DataFrame`
        full dataset including features and target.
    target_column: `string`
        name of the target variable column.
    numeric_feats: `list`
        columns to apply StandardScaler.
    categorical_feats: `list`
        columns to apply OneHotEncoder.
    drop_feats: `list`, optional
        columns to drop (default None).
    test_size: `float`, optional
        proportion of the dataset to include in the test split (default 0.2).
    random_state: `int`, optional
        controls the shuffling applied to the data before the split (default None).
    scoring_metrics: `list`, optional
        scoring metrics to evaluate the model (default 'r2', 'neg_mean_squared_error').
    
    Returns
    -------
    tuple
        the fitted model
        DataFrames for the training and test features
        Series for the training and test labels
        dictionary of metric scores with metric names as keys
    
    Raises
    ------
    ValueError
        When `dataframe`, `target_column`, `test_size` or `scoring_metrics` is not within the range of acceptable values
    TypeError
        When `dataframe`, `random_state` or `scoring_metrics` is not the expected type
    
    Examples
    ---------
    >>> import pandas as pd
    >>> from linreg_ally.linreg_ally import run_linear_regression
    >>> df = pd.DataFrame({
    ...     "feature_1": [1, 2, 3, 4],
    ...     "feature_2": [0.5, 0.1, 0.4, 0.9],
    ...     "category": ["a", "b", "a", "b"],
    ...     "target": [1.0, 2.5, 3.4, 4.3]
    ... })
    >>> target_column = 'target'
    >>> numeric_feats = ['feature_1', 'feature_2']
    >>> categorical_feats = ['category']
    >>> drop_feats = []
    >>> best_model, X_train, X_test, y_train, y_test, scores = run_linear_regression(
    ...     df, target_column, numeric_feats, categorical_feats, drop_feats, scoring_metrics=['r2', 'neg_mean_squared_error']
    ... )
    >>> scores
    {'r2': 0.52, 'neg_mean_squared_error': 1.23}
    """

    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("dataframe must be a pandas DataFrame.")
    
    if dataframe.shape[1] <= 1:
        raise ValueError("dataframe must contain more than one column.")
    
    if target_column not in dataframe.columns:
        raise ValueError(f"target_column '{target_column}' is not in the dataframe.")
    
    if not (0.0 < test_size < 1.0):
        raise ValueError("test_size must be between 0.0 and 1.0.")
    
    if random_state is not None and not isinstance(random_state, int):
        raise TypeError("random_state must be an integer.")
    
    if not isinstance(scoring_metrics, list) or not all(isinstance(metric, str) for metric in scoring_metrics):
        raise TypeError("scoring_metrics must be a list of strings.")
    
    if not all(metric in get_scorer_names() for metric in scoring_metrics):
        invalid_metrics = [metric for metric in scoring_metrics if metric not in get_scorer_names()]
        raise ValueError(f"The following scoring metrics are not valid: {', '.join(invalid_metrics)}")
    
    drop_feats = drop_feats if drop_feats is not None else []

    X = dataframe.drop(columns=[target_column])
    y = dataframe[target_column]

    preprocessor = preprocess(numeric_feats, categorical_feats, drop_feats)

    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('model', LinearRegression())
    ])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    (best_model, scores) = fit_predict(pipe, X_train, X_test, y_train, y_test, scoring_metrics)

    print("Model Summary")
    print("------------------------")
    for metric, score in scores.items():
        print(f"Test {metric}: {score:.3f}")

    return best_model, X_train, X_test, y_train, y_test, scores



[docs]
def preprocess(numeric_feats, categorical_feats, drop_feats):
    return make_column_transformer(
        (StandardScaler(), numeric_feats),
        (OneHotEncoder(), categorical_feats),
        ('drop', drop_feats)
    )



[docs]
def fit_predict(pipeline, X_train, X_test, y_train, y_test, scoring_metrics):
    pipeline.fit(X_train, y_train)

    best_model = pipeline

    predictions = best_model.predict(X_test)

    scores = {}
    for metric in scoring_metrics:
        scorer = get_scorer(metric)
        scores[metric] = scorer._score_func(y_test, predictions)

    return (best_model, scores)