Source code for linreg_ally.models

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import get_scorer, get_scorer_names

def run_linear_regression(
    dataframe,
    target_column,
    numeric_feats,
    categorical_feats,
    drop_feats=None,
    test_size=0.2,
    random_state=None,
    scoring_metrics=None,
):
    """
    Performs linear regression with preprocessing using sklearn and outputs
    evaluation scoring metrics.

    The inputs are validated, the data is split into train and test sets, a
    pipeline made of the ``preprocess`` column transformer followed by
    ``LinearRegression`` is fitted through ``fit_predict``, and a short
    summary of the test scores is printed to standard output.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Full dataset including features and target.
    target_column : str
        Name of the target variable column.
    numeric_feats : list
        Columns to apply StandardScaler.
    categorical_feats : list
        Columns to apply OneHotEncoder.
    drop_feats : list, optional
        Columns to drop (default None, treated as an empty list).
    test_size : float, optional
        Proportion of the dataset to include in the test split; must lie
        strictly between 0.0 and 1.0 (default 0.2).
    random_state : int, optional
        Controls the shuffling applied to the data before the split
        (default None).
    scoring_metrics : list of str, optional
        Names of sklearn scorers used to evaluate the model. When None
        (the default), ``['r2', 'neg_mean_squared_error']`` is used.

    Returns
    -------
    tuple
        ``(best_model, X_train, X_test, y_train, y_test, scores)``:
        the fitted model, the DataFrames of training and test features,
        the Series of training and test labels, and a dictionary of metric
        scores with metric names as keys.

    Raises
    ------
    ValueError
        When `dataframe` has one column or fewer, `target_column` is not a
        column of `dataframe`, `test_size` is outside (0.0, 1.0), or
        `scoring_metrics` contains a name sklearn does not recognise.
    TypeError
        When `dataframe` is not a pandas DataFrame, `random_state` is
        neither None nor an int, or `scoring_metrics` is not a list of
        strings.

    Examples
    --------
    >>> import pandas as pd
    >>> from linreg_ally.models import run_linear_regression
    >>> df = pd.DataFrame({
    ...     "feature_1": [1, 2, 3, 4],
    ...     "feature_2": [0.5, 0.1, 0.4, 0.9],
    ...     "category": ["a", "b", "a", "b"],
    ...     "target": [1.0, 2.5, 3.4, 4.3]
    ... })
    >>> best_model, X_train, X_test, y_train, y_test, scores = run_linear_regression(
    ...     df, 'target', ['feature_1', 'feature_2'], ['category'], [],
    ...     scoring_metrics=['r2', 'neg_mean_squared_error']
    ... )  # doctest: +SKIP
    >>> sorted(scores)  # doctest: +SKIP
    ['neg_mean_squared_error', 'r2']
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("dataframe must be a pandas DataFrame.")

    if dataframe.shape[1] <= 1:
        raise ValueError("dataframe must contain more than one column.")

    if target_column not in dataframe.columns:
        raise ValueError(f"target_column '{target_column}' is not in the dataframe.")

    if not (0.0 < test_size < 1.0):
        raise ValueError("test_size must be between 0.0 and 1.0.")

    if random_state is not None and not isinstance(random_state, int):
        raise TypeError("random_state must be an integer.")

    # A None sentinel replaces the former list default so that no list object
    # is shared between calls.
    if scoring_metrics is None:
        scoring_metrics = ['r2', 'neg_mean_squared_error']

    if not isinstance(scoring_metrics, list) or not all(
        isinstance(metric, str) for metric in scoring_metrics
    ):
        raise TypeError("scoring_metrics must be a list of strings.")

    # Look the scorer names up once; a set makes each membership test cheap.
    valid_metrics = set(get_scorer_names())
    invalid_metrics = [metric for metric in scoring_metrics if metric not in valid_metrics]
    if invalid_metrics:
        raise ValueError(
            f"The following scoring metrics are not valid: {', '.join(invalid_metrics)}"
        )

    drop_feats = drop_feats if drop_feats is not None else []

    X = dataframe.drop(columns=[target_column])
    y = dataframe[target_column]

    preprocessor = preprocess(numeric_feats, categorical_feats, drop_feats)

    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('model', LinearRegression())
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    best_model, scores = fit_predict(pipe, X_train, X_test, y_train, y_test, scoring_metrics)

    print("Model Summary")
    print("------------------------")
    for metric, score in scores.items():
        print(f"Test {metric}: {score:.3f}")

    return best_model, X_train, X_test, y_train, y_test, scores
def preprocess(numeric_feats, categorical_feats, drop_feats):
    """
    Build the column transformer used ahead of the regression model.

    Parameters
    ----------
    numeric_feats : list
        Columns handed to a ``StandardScaler``.
    categorical_feats : list
        Columns handed to a ``OneHotEncoder``.
    drop_feats : list
        Columns assigned to the ``'drop'`` transformer.

    Returns
    -------
    The object returned by ``make_column_transformer`` for the three
    (transformer, columns) pairs, in the order scaler, encoder, drop.
    """
    scaling_step = (StandardScaler(), numeric_feats)
    encoding_step = (OneHotEncoder(), categorical_feats)
    dropping_step = ('drop', drop_feats)
    return make_column_transformer(scaling_step, encoding_step, dropping_step)
def fit_predict(pipeline, X_train, X_test, y_train, y_test, scoring_metrics):
    """
    Fit ``pipeline`` on the training split and score it on the test split.

    Parameters
    ----------
    pipeline : estimator
        Object exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, X_test :
        Training and test features.
    y_train, y_test :
        Training and test labels.
    scoring_metrics : iterable of str
        Scorer names understood by ``sklearn.metrics.get_scorer``.

    Returns
    -------
    tuple
        ``(best_model, scores)`` where ``best_model`` is the same object as
        ``pipeline`` after fitting and ``scores`` maps each metric name to
        its value on the test split.

    Notes
    -----
    Each score comes from calling the scorer as ``scorer(estimator, X, y)``,
    so the scorer applies its own sign and keyword arguments. The value for
    a ``neg_*`` metric is therefore negative, as the name indicates.
    """
    pipeline.fit(X_train, y_train)
    best_model = pipeline

    scores = {}
    for metric in scoring_metrics:
        scorer = get_scorer(metric)
        # Use the scorer's public call interface rather than the private
        # ``_score_func`` attribute, which skips the scorer's sign and the
        # keyword arguments it was built with.
        scores[metric] = scorer(best_model, X_test, y_test)

    return (best_model, scores)