from dsc80_utils import *
import lec15_util as util

tips = px.data.tips()

tips_cat = tips[['sex', 'smoker', 'day', 'time']]
tips_cat.head()

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

pl = Pipeline([
    ('one-hot', OneHotEncoder()),
    ('lin-reg', LinearRegression())
])

pl.fit(tips_cat, tips['tip'])

Pipeline(steps=[('one-hot', OneHotEncoder()), ('lin-reg', LinearRegression())])

Pipeline(steps=[('one-hot', OneHotEncoder()), ('lin-reg', LinearRegression())])

OneHotEncoder()

LinearRegression()

pl.predict(tips_cat.iloc[:5])

array([3.1 , 3.27, 3.27, 3.27, 3.1 ])

pl.named_steps

{'one-hot': OneHotEncoder(), 'lin-reg': LinearRegression()}

pl.named_steps['one-hot'].transform(tips_cat).toarray()

array([[1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.]])

pl.named_steps['one-hot'].get_feature_names_out()

array(['sex_Female', 'sex_Male', 'smoker_No', 'smoker_Yes', 'day_Fri',
       'day_Sat', 'day_Sun', 'day_Thur', 'time_Dinner', 'time_Lunch'],
      dtype=object)

pl.named_steps['lin-reg'].coef_

array([-0.09,  0.09, -0.04,  0.04, -0.2 , -0.13,  0.14,  0.19,  0.25,
       -0.25])

# Why is this so low?
pl.score(tips_cat, tips['tip'])

0.02749679020147555

from sklearn.compose import ColumnTransformer

tips_features = tips.drop('tip', axis=1)
tips_features.head()

from sklearn.preprocessing import Binarizer

preproc = ColumnTransformer(
    transformers=[
        ('size', Binarizer(threshold=2), ['size']),
        ('categorical_cols', OneHotEncoder(), ['sex', 'smoker', 'day', 'time'])
    ],
    # Specify what to do with all other columns ('total_bill' here) – drop or passthrough.
    remainder='passthrough',
    # Keep original dtypes for remaining columns
    force_int_remainder_cols=False,
)

pl = Pipeline([
    ('preprocessor', preproc), 
    ('lin-reg', LinearRegression())
])

pl.fit(tips_features, tips['tip'])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(force_int_remainder_cols=False,
                                   remainder='passthrough',
                                   transformers=[('size',
                                                  Binarizer(threshold=2),
                                                  ['size']),
                                                 ('categorical_cols',
                                                  OneHotEncoder(),
                                                  ['sex', 'smoker', 'day',
                                                   'time'])])),
                ('lin-reg', LinearRegression())])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(force_int_remainder_cols=False,
                                   remainder='passthrough',
                                   transformers=[('size',
                                                  Binarizer(threshold=2),
                                                  ['size']),
                                                 ('categorical_cols',
                                                  OneHotEncoder(),
                                                  ['sex', 'smoker', 'day',
                                                   'time'])])),
                ('lin-reg', LinearRegression())])

ColumnTransformer(force_int_remainder_cols=False, remainder='passthrough',
                  transformers=[('size', Binarizer(threshold=2), ['size']),
                                ('categorical_cols', OneHotEncoder(),
                                 ['sex', 'smoker', 'day', 'time'])])

['size']

Binarizer(threshold=2)

['sex', 'smoker', 'day', 'time']

OneHotEncoder()

['total_bill']

passthrough

LinearRegression()

tips_features.head()

# Note that we fit the Pipeline using tips_features, not tips_features.head()!
pl.predict(tips_features.head())

array([2.74, 2.32, 3.37, 3.37, 3.75])

from sklearn.preprocessing import FunctionTransformer

f = FunctionTransformer(np.sqrt)
f.transform([1, 2, 3])

array([1.  , 1.41, 1.73])

# Old code

preproc = ColumnTransformer(
    transformers=[
        ('size', Binarizer(threshold=2), ['size']),
        ('categorical_cols', OneHotEncoder(), ['sex', 'smoker', 'day', 'time'])
    ],
    remainder='passthrough' 
)

pl = Pipeline([
    ('preprocessor', preproc), 
    ('lin-reg', LinearRegression())
])
pl

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('size',
                                                  Binarizer(threshold=2),
                                                  ['size']),
                                                 ('categorical_cols',
                                                  OneHotEncoder(),
                                                  ['sex', 'smoker', 'day',
                                                   'time'])])),
                ('lin-reg', LinearRegression())])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('size',
                                                  Binarizer(threshold=2),
                                                  ['size']),
                                                 ('categorical_cols',
                                                  OneHotEncoder(),
                                                  ['sex', 'smoker', 'day',
                                                   'time'])])),
                ('lin-reg', LinearRegression())])

ColumnTransformer(remainder='passthrough',
                  transformers=[('size', Binarizer(threshold=2), ['size']),
                                ('categorical_cols', OneHotEncoder(),
                                 ['sex', 'smoker', 'day', 'time'])])

['size']

Binarizer(threshold=2)

['sex', 'smoker', 'day', 'time']

OneHotEncoder()

passthrough

LinearRegression()

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

preproc = make_column_transformer(
    (Binarizer(threshold=2), ['size']),
    (OneHotEncoder(), ['sex', 'smoker', 'day', 'time']),
    remainder='passthrough',
)

pl = make_pipeline(preproc, LinearRegression())
# Notice that the steps in the pipeline and column transformer are
# automatically named
pl

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('binarizer',
                                                  Binarizer(threshold=2),
                                                  ['size']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['sex', 'smoker', 'day',
                                                   'time'])])),
                ('linearregression', LinearRegression())])

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('binarizer',
                                                  Binarizer(threshold=2),
                                                  ['size']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['sex', 'smoker', 'day',
                                                   'time'])])),
                ('linearregression', LinearRegression())])

ColumnTransformer(remainder='passthrough',
                  transformers=[('binarizer', Binarizer(threshold=2), ['size']),
                                ('onehotencoder', OneHotEncoder(),
                                 ['sex', 'smoker', 'day', 'time'])])

['size']

Binarizer(threshold=2)

['sex', 'smoker', 'day', 'time']

OneHotEncoder()

passthrough

LinearRegression()

# Let's define these once, since we'll use them repeatedly.
X = tips[['total_bill', 'size']]
y = tips['tip']

from sklearn.preprocessing import StandardScaler

model_with_std = make_pipeline(
    StandardScaler(),
    LinearRegression(),
)

model_with_std.fit(X, y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])

StandardScaler()

LinearRegression()

model_with_std.score(X, y)

0.46786930879612587

from sklearn.metrics import root_mean_squared_error

root_mean_squared_error(y, model_with_std.predict(X))

np.float64(1.007256127114662)

model_without_std = LinearRegression()
model_without_std.fit(X, y)

LinearRegression()

LinearRegression()

model_without_std.score(X, y)

0.46786930879612587

root_mean_squared_error(y, model_without_std.predict(X))

np.float64(1.007256127114662)

# Total bill, table size.
model_without_std.coef_

array([0.09, 0.19])

# Total bill, table size.
model_with_std.named_steps['linearregression'].coef_

array([0.82, 0.18])

def is_weekend(s):
    # The input to is_weekend is a Series!
    return s.replace({'Thur': 'Weekday', 'Fri': 'Weekday'})

pl_day = make_pipeline(
    FunctionTransformer(is_weekend),
    OneHotEncoder(),
)

col_trans = make_column_transformer(
    (pl_day, ['day']),
    (OneHotEncoder(drop='first'), ['sex', 'smoker', 'time']),
    (Binarizer(threshold=2), ['size']),
    remainder='passthrough',
    force_int_remainder_cols=False,
)

pl = make_pipeline(
    col_trans,
    LinearRegression(),
)

pl.fit(tips.drop('tip', axis=1), tips['tip'])

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(force_int_remainder_cols=False,
                                   remainder='passthrough',
                                   transformers=[('pipeline',
                                                  Pipeline(steps=[('functiontransformer',
                                                                   FunctionTransformer(func=<function is_weekend at 0x283024180>)),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder())]),
                                                  ['day']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(drop='first'),
                                                  ['sex', 'smoker', 'time']),
                                                 ('binarizer',
                                                  Binarizer(threshold=2),
                                                  ['size'])])),
                ('linearregression', LinearRegression())])

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(force_int_remainder_cols=False,
                                   remainder='passthrough',
                                   transformers=[('pipeline',
                                                  Pipeline(steps=[('functiontransformer',
                                                                   FunctionTransformer(func=<function is_weekend at 0x283024180>)),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder())]),
                                                  ['day']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(drop='first'),
                                                  ['sex', 'smoker', 'time']),
                                                 ('binarizer',
                                                  Binarizer(threshold=2),
                                                  ['size'])])),
                ('linearregression', LinearRegression())])

ColumnTransformer(force_int_remainder_cols=False, remainder='passthrough',
                  transformers=[('pipeline',
                                 Pipeline(steps=[('functiontransformer',
                                                  FunctionTransformer(func=<function is_weekend at 0x283024180>)),
                                                 ('onehotencoder',
                                                  OneHotEncoder())]),
                                 ['day']),
                                ('onehotencoder', OneHotEncoder(drop='first'),
                                 ['sex', 'smoker', 'time']),
                                ('binarizer', Binarizer(threshold=2),
                                 ['size'])])

['day']

FunctionTransformer(func=<function is_weekend at 0x283024180>)

OneHotEncoder()

['sex', 'smoker', 'time']

OneHotEncoder(drop='first')

['size']

Binarizer(threshold=2)

pl.named_steps

{'columntransformer': ColumnTransformer(force_int_remainder_cols=False, remainder='passthrough',
                   transformers=[('pipeline',
                                  Pipeline(steps=[('functiontransformer',
                                                   FunctionTransformer(func=<function is_weekend at 0x283024180>)),
                                                  ('onehotencoder',
                                                   OneHotEncoder())]),
                                  ['day']),
                                 ('onehotencoder', OneHotEncoder(drop='first'),
                                  ['sex', 'smoker', 'time']),
                                 ('binarizer', Binarizer(threshold=2),
                                  ['size'])]),
 'linearregression': LinearRegression()}

people_path = Path('data') / 'SOCR-HeightWeight.csv'
people = pd.read_csv(people_path).drop(columns=['Index'])
people.head()

people.plot(kind='scatter', x='Height (Inches)', y='Weight (Pounds)', 
            title='Weight vs. Height for 25,000 18 Year Olds')

X = people[['Height (Inches)']]
y = people['Weight (Pounds)']

lr_one_feat = LinearRegression()
lr_one_feat.fit(X, y)

LinearRegression()

LinearRegression()

lr_one_feat.intercept_, lr_one_feat.coef_

(np.float64(-82.5757430645409), array([3.08]))

root_mean_squared_error(y, lr_one_feat.predict(X))

np.float64(10.079113675632819)

people['Height (cm)'] = people['Height (Inches)'] * 2.54 # 1 inch = 2.54 cm

X2 = people[['Height (Inches)', 'Height (cm)']]

lr_two_feat = LinearRegression()
lr_two_feat.fit(X2, y)

LinearRegression()

LinearRegression()

lr_two_feat.intercept_, lr_two_feat.coef_

(np.float64(-82.5751808693277), array([ 3.38e+10, -1.33e+10]))

root_mean_squared_error(y, lr_two_feat.predict(X2))

np.float64(10.07911285540873)

(-80 - 10 * people.iloc[:, 0] + (13 / 2.54) * people.iloc[:, 2]).head()

0    117.35
1    134.55
2    128.20
3    124.65
4    123.36
dtype: float64

(-80 + 10 * people.iloc[:, 0] - (7 / 2.54) * people.iloc[:, 2]).head()

0    117.35
1    134.55
2    128.20
3    124.65
4    123.36
dtype: float64

# Note: The FunctionTransformer is only needed to change the result
# of the OneHotEncoder from a "sparse" matrix to a regular matrix
# so that it can be used with StandardScaler;
# it doesn't change anything mathematically.
pl = Pipeline([
    ("ohe", OneHotEncoder(drop="first")),
    ("ft", FunctionTransformer(lambda X: X.toarray())),
    ("ss", StandardScaler())
])

np.random.seed(23) # For reproducibility.

def sample_from_pop(n=100):
    x = np.linspace(-2, 3, n)
    y = x ** 3 + (np.random.normal(0, 3, size=n))
    return pd.DataFrame({'x': x, 'y': y})

sample_1 = sample_from_pop()
sample_2 = sample_from_pop()

px.scatter(sample_1, x='x', y='y', title='Sample 1')

from sklearn.preprocessing import PolynomialFeatures

# fit_transform fits and transforms the same input.
d2 = PolynomialFeatures(3)
d2.fit_transform(np.array([1, 2, 3, 4, -2]).reshape(-1, 1))

array([[ 1.,  1.,  1.,  1.],
       [ 1.,  2.,  4.,  8.],
       [ 1.,  3.,  9., 27.],
       [ 1.,  4., 16., 64.],
       [ 1., -2.,  4., -8.]])

# Look at the definition of train_and_plot in lec15_util.py if you're curious as to how the plotting works.
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_1, degs=[1, 3, 25], data_name='Sample 1')
fig.update_layout(title='Trained on Sample 1, Performance on Sample 1')

fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_2, degs=[1, 3, 25], data_name='Sample 2')
fig.update_layout(title='Trained on Sample 1, Performance on Sample 2')

util.plot_multiple_models(sample_1, sample_2, degs=[1, 3, 25])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

from sklearn.model_selection import train_test_split

# Read the documentation!
train_test_split?

Signature:
train_test_split(
    *arrays,
    test_size=None,
    train_size=None,
    random_state=None,
    shuffle=True,
    stratify=None,
)
Docstring:
Split arrays or matrices into random train and test subsets.

Quick utility that wraps input validation,
``next(ShuffleSplit().split(X, y))``, and application to input data
into a single call for splitting (and optionally subsampling) data into a
one-liner.

Read more in the :ref:`User Guide <cross_validation>`.

Parameters
----------
*arrays : sequence of indexables with same length / shape[0]
    Allowed inputs are lists, numpy arrays, scipy-sparse
    matrices or pandas dataframes.

test_size : float or int, default=None
    If float, should be between 0.0 and 1.0 and represent the proportion
    of the dataset to include in the test split. If int, represents the
    absolute number of test samples. If None, the value is set to the
    complement of the train size. If ``train_size`` is also None, it will
    be set to 0.25.

train_size : float or int, default=None
    If float, should be between 0.0 and 1.0 and represent the
    proportion of the dataset to include in the train split. If
    int, represents the absolute number of train samples. If None,
    the value is automatically set to the complement of the test size.

random_state : int, RandomState instance or None, default=None
    Controls the shuffling applied to the data before applying the split.
    Pass an int for reproducible output across multiple function calls.
    See :term:`Glossary <random_state>`.

shuffle : bool, default=True
    Whether or not to shuffle the data before splitting. If shuffle=False
    then stratify must be None.

stratify : array-like, default=None
    If not None, data is split in a stratified fashion, using this as
    the class labels.
    Read more in the :ref:`User Guide <stratification>`.

Returns
-------
splitting : list, length=2 * len(arrays)
    List containing train-test split of inputs.

    .. versionadded:: 0.16
        If the input is sparse, the output will be a
        ``scipy.sparse.csr_matrix``. Else, output type is the same as the
        input type.

Examples
--------
>>> import numpy as np
>>> from sklearn.model_selection import train_test_split
>>> X, y = np.arange(10).reshape((5, 2)), range(5)
>>> X
array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])
>>> list(y)
[0, 1, 2, 3, 4]

>>> X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)
...
>>> X_train
array([[4, 5],
       [0, 1],
       [6, 7]])
>>> y_train
[2, 0, 3]
>>> X_test
array([[2, 3],
       [8, 9]])
>>> y_test
[1, 4]

>>> train_test_split(y, shuffle=False)
[[0, 1, 2], [3, 4]]
File:      ~/miniforge3/envs/dsc80/lib/python3.11/site-packages/sklearn/model_selection/_split.py
Type:      function

X = tips.drop('tip', axis=1)
y = tips['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # We don't have to choose 0.25.

print('Rows in X_train:', X_train.shape[0])
display(X_train.head())
print('Rows in X_test:', X_test.shape[0])
display(X_test.head())

Rows in X_train: 195

Rows in X_test: 49

X_train.shape[0] / tips.shape[0]

0.7991803278688525

tips.head()

X = tips[['total_bill', 'size']] # For this example, we'll use just the already-quantitative columns in tips.
y = tips['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # random_state is like np.random.seed.

lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

LinearRegression()

pred_train = lr.predict(X_train)
rmse_train = root_mean_squared_error(y_train, pred_train)
rmse_train

np.float64(0.9803205287924736)

pred_test = lr.predict(X_test)
rmse_test = root_mean_squared_error(y_test, pred_test)
rmse_test

np.float64(1.1381771291131253)

fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_2, degs=[1, 3, 25], data_name='Sample 2')
fig.update_layout(title='Trained on Sample 1, Performance on Sample 2')

px.scatter(sample_1, x='x', y='y', title='Sample 1')

X = sample_1[['x']]
y = sample_1['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

train_errs = []
test_errs = []

for d in range(1, 26):
    pl = make_pipeline(PolynomialFeatures(d), LinearRegression())
    pl.fit(X_train, y_train)
    train_errs.append(root_mean_squared_error(y_train, pl.predict(X_train)))
    test_errs.append(root_mean_squared_error(y_test, pl.predict(X_test)))

errs = pd.DataFrame({'Train Error': train_errs, 'Test Error': test_errs})

fig = px.line(errs)
fig.update_layout(showlegend=True, xaxis_title='Polynomial Degree', yaxis_title='RMSE')

	Height (Inches)	Weight (Pounds)
0	65.78	112.99
1	71.52	136.49
2	69.40	153.03
3	68.22	142.34
4	67.79	144.30

	sex	smoker	day	time
0	Female	No	Sun	Dinner
1	Male	No	Sun	Dinner
2	Male	No	Sun	Dinner
3	Male	No	Sun	Dinner
4	Female	No	Sun	Dinner

	total_bill	sex	smoker	day	time	size
0	16.99	Female	No	Sun	Dinner	2
1	10.34	Male	No	Sun	Dinner	3
2	21.01	Male	No	Sun	Dinner	3
3	23.68	Male	No	Sun	Dinner	2
4	24.59	Female	No	Sun	Dinner	4

	total_bill	sex	smoker	day	time	size
146	18.64	Female	No	Thur	Lunch	3
224	13.42	Male	Yes	Fri	Lunch	2
134	18.26	Female	No	Thur	Lunch	2
131	20.27	Female	No	Thur	Lunch	2
147	11.87	Female	No	Thur	Lunch	2

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

Lecture 15 – Pipelines, Multicollinearity, and Generalization¶

DSC 80, Spring 2025¶

Agenda 📆¶

Pipelines¶

Pipelines in sklearn¶

Our first Pipeline¶

More sophisticated Pipelines¶

Planning our first ColumnTransformer¶

Building a Pipeline using a ColumnTransformer¶

Aside: FunctionTransformer¶

💡 Pro-Tip: Using make_pipeline and make_column_transformer¶

An example Pipeline¶

The purpose of standardizing features¶

Aside: Pipelines of just transformers¶

Question 🤔 (Answer at dsc80.com/q)

Multicollinearity¶

Heights and weights¶

Motivating example¶

Redundant features¶

Infinitely many parameter choices¶

Multicollinearity¶

Example: One hot encoding¶

Key takeaways¶

Question 🤔 (Answer at dsc80.com/q)

Generalization¶

Motivation¶

Evaluating the quality of a model¶

Example: Overfitting and underfitting¶

Polynomial regression¶

Bias and variance¶

The bias-variance decomposition¶

Navigating the bias-variance tradeoff¶

Question 🤔 (Answer at dsc80.com/q)

Train-test splits¶

Avoiding overfitting¶

Train-test split 🚆¶

Example train-test split¶

Hyperparameters¶

Example: Polynomial regression¶

Parameters vs. hyperparameters¶

Training error vs. test error¶

Training error vs. test error¶

Polynomial degree vs. train/test error¶

Training error vs. test error¶

Conducting train-test splits¶

But wait...¶

Summary, next time¶

Summary¶

Next time¶

`Pipeline`s in `sklearn`¶

Our first `Pipeline`¶

More sophisticated `Pipeline`s¶

Planning our first `ColumnTransformer`¶

Building a `Pipeline` using a `ColumnTransformer`¶

Aside: `FunctionTransformer`¶

💡 Pro-Tip: Using `make_pipeline` and `make_column_transformer`¶

An example `Pipeline`¶

Aside: `Pipeline`s of just transformers¶