In [1]:
from dsc80_utils import *
import lec15_util as util

Lecture 15 – Pipelines, Multicollinearity, and Generalization¶

DSC 80, Spring 2025¶

Agenda 📆¶

  • Pipelines.
  • Multicollinearity.
  • Generalization.
    • Bias and variance.
    • Train-test splits.

Pipelines¶


So far, we've used transformers for feature engineering and models for prediction. We can combine these steps into a single Pipeline.

Pipelines in sklearn¶

  • Pipeline allows you to sequentially apply a list of transformers to preprocess the data and, if desired, conclude the sequence with a final predictor for predictive modeling.
  • General template: pl = Pipeline([('trans-1', trans_1), ('trans-2', trans_2), ..., ('model', model)]).
    • The final model is optional.
  • Note that the list we provide Pipeline with must be a list of tuples, where
    • The first element is a "name" (that we choose) for the step.
    • The second element is a transformer or estimator instance.
  • Once a Pipeline is instantiated, you can fit all steps (transformers and model) using pl.fit(X, y).
  • To make predictions using raw, untransformed data, use pl.predict(X).

Our first Pipeline¶

Let's build a Pipeline that:

  • One hot encodes the categorical features in tips.
  • Fits a regression model on the one hot encoded data.
In [2]:
tips = px.data.tips()
In [3]:
tips_cat = tips[['sex', 'smoker', 'day', 'time']]
tips_cat.head()
Out[3]:
sex smoker day time
0 Female No Sun Dinner
1 Male No Sun Dinner
2 Male No Sun Dinner
3 Male No Sun Dinner
4 Female No Sun Dinner
In [4]:
from sklearn.pipeline import Pipeline
In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

pl = Pipeline([
    ('one-hot', OneHotEncoder()),
    ('lin-reg', LinearRegression())
])

Now that pl is instantiated, we fit it the same way we would fit the individual steps.

In [6]:
pl.fit(tips_cat, tips['tip'])
Out[6]:
Pipeline(steps=[('one-hot', OneHotEncoder()), ('lin-reg', LinearRegression())])

Now, to make predictions using raw data, all we need to do is use pl.predict:

In [7]:
pl.predict(tips_cat.iloc[:5])
Out[7]:
array([3.1 , 3.27, 3.27, 3.27, 3.1 ])

pl performs both feature transformation and prediction with just a single call to predict!

We can access individual "steps" of a Pipeline through the named_steps attribute:

In [8]:
pl.named_steps
Out[8]:
{'one-hot': OneHotEncoder(), 'lin-reg': LinearRegression()}
In [9]:
pl.named_steps['one-hot'].transform(tips_cat).toarray()
Out[9]:
array([[1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.]])
In [10]:
pl.named_steps['one-hot'].get_feature_names_out()
Out[10]:
array(['sex_Female', 'sex_Male', 'smoker_No', 'smoker_Yes', 'day_Fri',
       'day_Sat', 'day_Sun', 'day_Thur', 'time_Dinner', 'time_Lunch'],
      dtype=object)
In [11]:
pl.named_steps['lin-reg'].coef_
Out[11]:
array([-0.09,  0.09, -0.04,  0.04, -0.2 , -0.13,  0.14,  0.19,  0.25,
       -0.25])

pl also has a score method, the same way a fit LinearRegression instance does:

In [12]:
# Why is this so low?
pl.score(tips_cat, tips['tip'])
Out[12]:
0.02749679020147555
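
One plausible explanation, as a quick sketch: this Pipeline only sees the four categorical features, none of which says much about 'tip' on its own. Compare with a model built on 'total_bill' alone (lin is just a name chosen for this sketch):

# 'total_bill' by itself should yield a much higher R^2 than the
# categorical features above.
lin = LinearRegression()
lin.fit(tips[['total_bill']], tips['tip'])
lin.score(tips[['total_bill']], tips['tip'])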

More sophisticated Pipelines¶

  • In the previous example, we one hot encoded every input column. What if we want to perform different transformations on different columns?
  • Solution: Use a ColumnTransformer.
    • Instantiate a ColumnTransformer using a list of tuples, where:
      • The first element is a "name" we choose for the transformer.
      • The second element is a transformer instance (e.g. OneHotEncoder()).
      • The third element is a list of relevant column names.

Planning our first ColumnTransformer¶

In [13]:
from sklearn.compose import ColumnTransformer

Let's perform different transformations on the quantitative and categorical features of tips (note that we are not transforming 'tip').

In [14]:
tips_features = tips.drop('tip', axis=1)
tips_features.head()
Out[14]:
total_bill sex smoker day time size
0 16.99 Female No Sun Dinner 2
1 10.34 Male No Sun Dinner 3
2 21.01 Male No Sun Dinner 3
3 23.68 Male No Sun Dinner 2
4 24.59 Female No Sun Dinner 4
  • We will leave the 'total_bill' column untouched.
  • To the 'size' column, we will apply the Binarizer transformer with a threshold of 2 (big tables vs. small tables); see the quick demo after the table below.
  • To the categorical columns, we will apply the OneHotEncoder transformer.
  • In essence, we will create a transformer that reproduces the following DataFrame:
size x0_Female x0_Male x1_No x1_Yes x2_Fri x2_Sat x2_Sun x2_Thur x3_Dinner x3_Lunch total_bill
0 0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 16.99
1 1 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 10.34
2 1 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 21.01
3 0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 23.68
4 1 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 24.59
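
As promised, a quick demo of the Binarizer step on its own. Per sklearn's documentation, values strictly greater than the threshold map to 1, and everything else maps to 0:

from sklearn.preprocessing import Binarizer

# Tables with more than 2 people are "big" (1); tables of 1 or 2 are "small" (0).
Binarizer(threshold=2).fit_transform(tips[['size']])[:5]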

Building a Pipeline using a ColumnTransformer¶

Let's start by creating our ColumnTransformer.

In [15]:
from sklearn.preprocessing import Binarizer

preproc = ColumnTransformer(
    transformers=[
        ('size', Binarizer(threshold=2), ['size']),
        ('categorical_cols', OneHotEncoder(), ['sex', 'smoker', 'day', 'time'])
    ],
    # Specify what to do with all other columns ('total_bill' here) – drop or passthrough.
    remainder='passthrough',
    # Refer to the leftover columns by name rather than by integer index.
    force_int_remainder_cols=False,
)

Now, let's create a Pipeline using preproc as a transformer, and fit it:

In [16]:
pl = Pipeline([
    ('preprocessor', preproc), 
    ('lin-reg', LinearRegression())
])
In [17]:
pl.fit(tips_features, tips['tip'])
Out[17]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(force_int_remainder_cols=False,
                                   remainder='passthrough',
                                   transformers=[('size',
                                                  Binarizer(threshold=2),
                                                  ['size']),
                                                 ('categorical_cols',
                                                  OneHotEncoder(),
                                                  ['sex', 'smoker', 'day',
                                                   'time'])])),
                ('lin-reg', LinearRegression())])

Prediction is as easy as calling predict:

In [18]:
tips_features.head()
Out[18]:
total_bill sex smoker day time size
0 16.99 Female No Sun Dinner 2
1 10.34 Male No Sun Dinner 3
2 21.01 Male No Sun Dinner 3
3 23.68 Male No Sun Dinner 2
4 24.59 Female No Sun Dinner 4
In [19]:
# Note that we fit the Pipeline using tips_features, not tips_features.head()!
pl.predict(tips_features.head())
Out[19]:
array([2.74, 2.32, 3.37, 3.37, 3.75])

Aside: FunctionTransformer¶

A transformer you'll often use as part of a ColumnTransformer is the FunctionTransformer, which enables you to use your own functions on entire columns. Think of it as the sklearn equivalent of apply.

In [20]:
from sklearn.preprocessing import FunctionTransformer
In [21]:
f = FunctionTransformer(np.sqrt)
f.transform([1, 2, 3])
Out[21]:
array([1.  , 1.41, 1.73])
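
FunctionTransformer works on DataFrames, too, which is how you'll typically use it inside a ColumnTransformer. A minimal sketch (log_bill is just a name chosen for illustration):

# Log-transform 'total_bill'; np.log is applied elementwise to the column.
log_bill = FunctionTransformer(np.log)
log_bill.fit_transform(tips[['total_bill']]).head()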

💡 Pro-Tip: Using make_pipeline and make_column_transformer¶

Instead of using the Pipeline and ColumnTransformer classes directly, scikit-learn provides nifty shortcut functions called make_pipeline and make_column_transformer:

In [25]:
# Old code

preproc = ColumnTransformer(
    transformers=[
        ('size', Binarizer(threshold=2), ['size']),
        ('categorical_cols', OneHotEncoder(), ['sex', 'smoker', 'day', 'time'])
    ],
    remainder='passthrough' 
)

pl = Pipeline([
    ('preprocessor', preproc), 
    ('lin-reg', LinearRegression())
])
pl
Out[25]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('size',
                                                  Binarizer(threshold=2),
                                                  ['size']),
                                                 ('categorical_cols',
                                                  OneHotEncoder(),
                                                  ['sex', 'smoker', 'day',
                                                   'time'])])),
                ('lin-reg', LinearRegression())])
In [26]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

preproc = make_column_transformer(
    (Binarizer(threshold=2), ['size']),
    (OneHotEncoder(), ['sex', 'smoker', 'day', 'time']),
    remainder='passthrough',
)

pl = make_pipeline(preproc, LinearRegression())
# Notice that the steps in the pipeline and column transformer are
# automatically named
pl
Out[26]:
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('binarizer',
                                                  Binarizer(threshold=2),
                                                  ['size']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['sex', 'smoker', 'day',
                                                   'time'])])),
                ('linearregression', LinearRegression())])

An example Pipeline¶

One of the transformers we've used before is the StandardScaler transformer, which standardizes columns.

$$z(x_i) = \frac{x_i - \text{mean of } x}{\text{SD of } x}$$
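
As a sanity check, here's a minimal sketch showing that StandardScaler computes exactly these z-scores. One subtlety: StandardScaler uses the population SD (ddof=0), which is also np.std's default.

from sklearn.preprocessing import StandardScaler

# Standardize 'total_bill' by hand, then with StandardScaler.
manual = (tips['total_bill'] - tips['total_bill'].mean()) / np.std(tips['total_bill'])
scaled = StandardScaler().fit_transform(tips[['total_bill']])[:, 0]
np.allclose(manual, scaled)  # Should be True.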

Let's build a Pipeline that:

  • Takes in the 'total_bill' and 'size' features of tips.
  • Standardizes those features.
  • Uses the resulting standardized features to fit a linear model that predicts 'tip'.
In [29]:
# Let's define these once, since we'll use them repeatedly.
X = tips[['total_bill', 'size']]
y = tips['tip']
In [34]:
from sklearn.preprocessing import StandardScaler

model_with_std = make_pipeline(
    StandardScaler(),
    LinearRegression(),
)

model_with_std.fit(X, y)
Out[34]:
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])

How well does our model do? We can compute its $R^2$ and RMSE.

In [35]:
model_with_std.score(X, y)
Out[35]:
0.46786930879612587
In [36]:
from sklearn.metrics import root_mean_squared_error

root_mean_squared_error(y, model_with_std.predict(X))
Out[36]:
np.float64(1.007256127114662)

Does this model perform any better than one that doesn't standardize its features? Let's find out.

In [37]:
model_without_std = LinearRegression()
model_without_std.fit(X, y)
Out[37]:
LinearRegression()
In [38]:
model_without_std.score(X, y)
Out[38]:
0.46786930879612587
In [39]:
root_mean_squared_error(y, model_without_std.predict(X))
Out[39]:
np.float64(1.007256127114662)

No!

The purpose of standardizing features¶

If you're performing "vanilla" linear regression – that is, using the LinearRegression object – then standardizing your features will not change your model's error.

  • There are other models where standardizing your features will improve performance, because those methods assume the features are on comparable scales.
    • Regularized linear regression (see DSC 140A).
    • PCA (assumes centered data, not necessarily standardized; see DSC 140B).
    • Clustering algorithms, e.g. $k$-means clustering (seen in DSC 40A!).
  • There is a benefit to standardizing features when performing vanilla linear regression, as we saw in DSC 40A: the features are brought to the same scale, so the coefficients can be compared directly.
In [40]:
# Total bill, table size.
model_without_std.coef_
Out[40]:
array([0.09, 0.19])
In [41]:
# Total bill, table size.
model_with_std.named_steps['linearregression'].coef_
Out[41]:
array([0.82, 0.18])
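
A quick check of the claim above, as a sketch: standardizing is an invertible linear change of the features, so the two fit models should make identical predictions.

# Should be True: same predictions, despite different coefficients.
np.allclose(model_with_std.predict(X), model_without_std.predict(X))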

Aside: Pipelines of just transformers¶

If you want to apply multiple transformations to the same column in a dataset, you can create a Pipeline just for that column.

For example, suppose we want to:

  • One hot encode the 'sex', 'smoker', and 'time' columns.
  • One hot encode the 'day' column, but as either 'Weekday', 'Sat', or 'Sun'.
  • Binarize the 'size' column.

Here's how we might do that:

In [50]:
def is_weekend(s):
    # The input here is a one-column DataFrame, since we'll select ['day'] in the
    # ColumnTransformer below; .replace works on DataFrames and Series alike.
    return s.replace({'Thur': 'Weekday', 'Fri': 'Weekday'})
In [51]:
pl_day = make_pipeline(
    FunctionTransformer(is_weekend),
    OneHotEncoder(),
)
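
As a sketch of what this transformer-only Pipeline produces on its own: fit_transform first maps 'Thur' and 'Fri' to 'Weekday', then one hot encodes the result, so we should get three columns ('Sat', 'Sun', and 'Weekday', in sorted order).

pl_day.fit_transform(tips[['day']]).toarray()[:5]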
In [52]:
col_trans = make_column_transformer(
    (pl_day, ['day']),
    (OneHotEncoder(drop='first'), ['sex', 'smoker', 'time']),
    (Binarizer(threshold=2), ['size']),
    remainder='passthrough',
    force_int_remainder_cols=False,
)
In [53]:
pl = make_pipeline(
    col_trans,
    LinearRegression(),
)

pl.fit(tips.drop('tip', axis=1), tips['tip'])
Out[53]:
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(force_int_remainder_cols=False,
                                   remainder='passthrough',
                                   transformers=[('pipeline',
                                                  Pipeline(steps=[('functiontransformer',
                                                                   FunctionTransformer(func=<function is_weekend at 0x283024180>)),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder())]),
                                                  ['day']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(drop='first'),
                                                  ['sex', 'smoker', 'time']),
                                                 ('binarizer',
                                                  Binarizer(threshold=2),
                                                  ['size'])])),
                ('linearregression', LinearRegression())])

Question 🤔 (Answer at dsc80.com/q)

Code: weights

How many weights does this linear model have?

In [54]:
pl.named_steps
Out[54]:
{'columntransformer': ColumnTransformer(force_int_remainder_cols=False, remainder='passthrough',
                   transformers=[('pipeline',
                                  Pipeline(steps=[('functiontransformer',
                                                   FunctionTransformer(func=<function is_weekend at 0x283024180>)),
                                                  ('onehotencoder',
                                                   OneHotEncoder())]),
                                  ['day']),
                                 ('onehotencoder', OneHotEncoder(drop='first'),
                                  ['sex', 'smoker', 'time']),
                                 ('binarizer', Binarizer(threshold=2),
                                  ['size'])]),
 'linearregression': LinearRegression()}
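
One way to check your answer after fitting, as a sketch: a linear model has one weight per transformed feature, plus one for the intercept (lin_reg is a name chosen here).

# Count the coefficients of the fit model, plus 1 for the intercept.
lin_reg = pl.named_steps['linearregression']
len(lin_reg.coef_) + 1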

Multicollinearity¶

Heights and weights¶

We have a dataset containing the weights and heights of 25,000 18-year-olds, taken from here.

In [55]:
people_path = Path('data') / 'SOCR-HeightWeight.csv'
people = pd.read_csv(people_path).drop(columns=['Index'])
people.head()
Out[55]:
Height (Inches) Weight (Pounds)
0 65.78 112.99
1 71.52 136.49
2 69.40 153.03
3 68.22 142.34
4 67.79 144.30
In [56]:
people.plot(kind='scatter', x='Height (Inches)', y='Weight (Pounds)', 
            title='Weight vs. Height for 25,000 18 Year Olds')