from dsc80_utils import *
import lec16_util as util
Announcements 📣¶
- Final Project Checkpoint 1 is due today.
- Lab 8 is due on Friday.
Agenda 📆¶
- Hyperparameters.
- Cross-validation.
- Decision trees.
Review: Pipelines¶
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, Binarizer
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
tips = px.data.tips()
def is_weekend(s):
    # The input to is_weekend is a Series!
    return s.replace({'Thur': 'Weekday', 'Fri': 'Weekday'})

pl_day = make_pipeline(
    FunctionTransformer(is_weekend),
    OneHotEncoder(),
)

col_trans = make_column_transformer(
    (pl_day, ['day']),
    (OneHotEncoder(drop='first'), ['sex', 'smoker', 'time']),
    (Binarizer(threshold=2), ['size']),
    remainder='passthrough',
    force_int_remainder_cols=False,
)

pl = make_pipeline(
    col_trans,
    LinearRegression(),
)
pl.fit(tips.drop('tip', axis=1), tips['tip'])
Pipeline(steps=[('columntransformer', ColumnTransformer(force_int_remainder_cols=False, remainder='passthrough', transformers=[('pipeline', Pipeline(steps=[('functiontransformer', FunctionTransformer(func=<function is_weekend at 0x16a399c60>)), ('onehotencoder', OneHotEncoder())]), ['day']), ('onehotencoder', OneHotEncoder(drop='first'), ['sex', 'smoker', 'time']), ('binarizer', Binarizer(threshold=2), ['size'])])), ('linearregression', LinearRegression())])
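Once the pipeline is fit, we can make predictions directly from raw, un-transformed rows; the pipeline applies every feature transformation before the regression step. A quick sketch (the specific predicted values depend on the fitted coefficients):
# Predict the tip for the first five rows of the raw data.
pl.predict(tips.drop('tip', axis=1).head())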
Question 🤔 (Answer at dsc80.com/q)
Code: weights
How many weights does this linear model have?
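After working out the answer by hand, one way to sanity-check your count in code (a sketch; here we count the intercept as a weight, too):
# Number of fitted coefficients, plus one for the intercept.
len(pl.named_steps['linearregression'].coef_) + 1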
Question 🤔 (Answer at dsc80.com/q)
Code: wi23q9
(Wi23 Final Q9)
One piece of information that may be useful as a feature is the proportion of SAT test takers in a state in a given year that qualify for free lunches in school. The Series lunch_props contains 8 values, each of which is either "low", "medium", or "high". Since we can't use strings as features in a model, we decide to encode these strings using the following Pipeline:
# Note: The FunctionTransformer is only needed to change the result
# of the OneHotEncoder from a "sparse" matrix to a regular matrix
# so that it can be used with StandardScaler;
# it doesn't change anything mathematically.
pl = Pipeline([
    ("ohe", OneHotEncoder(drop="first")),
    ("ft", FunctionTransformer(lambda X: X.toarray())),
    ("ss", StandardScaler())
])
After calling pl.fit(lunch_props), pl.transform(lunch_props) evaluates to the following array:
array([[ 1.29099445, -0.37796447],
[-0.77459667, -0.37796447],
[-0.77459667, -0.37796447],
[-0.77459667, 2.64575131],
[ 1.29099445, -0.37796447],
[ 1.29099445, -0.37796447],
[-0.77459667, -0.37796447],
[-0.77459667, -0.37796447]])
and pl.named_steps["ohe"].get_feature_names() evaluates to the following array:
array(["x0_low", "x0_med"], dtype=object)
Fill in the blanks: Given the above information, we can conclude that lunch_props has ____________ value(s) equal to "low", ____________ value(s) equal to "medium", and _____________ value(s) equal to "high". (Note: You should write one positive integer in each box such that the numbers add up to 8.)
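If you want to check a guess empirically, here is a hedged sketch: lunch_props itself isn't defined in this notebook, so we build a candidate Series with hypothetical counts (replace them with your guess) and push it through the same Pipeline. OneHotEncoder expects a 2D input, so we pass a one-column DataFrame; the distinct standardized values and how often each appears depend only on the category counts, so you can compare them to the array above.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Hypothetical counts -- replace with your guess (they must sum to 8).
guess = pd.Series(['low'] * 2 + ['medium'] * 3 + ['high'] * 3)

pl_check = Pipeline([
    ("ohe", OneHotEncoder(drop="first")),
    ("ft", FunctionTransformer(lambda X: X.toarray())),
    ("ss", StandardScaler())
])

# OneHotEncoder needs a 2D input, so pass a one-column DataFrame.
pl_check.fit_transform(guess.to_frame())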
Question 🤔 (Answer at dsc80.com/q)
Code: fa23q94
(Fa23 Final 9.4)
Determine how each change below affects model bias and variance compared to this model:
For each change, choose all of the following that apply: increase bias, decrease bias, increase variance, decrease variance.
- Add degree 3 polynomial features.
- Add a feature of numbers chosen at random between 0 and 1.
- Collect 100 more points for the training set.
- Don’t use the 'veg' feature.
Review: Hyperparameters¶
np.random.seed(23) # For reproducibility.
def sample_from_pop(n=100):
    x = np.linspace(-2, 3, n)
    y = x ** 3 + np.random.normal(0, 3, size=n)
    return pd.DataFrame({'x': x, 'y': y})
sample_1 = sample_from_pop()
sample_2 = sample_from_pop()
px.scatter(sample_1, x='x', y='y', title='Sample 1')
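As a preview of why hyperparameters matter, here is a sketch (the helper fit_poly is ours, not part of sklearn) that fits polynomial regression models to sample_1, where the polynomial degree is a hyperparameter we choose before fitting:
from sklearn.preprocessing import PolynomialFeatures

def fit_poly(sample, degree):
    # The degree is a hyperparameter: we pick it before calling fit.
    pl_poly = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    pl_poly.fit(sample[['x']], sample['y'])
    return pl_poly

# Fit models of increasing complexity on the same sample.
models = {d: fit_poly(sample_1, d) for d in [1, 3, 25]}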