In [1]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
import plotly.graph_objects as go
pd.options.plotting.backend = 'plotly'
TEMPLATE = 'seaborn'

# Carryover setup from last lecture
import seaborn as sns
tips = sns.load_dataset('tips')

from sklearn.linear_model import LinearRegression

import util  # Course-provided helpers, including train_and_plot (see util.py).

import warnings
warnings.simplefilter('ignore')

Lecture 23 – Cross-Validation

DSC 80, Spring 2023

Agenda

  • Generalization.
  • Train-test split.
  • Hyperparameters.
  • Cross-validation.

Generalization

Recall that last time, we drew two samples from the same data generating process and fit polynomials of degree 1, 3, and 25 to each sample.

In [2]:
np.random.seed(23) # For reproducibility.

def sample_dgp(n=100):
    # The true relationship is cubic; Gaussian noise (SD 3) is added to each point.
    x = np.linspace(-2, 3, n)
    y = x ** 3 + np.random.normal(0, 3, size=n)
    return pd.DataFrame({'x': x, 'y': y})

sample_1 = sample_dgp()
sample_2 = sample_dgp()
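
The two draws share the same cubic trend but differ in their noise. One quick way to see this is to scatter both samples together; the cell below is an illustrative sketch, not part of the lecture's own plotting code (which lives in util.py).

# Illustrative only: scatter both samples to see that they share the
# cubic trend but have different noise.
both = pd.concat([sample_1.assign(sample='Sample 1'),
                  sample_2.assign(sample='Sample 2')])
px.scatter(both, x='x', y='y', color='sample', template=TEMPLATE,
           title='Two Samples from the Same DGP')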

When trained on sample 1, the degree 25 polynomial had the lowest RMSE on sample 1 itself: higher-degree polynomials have more freedom to fit the training data closely.

In [3]:
# Look at the definition of train_and_plot in util.py if you're curious as to how the plotting works.
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_1, degs=[1, 3, 25])
fig.update_layout(title='Trained on Sample 1, Performance on Sample 1')
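
util.train_and_plot fits a polynomial of each degree in degs to the training sample and plots its predictions against the test sample. As a rough sketch of the RMSE computation behind the claim above, one could fit each polynomial with a scikit-learn Pipeline; the helper below is hypothetical, and the course's actual implementation is in util.py.

# Hypothetical sketch of the RMSE computation behind train_and_plot;
# the real implementation is in util.py.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

def poly_rmse(train_sample, test_sample, deg):
    # Fit a degree-`deg` polynomial to the training sample via least squares.
    model = make_pipeline(PolynomialFeatures(deg), LinearRegression())
    model.fit(train_sample[['x']], train_sample['y'])
    # Evaluate root mean squared error on the test sample.
    preds = model.predict(test_sample[['x']])
    return np.sqrt(mean_squared_error(test_sample['y'], preds))

# Training-sample RMSE for each degree.
{deg: poly_rmse(sample_1, sample_1, deg) for deg in [1, 3, 25]}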