import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly.graph_objects as go
# Route pandas' built-in `.plot` calls through plotly instead of the default backend.
pd.options.plotting.backend = 'plotly'
# Template name; not referenced in this part of the file -- presumably passed
# to plotly figures elsewhere (TODO confirm).
TEMPLATE = 'seaborn'
# Carryover setup from last lecture
import seaborn as sns
# Load seaborn's example 'tips' dataset.
tips = sns.load_dataset('tips')
from sklearn.linear_model import LinearRegression
# Project-local helper module (util.py); train_and_plot from it is used below.
import util
import warnings
# Suppress all warnings from this point on.
warnings.simplefilter('ignore')
Recall that last time, we drew two samples from the same data-generating process and fit polynomials of degrees 1, 3, and 25 to each sample.
# Seed NumPy's global random state so the random draws below are identical on every run.
np.random.seed(23) # For reproducibility.
def sample_dgp(n=100):
    """Draw one sample of size ``n`` from the data-generating process.

    The ``x`` values are ``n`` evenly spaced points on [-2, 3]. Each ``y`` is
    ``x ** 3`` plus independent Normal noise with mean 0 and SD 3.

    The noise is drawn from NumPy's global random state, so successive calls
    return different samples, and results are reproducible only if
    ``np.random.seed`` has been called beforehand.

    Parameters
    ----------
    n : int, default 100
        Number of points in the sample.

    Returns
    -------
    pd.DataFrame
        A DataFrame with ``n`` rows and columns ``'x'`` and ``'y'``.
    """
    x = np.linspace(-2, 3, n)
    # A single draw of size n keeps the random stream consumption unchanged.
    noise = np.random.normal(0, 3, size=n)
    y = x ** 3 + noise
    return pd.DataFrame({'x': x, 'y': y})
# Two separate draws from the same process. Each call consumes NumPy's global
# random state, so the order of these two lines determines which noise each
# sample receives.
sample_1 = sample_dgp()
sample_2 = sample_dgp()
When trained on sample 1, the degree-25 polynomial had the lowest RMSE on sample 1 itself, i.e. on the same data it was trained on.
# Look at the definition of train_and_plot in util.py if you're curious as to how the plotting works.
# Train and evaluate on the same sample (sample_1) for polynomial degrees 1, 3, and 25.
# NOTE(review): presumably train_and_plot fits one model per degree and returns
# a plotly figure -- confirm in util.py.
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_1, degs=[1, 3, 25])
# Left as a bare trailing expression; in a notebook cell its value is what gets displayed.
fig.update_layout(title='Trained on Sample 1, Performance on Sample 1')