import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly.graph_objects as go
pd.options.plotting.backend = 'plotly'
TEMPLATE = 'seaborn'
import seaborn as sns
from sklearn.linear_model import LinearRegression
# The dataset is built into plotly (and seaborn)!
# We shuffle here so that the head of the DataFrame contains rows where smoker is Yes and smoker is No,
# purely for illustration purposes (it doesn't change any of the math).
np.random.seed(1)
tips = px.data.tips().sample(frac=1).reset_index(drop=True)
tips.head()
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 3.07 | 1.00 | Female | Yes | Sat | Dinner | 1 |
1 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
2 | 26.59 | 3.41 | Male | Yes | Sat | Dinner | 3 |
3 | 14.26 | 2.50 | Male | No | Thur | Lunch | 2 |
4 | 21.16 | 3.00 | Male | No | Thur | Lunch | 2 |
Let's suppose we choose squared loss, meaning that $h^* = \text{mean}(y)$.
mean_tip = tips['tip'].mean()
mean_tip
2.99827868852459
# Unfortunately, the code to visualize a scatter plot and a line
# in plotly is not all that concise.
fig = go.Figure()
fig.add_trace(go.Scatter(
x=tips['total_bill'],
y=tips['tip'],
mode='markers',
name='Original Data')
)
fig.add_trace(go.Scatter(
x=[0, 60],
y=[mean_tip, mean_tip],
mode='lines',
name='Constant Prediction (Mean)'
))
fig.update_layout(showlegend=True, title='Tip vs. Total Bill',
xaxis_title='Total Bill', yaxis_title='Tip',
template=TEMPLATE)
fig.update_xaxes(range=[0, 60])