from dsc80_utils import *

# The dataset is built into plotly (and seaborn)!
# We shuffle here so that the head of the DataFrame contains rows where smoker is Yes and smoker is No,
# purely for illustration purposes (it doesn't change any of the math).
np.random.seed(1)
tips = px.data.tips().sample(frac=1).reset_index(drop=True)

tips

mean_tip = tips['tip'].mean()

mean_tip

np.float64(2.99827868852459)

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(tips[['total_bill']], y=tips['tip'])

model_two = LinearRegression()
model_two.fit(tips[['total_bill', 'size']], y=tips['tip'])

LinearRegression()

LinearRegression()

def rmse(actual, pred):
    """Return the root mean squared error between `actual` and `pred`.

    The difference `actual - pred` is squared, averaged with `np.mean`,
    and then square-rooted with `np.sqrt`. `pred` may be a single constant
    (e.g. one number predicted for every row) as well as an array of
    per-row predictions, since the subtraction is left to the operands.
    """
    squared_errors = (actual - pred) ** 2
    return np.sqrt(np.mean(squared_errors))

# RMSE of each model on the same rows it was fit on, keyed by a
# human-readable label. Later models add their own entries to this dict.
rmse_dict = {
    'constant tip amount': rmse(tips['tip'], mean_tip),
    'one feature: total bill': rmse(
        tips['tip'], model.predict(tips[['total_bill']])
    ),
    'two features': rmse(
        tips['tip'], model_two.predict(tips[['total_bill', 'size']])
    ),
}

# Display the results as a one-column table, one row per model.
pd.DataFrame({'rmse': rmse_dict.values()}, index=rmse_dict.keys())

# .score returns the R^2 of the one-feature model, evaluated here on the
# same rows the model was fit on.
model.score(tips[['total_bill']], tips['tip'])

0.45661658635167623

pred = tips.assign(predicted=model_two.predict(tips[['total_bill', 'size']]))
pred

np.var(pred['predicted']) / np.var(pred['tip'])

np.float64(0.46786930879612587)

pred[['predicted', 'tip']].corr().loc['predicted', 'tip'] ** 2

np.float64(0.4678693087961254)

model_two.score(tips[['total_bill', 'size']], tips['tip'])

0.46786930879612565

tips.head()

tips.head()

tips['smoker'].value_counts()

smoker
No     151
Yes     93
Name: count, dtype: int64

(tips['smoker'] == 'Yes').astype(int).head()

0    1
1    0
2    1
3    0
4    0
Name: smoker, dtype: int64

# One hot encode 'smoker' by hand: for every distinct value in the column,
# add a 0/1 indicator column named 'smoker == <value>'.
for val in tips['smoker'].unique():
    matches_val = tips['smoker'] == val
    tips[f'smoker == {val}'] = matches_val.astype(int)

tips.head()

model_three = LinearRegression()
model_three.fit(tips[['total_bill', 'size', 'smoker == Yes']], tips['tip'])

LinearRegression()

LinearRegression()

model_three.intercept_, model_three.coef_

(np.float64(0.7090155167346057), array([ 0.09,  0.18, -0.08]))

# pio.renderers.default = 'plotly_mimetype+notebook' # If it doesn't render, try uncommenting this.

# Grid of (total bill, table size) pairs over which the fitted planes are drawn.
XX, YY = np.mgrid[0:50:2, 0:8:1]

# Both planes share the intercept, total-bill, and table-size terms; they
# differ only in whether the 'smoker == Yes' coefficient is multiplied by 0 or 1.
shared_terms = model_three.intercept_ + model_three.coef_[0] * XX + model_three.coef_[1] * YY
Z_0 = shared_terms + model_three.coef_[2] * 0
Z_1 = shared_terms + model_three.coef_[2] * 1

plane_0 = go.Surface(x=XX, y=YY, z=Z_0, colorscale='Greens')
plane_1 = go.Surface(x=XX, y=YY, z=Z_1, colorscale='Purples')

fig = go.Figure(data=[plane_0, plane_1])

tips_0 = tips[tips['smoker'] == 'No']
tips_1 = tips[tips['smoker'] == 'Yes']

fig.add_trace(go.Scatter3d(x=tips_0['total_bill'], 
                           y=tips_0['size'], 
                           z=tips_0['tip'], mode='markers', marker = {'color': 'green'}))

fig.add_trace(go.Scatter3d(x=tips_1['total_bill'], 
                           y=tips_1['size'], 
                           z=tips_1['tip'], mode='markers', marker = {'color': 'purple'}))

fig.update_layout(scene = dict(
    xaxis_title='Total Bill',
    yaxis_title='Table Size',
    zaxis_title='Tip'),
  title='Tip vs. Total Bill and Table Size (Green = Non-Smoking Section, Purple = Smoking Section)',
    width=1000, height=800,
    showlegend=False)

fig = go.Figure()
fig.add_trace(go.Scatter(x=tips['total_bill'], y=tips['tip'], 
                         mode='markers', name='Original Data'))
fig.add_trace(go.Scatter(x=tips['total_bill'], y=model_three.predict(tips[['total_bill', 'size', 'smoker == Yes']]), 
                         mode='markers', name='Predicted Tips using Total Bill, <br>Table Size, and Smoker Status'))

fig.update_layout(showlegend=True, title='Tip vs. Total Bill',
                  xaxis_title='Total Bill', yaxis_title='Tip')

rmse_dict['three features'] = rmse(tips['tip'], 
                                   model_three.predict(tips[['total_bill', 'size', 'smoker == Yes']]))
rmse_dict

{'constant tip amount': np.float64(1.3807999538298952),
 'one feature: total bill': np.float64(1.0178504025697377),
 'two features': np.float64(1.007256127114662),
 'three features': np.float64(1.0064899786822128)}

tips.head()

# Ordinal encoding for star ratings: a string of k stars maps to the integer k,
# for k = 1 through 5.
ordinal_enc = {'✩' * stars: stars for stars in range(1, 6)}
ordinal_enc

{'✩': 1, '✩✩': 2, '✩✩✩': 3, '✩✩✩✩': 4, '✩✩✩✩✩': 5}

# Small example table with a single ordinal column of star ratings.
ratings = pd.DataFrame({
    'rating': ['✩', '✩✩', '✩✩✩', '✩✩', '✩✩✩', '✩', '✩✩✩', '✩✩✩✩', '✩✩✩✩✩'],
})
ratings

ratings['rating'].map(ordinal_enc)

0    1
1    2
2    3
    ..
6    3
7    4
8    5
Name: rating, Length: 9, dtype: int64

mpg = sns.load_dataset('mpg').dropna()
mpg.head()

mpg['model_year'].value_counts()

model_year
73    40
78    36
76    34
      ..
71    27
80    27
74    26
Name: count, Length: 13, dtype: int64

px.scatter(mpg, x='horsepower', y='mpg')

car_model = LinearRegression()
car_model.fit(mpg[['horsepower']], mpg['mpg'])

LinearRegression()

LinearRegression()

hp_points = pd.DataFrame({'horsepower': [25, 225]})
fig = px.scatter(mpg, x='horsepower', y='mpg')
fig.add_trace(go.Scatter(
    x=hp_points['horsepower'],
    y=car_model.predict(hp_points),
    mode='lines',
    name='Predicted MPG using Horsepower'
))

# Residual plot for the horsepower-only model: predicted mpg on the x-axis,
# (actual - predicted) on the y-axis, with a reference line at 0.
# The predictions are computed once and reused for both new columns.
mpg_predictions = car_model.predict(mpg[['horsepower']])
res = mpg.assign(
    Predictions=mpg_predictions,
    Residuals=mpg['mpg'] - mpg_predictions,
)
fig = px.scatter(res, x='Predictions', y='Residuals')
fig.add_hline(0, line_width=3, opacity=1)

car_model.score(mpg[['horsepower']], mpg['mpg'])

0.6059482578894351

mpg['log hp'] = np.log(mpg['horsepower'])

px.scatter(mpg, x='log hp', y='mpg')

car_model_log = LinearRegression()
car_model_log.fit(mpg[['log hp']], mpg['mpg'])

LinearRegression()

LinearRegression()

# Scatter plot of mpg against log(horsepower), with the fitted line drawn
# between two endpoints on the log(horsepower) axis.
log_hp_points = pd.DataFrame({'log hp': [3.7, 5.5]})
fig = px.scatter(mpg, x='log hp', y='mpg')
fig.add_trace(go.Scatter(
    x=log_hp_points['log hp'],
    y=car_model_log.predict(log_hp_points),
    mode='lines',
    name='Predicted MPG using log(Horsepower)'
))

car_model_log.score(mpg[['log hp']], mpg['mpg'])

0.6683347641192137

fig = px.scatter(mpg, x='horsepower', y='mpg')
fig.add_trace(
    go.Scatter(
        x=mpg['horsepower'], 
        y=car_model_log.intercept_ + car_model_log.coef_[0] * np.log(mpg['horsepower']),  
        mode='markers', name='Predicted MPG using log(Horsepower)'
    )
)
fig

car_model_log.intercept_, car_model_log.coef_

(np.float64(108.69970699574483), array([-18.58]))

tips.head()

from sklearn.preprocessing import Binarizer

tips['total_bill'].head()

0     3.07
1    18.78
2    26.59
3    14.26
4    21.16
Name: total_bill, dtype: float64

# The threshold is supplied when the Binarizer is constructed.
bi = Binarizer(threshold=20)

# transform is called here without a preceding fit. It still returns the 0/1
# array shown below, but because the input is a DataFrame, sklearn emits a
# UserWarning about feature names.
# NOTE(review): calling bi.fit(tips[['total_bill']]) first (or using
# fit_transform) should silence that warning — confirm.
transformed_bills = bi.transform(tips[['total_bill']]) # Must give transform a 2D array/DataFrame.
transformed_bills[:5]

/Users/watson-parris/miniconda3/envs/dsc80/lib/python3.12/site-packages/sklearn/base.py:486: UserWarning:

X has feature names, but Binarizer was fitted without feature names

array([[0.],
       [0.],
       [1.],
       [0.],
       [1.]])

tips_quant = tips[['total_bill', 'size']]
tips_quant.head()

from sklearn.preprocessing import StandardScaler

stdscaler = StandardScaler()

# Calling transform before fit raises NotFittedError (see the traceback below):
# the scaler has not yet been given data from which to compute a mean and SD.
stdscaler.transform(tips_quant)

---------------------------------------------------------------------------
NotFittedError                            Traceback (most recent call last)
Cell In[57], line 1
----> 1 stdscaler.transform(tips_quant)

File ~/miniconda3/envs/dsc80/lib/python3.12/site-packages/sklearn/utils/_set_output.py:316, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
    314 @wraps(f)
    315 def wrapped(self, X, *args, **kwargs):
--> 316     data_to_wrap = f(self, X, *args, **kwargs)
    317     if isinstance(data_to_wrap, tuple):
    318         # only wrap the first output for cross decomposition
    319         return_tuple = (
    320             _wrap_data_with_container(method, data_to_wrap[0], X, self),
    321             *data_to_wrap[1:],
    322         )

File ~/miniconda3/envs/dsc80/lib/python3.12/site-packages/sklearn/preprocessing/_data.py:1042, in StandardScaler.transform(self, X, copy)
   1027 def transform(self, X, copy=None):
   1028     """Perform standardization by centering and scaling.
   1029 
   1030     Parameters
   (...)
   1040         Transformed array.
   1041     """
-> 1042     check_is_fitted(self)
   1044     copy = copy if copy is not None else self.copy
   1045     X = self._validate_data(
   1046         X,
   1047         reset=False,
   (...)
   1052         force_all_finite="allow-nan",
   1053     )

File ~/miniconda3/envs/dsc80/lib/python3.12/site-packages/sklearn/utils/validation.py:1661, in check_is_fitted(estimator, attributes, msg, all_or_any)
   1658     raise TypeError("%s is not an estimator instance." % (estimator))
   1660 if not _is_fitted(estimator, attributes, all_or_any):
-> 1661     raise NotFittedError(msg % {"name": type(estimator).__name__})

NotFittedError: This StandardScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

# This is like saying "determine the mean and SD of each column in tips_quant".
stdscaler.fit(tips_quant)

StandardScaler()

StandardScaler()

# First column is 'total_bill', second column is 'size'.
tips_quant_z = stdscaler.transform(tips_quant)
tips_quant_z[:5]

array([[-1.88, -1.65],
       [-0.11, -0.6 ],
       [ 0.77,  0.45],
       [-0.62, -0.6 ],
       [ 0.15, -0.6 ]])

stdscaler.mean_

array([19.79,  2.57])

stdscaler.var_

array([78.93,  0.9 ])

stdscaler.transform(tips_quant.sample(5))

array([[ 0.97, -0.6 ],
       [-0.28, -0.6 ],
       [ 1.19,  1.51],
       [-0.49, -0.6 ],
       [ 0.65,  1.51]])

stdscaler.fit_transform(tips_quant)

array([[-1.88, -1.65],
       [-0.11, -0.6 ],
       [ 0.77,  0.45],
       ...,
       [-0.26, -0.6 ],
       [-1.09, -0.6 ],
       [-0.32,  0.45]])

tips_cat = tips[['sex', 'smoker', 'day', 'time']]
tips_cat.head()

from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
ohe.fit(tips_cat)

OneHotEncoder()

OneHotEncoder()

ohe.transform(tips_cat)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 976 stored elements and shape (244, 10)>

ohe.transform(tips_cat).toarray()

array([[1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 1., ..., 1., 0., 1.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 0., 1., 0.]])

ohe.get_feature_names_out() # Each name is '<column>_<category>', using the column names in tips_cat.

array(['sex_Female', 'sex_Male', 'smoker_No', 'smoker_Yes', 'day_Fri',
       'day_Sat', 'day_Sun', 'day_Thur', 'time_Dinner', 'time_Lunch'],
      dtype=object)

pd.DataFrame(ohe.transform(tips_cat).toarray(), 
             columns=ohe.get_feature_names_out()) # If we need a DataFrame back, for some reason.

	mpg	cylinders	displacement	horsepower	...	acceleration	model_year	origin	name
0	18.0	8	307.0	130.0	...	12.0	70	usa	chevrolet chevelle malibu
1	15.0	8	350.0	165.0	...	11.5	70	usa	buick skylark 320
2	18.0	8	318.0	150.0	...	11.0	70	usa	plymouth satellite
3	16.0	8	304.0	150.0	...	12.0	70	usa	amc rebel sst
4	17.0	8	302.0	140.0	...	10.5	70	usa	ford torino

Property	Example	Description
Initialize with parameters	`binar = Binarizer(thresh)`	set x=1 if x > thresh, else 0
Transform data in a dataset	`feat = binar.transform(data)`	Binarize all columns in `data`

Property	Example	Description
Initialize with parameters	`stdscaler = StandardScaler()`	z-score the data (no parameters)
Fit the transformer	`stdscaler.fit(X)`	Compute the mean and SD of `X`
Transform data in a dataset	`feat = stdscaler.transform(X_new)`	z-score `X_new` with mean and SD of `X`
Fit and transform	`stdscaler.fit_transform(X)`	Compute the mean and SD of `X`, then z-score `X`

	total_bill	tip	sex	smoker	day	time	size
0	3.07	1.00	Female	Yes	Sat	Dinner	1
1	18.78	3.00	Female	No	Thur	Dinner	2
2	26.59	3.41	Male	Yes	Sat	Dinner	3
...	...	...	...	...	...	...	...
241	17.47	3.50	Female	No	Thur	Lunch	2
242	10.07	1.25	Male	No	Sat	Dinner	2
243	16.93	3.07	Female	No	Sat	Dinner	3

	rmse
constant tip amount	1.38
one feature: total bill	1.02
two features	1.01

UID	AGE	STATE	HAS_BOUGHT	REVIEW	\|	RATING
74	32	NY	True	"Meh."	\|	✩✩
42	50	WA	True	"Worked out of the box..."	\|	✩✩✩✩
57	16	CA	NULL	"Sick af..."	\|	✩
...	...	...	...	...	\|	...
(int)	(int)	(str)	(bool)	(str)	\|	(str)

	rating
0	✩
1	✩✩
2	✩✩✩
...	...
6	✩✩✩
7	✩✩✩✩
8	✩✩✩✩✩

	sex_Female	sex_Male	smoker_No	smoker_Yes	...	day_Sun	day_Thur	time_Dinner	time_Lunch
0	1.0	0.0	0.0	1.0	...	0.0	0.0	1.0	0.0
1	1.0	0.0	1.0	0.0	...	0.0	1.0	1.0	0.0
2	0.0	1.0	0.0	1.0	...	0.0	0.0	1.0	0.0
...	...	...	...	...	...	...	...	...	...
241	1.0	0.0	1.0	0.0	...	0.0	1.0	0.0	1.0
242	0.0	1.0	1.0	0.0	...	0.0	0.0	1.0	0.0
243	1.0	0.0	1.0	0.0	...	0.0	0.0	1.0	0.0

	sex_Female	sex_Male	smoker_No	smoker_Yes	...	day_Sun	day_Thur	time_Dinner	time_Lunch
0	1.0	0.0	0.0	1.0	...	0.0	0.0	1.0	0.0
1	1.0	0.0	1.0	0.0	...	0.0	1.0	1.0	0.0
2	0.0	1.0	0.0	1.0	...	0.0	0.0	1.0	0.0
...	...	...	...	...	...	...	...	...	...
241	1.0	0.0	1.0	0.0	...	0.0	1.0	0.0	1.0
242	0.0	1.0	1.0	0.0	...	0.0	0.0	1.0	0.0
243	1.0	0.0	1.0	0.0	...	0.0	0.0	1.0	0.0

Lecture 14 – Feature Engineering¶

DSC 80, Spring 2025¶

Agenda 📆¶

Review: Predicting tips 🧑‍🍳¶

Linear models¶

Root mean squared error¶

The .score method of a LinearRegression object¶

Aside: $R^2$¶

Calculating $R^2$¶

What's next?¶

Feature engineering ⚙️¶

The goal of feature engineering¶

One hot encoding¶

Example: One hot encoding 'smoker'¶

Model #4: Multiple linear regression using total bill, table size, and smoker status¶

Visualizing Model #4¶

Comparing Model #4 to earlier models¶

Reflection¶

Question 🤔 (Answer at dsc80.com/q)

Example: Predicting ratings ⭐️¶

Example: Predicting ratings ⭐️¶

Uninformative features¶

Dropping features¶

Encoding ordinal features¶

Encoding nominal features¶

Example: Horsepower 🚗¶

The relationship between 'horsepower' and 'mpg'¶

Predicting 'mpg' using 'horsepower'¶

Linearization¶

Predicting 'mpg' using log('horsepower')¶

Quantitative scaling¶

Question 🤔 (Answer at dsc80.com/q)

The modeling process¶

The modeling process¶

preprocessing and linear_models¶

Transformers in sklearn¶

Transformer classes¶

Example: Predicting tips 🧑‍🍳¶

Example transformer: Binarizer¶

Example transformer: StandardScaler¶

Example transformer: StandardScaler¶

💡 Pro-Tip: Using .fit_transform¶

StandardScaler summary¶

Example transformer: OneHotEncoder¶

Question 🤔 (Answer at dsc80.com/q)

Summary, next time¶

Summary¶

Next time¶

The `.score` method of a `LinearRegression` object¶

Example: One hot encoding `'smoker'`¶

The relationship between `'horsepower'` and `'mpg'`¶

Predicting `'mpg'` using `'horsepower'`¶

Predicting `'mpg'` using `log('horsepower')`¶

`preprocessing` and `linear_model`s¶

Transformers in `sklearn`¶

Example transformer: `Binarizer`¶

Example transformer: `StandardScaler`¶

Example transformer: `StandardScaler`¶

💡 Pro-Tip: Using `.fit_transform`¶

`StandardScaler` summary¶

Example transformer: `OneHotEncoder`¶

	sex_Female	sex_Male	smoker_No	smoker_Yes	...	day_Sun	day_Thur	time_Dinner	time_Lunch
0	1.0	0.0	0.0	1.0	...	0.0	0.0	1.0	0.0
1	1.0	0.0	1.0	0.0	...	0.0	1.0	1.0	0.0
2	0.0	1.0	0.0	1.0	...	0.0	0.0	1.0	0.0
...	...	...	...	...	...	...	...	...	...
241	1.0	0.0	1.0	0.0	...	0.0	1.0	0.0	1.0
242	0.0	1.0	1.0	0.0	...	0.0	0.0	1.0	0.0
243	1.0	0.0	1.0	0.0	...	0.0	0.0	1.0	0.0