from dsc80_utils import *

# Read the diabetes dataset from the local data/ folder and preview it,
# showing up to 9 columns.
diabetes_path = Path('data') / 'diabetes.csv'
diabetes = pd.read_csv(diabetes_path)
display_df(diabetes, cols=9)

from sklearn.model_selection import train_test_split
# Hold out a test set, using only Glucose and BMI as features and Outcome as
# the target. The split is seeded so it is the same on every run.
two_features = diabetes[['Glucose', 'BMI']]
outcome = diabetes['Outcome']
X_train, X_test, y_train, y_test = train_test_split(
    two_features, outcome, random_state=1
)

# Scatter the training points, coloured by outcome. Outcome is cast to str so
# that its values line up with the string keys ('0', '1') of color_discrete_map.
labeled_train = X_train.assign(Outcome=y_train.astype(str))
fig = labeled_train.plot(
    kind='scatter',
    x='Glucose',
    y='BMI',
    color='Outcome',
    color_discrete_map={'0': 'orange', '1': 'blue'},
    title='Relationship between Glucose, BMI, and Diabetes',
)
fig

from sklearn.tree import DecisionTreeClassifier

# A shallow tree (depth capped at 2) that chooses its splits using entropy.
dt = DecisionTreeClassifier(max_depth=2, criterion='entropy')

# Fit on the training split only; the notebook then displays the fitted estimator.
dt.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=2)

DecisionTreeClassifier(criterion='entropy', max_depth=2)

from sklearn.tree import plot_tree

# Draw the fitted tree on a wide canvas. The trailing semicolon keeps the
# notebook from echoing plot_tree's return value under the figure.
class_labels = ['no db', 'yes db']
plt.figure(figsize=(15, 5))
plot_tree(
    dt,
    feature_names=X_train.columns,
    class_names=class_labels,
    filled=True,
    fontsize=15,
    impurity=False,
);

from sklearn.model_selection import GridSearchCV

# Candidate values for each decision-tree hyperparameter; the grid search
# tries every combination of one value per key.
hyperparameters = dict(
    max_depth=[2, 3, 4, 5, 7, 10, 13, 15, 18, None],
    min_samples_split=[2, 5, 10, 20, 50, 100, 200],
    criterion=['gini', 'entropy'],
)

# Number of hyperparameter combinations in the grid: the product of the
# number of candidate values for each hyperparameter.
(
    len(hyperparameters['max_depth'])
    * len(hyperparameters['min_samples_split'])
    * len(hyperparameters['criterion'])
)

140

# Exhaustive search over every combination in `hyperparameters`, scoring each
# one with 5-fold cross-validation on a fresh DecisionTreeClassifier.
searcher = GridSearchCV(DecisionTreeClassifier(), hyperparameters, cv=5)

%%time
searcher.fit(X_train, y_train)

CPU times: user 888 ms, sys: 9.78 ms, total: 898 ms
Wall time: 898 ms

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 4, 5, 7, 10, 13, 15, 18, None],
                         'min_samples_split': [2, 5, 10, 20, 50, 100, 200]})

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 4, 5, 7, 10, 13, 15, 18, None],
                         'min_samples_split': [2, 5, 10, 20, 50, 100, 200]})

DecisionTreeClassifier(max_depth=4, min_samples_split=50)

DecisionTreeClassifier(max_depth=4, min_samples_split=50)

searcher.best_params_

{'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 50}

searcher.cv_results_['mean_test_score'] # Array of length 140.

array([0.73, 0.73, 0.73, ..., 0.75, 0.74, 0.72])

# Rows correspond to folds, columns correspond to hyperparameter combinations.
# The fold count is read from the fitted searcher (n_splits_) rather than
# hard-coded, so the table stays correct if the search's `cv` is changed.
fold_scores = [
    searcher.cv_results_[f'split{fold}_test_score']
    for fold in range(searcher.n_splits_)
]
pd.DataFrame(np.vstack(fold_scores))

searcher.best_params_

{'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 50}

# Build a fresh, not-yet-fitted tree from the hyperparameters the grid search
# selected; the dict keys are passed straight through as keyword arguments.
final_tree = DecisionTreeClassifier(**searcher.best_params_)
final_tree

DecisionTreeClassifier(max_depth=4, min_samples_split=50)

DecisionTreeClassifier(max_depth=4, min_samples_split=50)

final_tree.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=4, min_samples_split=50)

DecisionTreeClassifier(max_depth=4, min_samples_split=50)

# Training accuracy.
final_tree.score(X_train, y_train)

0.7881944444444444

# Testing accuracy.
# A bit lower than the `dt` tree we fit above!
final_tree.score(X_test, y_test)

0.765625

# Scoring through the searcher gives the same accuracy as `final_tree` does —
# presumably because GridSearchCV refits the best configuration on all of
# X_train once the search finishes (its default `refit` behaviour).
searcher.score(X_train, y_train)

0.7881944444444444

searcher.score(X_test, y_test)

0.765625

# Let's use more features for prediction
from sklearn.model_selection import train_test_split
# Re-split with every column except the target as a feature. This rebinds
# X_train / X_test / y_train / y_test, using the same seed as the earlier split.
all_features = diabetes.drop(columns=['Outcome'])
X_train, X_test, y_train, y_test = train_test_split(
    all_features, diabetes['Outcome'], random_state=1
)

from sklearn.ensemble import RandomForestClassifier

# Random forest with scikit-learn's default settings, trained on all features.
# NOTE(review): no random_state is passed, so the scores shown may vary
# slightly from run to run.
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
# Training accuracy.
clf.score(X_train, y_train)

1.0

clf.score(X_test, y_test)

0.796875

# A single depth-4 tree on the same features, for comparison with the forest.
# Note that this rebinds `dt`, replacing the depth-2 tree fit earlier.
dt = DecisionTreeClassifier(max_depth=4, criterion='gini')
dt.fit(X_train, y_train)
# Training accuracy.
dt.score(X_train, y_train)

0.8142361111111112

dt.score(X_test, y_test)

0.7864583333333334

# Load the fake-news training data. The path is assembled with Path, the same
# way diabetes.csv is loaded, instead of a hand-written 'data/...' string.
news = pd.read_csv(Path('data') / 'fake_news_training.csv')
news

news['label'].value_counts(normalize=True)

label
real    0.55
fake    0.45
Name: proportion, dtype: float64

from sklearn.feature_extraction.text import CountVectorizer

# Two tiny "documents" used to illustrate bag-of-words counting.
example_corp = ['hey hey hey my name is billy', 
                'hey billy how is your dog billy']

# fit learns the vocabulary from the corpus: each distinct word is assigned a
# column index (see count_vec.vocabulary_).
count_vec = CountVectorizer()
count_vec.fit(example_corp)

CountVectorizer()

CountVectorizer()

count_vec.vocabulary_

{'hey': 2,
 'my': 5,
 'name': 6,
 'is': 4,
 'billy': 0,
 'how': 3,
 'your': 7,
 'dog': 1}

count_vec.transform(example_corp).toarray()

array([[1, 0, 3, 0, 1, 1, 1, 0],
       [2, 1, 1, 1, 1, 0, 0, 1]])

example_corp

['hey hey hey my name is billy', 'hey billy how is your dog billy']

# Label each count column with its word. get_feature_names_out() returns the
# vocabulary already ordered by column index, so vocabulary_ does not need to
# be sorted by hand to recover the column order.
pd.DataFrame(
    count_vec.transform(example_corp).toarray(),
    columns=count_vec.get_feature_names_out(),
)

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Features are the raw article text; the target is the real/fake label.
X = news['content']
y = news['label']
# Seed the split, as the diabetes splits do, so that the accuracies and class
# proportions computed from it are reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Two-step pipeline: turn each article into word counts, then classify the
# counts with a random forest of shallow trees.
vectorizer_step = ('cv', CountVectorizer())
forest_step = (
    'clf',
    RandomForestClassifier(
        max_depth=3,
        n_estimators=100,  # Uses 100 separate decision trees!
        random_state=42,
    ),
)
pl = Pipeline([vectorizer_step, forest_step])

pl.fit(X_train, y_train)

Pipeline(steps=[('cv', CountVectorizer()),
                ('clf', RandomForestClassifier(max_depth=3, random_state=42))])

Pipeline(steps=[('cv', CountVectorizer()),
                ('clf', RandomForestClassifier(max_depth=3, random_state=42))])

CountVectorizer()

RandomForestClassifier(max_depth=3, random_state=42)

# Training accuracy.
pl.score(X_train, y_train)

0.7555555555555555

# Testing accuracy.
pl.score(X_test, y_test)

0.6746987951807228

y_train.value_counts(normalize=True)

label
real    0.53
fake    0.47
Name: proportion, dtype: float64

# Distribution of predicted ys in the training set:

# A fixed 3-decimal float format stops pandas from using scientific notation.
pd.set_option('display.float_format', '{:.3f}'.format)
train_predictions = pl.predict(X_train)
pd.Series(train_predictions).value_counts(normalize=True)

fake   0.697
real   0.303
Name: proportion, dtype: float64

# The key is clf__max_depth rather than max_depth: max_depth is a
# hyperparameter of the 'clf' step, not of the pipeline pl itself, and the
# <step>__<param> form names the step it belongs to.
depth_candidates = np.arange(2, 200, 20)
hyperparameters = {'clf__max_depth': depth_candidates}

%%time

# Takes a few seconds to run – how many trees are being trained?
from sklearn.model_selection import GridSearchCV
# The whole pipeline is the estimator being searched, so each candidate depth
# is evaluated on the vectorizer and forest together. No `cv` is passed, so
# scikit-learn's default cross-validation is used.
grids = GridSearchCV(
    pl,
    n_jobs=-1, # Use multiple processors to parallelize
    param_grid=hyperparameters,
    return_train_score=True # Also record training scores (mean_train_score) in cv_results_
)
grids.fit(X_train, y_train)

CPU times: user 1.07 s, sys: 271 ms, total: 1.34 s
Wall time: 5.37 s

GridSearchCV(estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('clf',
                                        RandomForestClassifier(max_depth=3,
                                                               random_state=42))]),
             n_jobs=-1,
             param_grid={'clf__max_depth': array([  2,  22,  42,  62,  82, 102, 122, 142, 162, 182])},
             return_train_score=True)

GridSearchCV(estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('clf',
                                        RandomForestClassifier(max_depth=3,
                                                               random_state=42))]),
             n_jobs=-1,
             param_grid={'clf__max_depth': array([  2,  22,  42,  62,  82, 102, 122, 142, 162, 182])},
             return_train_score=True)

Pipeline(steps=[('cv', CountVectorizer()),
                ('clf',
                 RandomForestClassifier(max_depth=np.int64(42),
                                        random_state=42))])

CountVectorizer()

RandomForestClassifier(max_depth=np.int64(42), random_state=42)

grids.best_params_

{'clf__max_depth': np.int64(42)}

# Training accuracy.
grids.score(X_train, y_train)

0.997979797979798

# Testing accuracy.
grids.score(X_test, y_test)

0.8313253012048193

# Plot mean training and validation accuracy (averaged over the CV folds)
# against each candidate max_depth.
cv_results = grids.cv_results_
index = grids.param_grid['clf__max_depth']
train = cv_results['mean_train_score']
valid = cv_results['mean_test_score']

accuracy_by_depth = pd.DataFrame({'train': train, 'valid': valid}, index=index)
accuracy_by_depth.plot().update_layout(
    xaxis_title='max_depth',
    yaxis_title='Accuracy',
)

hyperparameters = {
    'n_estimators': [10, 100, 1000], # number of trees per forest
    'max_depth': [None, 100, 10]     # max depth of each tree
}
# NOTE(review): if this runs right after the fake-news cells, X_train is the
# raw article text, and this bare RandomForestClassifier has no vectorizer step
# in front of it — so the fit presumably fails. The snippet looks illustrative
# (a "how many models get trained?" exercise); confirm intent.
grids = GridSearchCV(
    RandomForestClassifier(), param_grid=hyperparameters,
    cv=3, # 3-fold cross-validation
)
grids.fit(X_train, y_train)

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148	72	35	0	33.6	0.63	50	1
1	1	85	66	29	0	26.6	0.35	31	0
2	8	183	64	0	0	23.3	0.67	32	1
...	...	...	...	...	...	...	...	...	...
765	5	121	72	23	112	26.2	0.24	30	0
766	1	126	60	0	0	30.1	0.35	47	1
767	1	93	70	31	0	30.4	0.32	23	0

	0	1	2	3	...	136	137	138	139
0	0.71	0.71	0.71	0.71	...	0.67	0.70	0.71	0.73
1	0.77	0.77	0.77	0.77	...	0.82	0.83	0.77	0.76
2	0.74	0.74	0.74	0.74	...	0.68	0.72	0.74	0.73
3	0.70	0.70	0.70	0.70	...	0.77	0.79	0.76	0.70
4	0.72	0.72	0.72	0.72	...	0.70	0.71	0.72	0.70

	baseurl	content	label
0	twitter.com	\njavascript is not available.\n\nwe’ve detect...	real
1	whitehouse.gov	remarks by the president at campaign event -- ...	real
2	web.archive.org	the committee on energy and commerce\nbarton: ...	real
...	...	...	...
658	politico.com	full text: jeff flake on trump speech transcri...	fake
659	pol.moveon.org	moveon.org political action: 10 things to know...	real
660	uspostman.com	uspostman.com is for sale\nyes, you can transf...	fake

Outcome of Prediction	Definition	True Class
True positive (TP) ✅	The predictor correctly predicts the positive class.	P
False negative (FN) ❌	The predictor incorrectly predicts the negative class.	P
True negative (TN) ✅	The predictor correctly predicts the negative class.	N
False positive (FP) ❌	The predictor incorrectly predicts the positive class.	N

	Predicted Negative	Predicted Positive
Actually Negative	TN ✅	FP ❌
Actually Positive	FN ❌	TP ✅

	Predicted Negative	Predicted Positive
Actually Negative	TN = 90 ✅	FP = 1 ❌
Actually Positive	FN = 8 ❌	TP = 1 ✅

	Predicted Negative	Predicted Positive
Actually Negative	TN = 0 ✅	FP = 91 ❌
Actually Positive	FN = 0 ❌	TP = 9 ✅

	Predicted Negative	Predicted Positive
Actually Negative	TN = 22 ✅	FP = 2 ❌
Actually Positive	FN = 23 ❌	TP = 18 ✅

Lecture 17 – Random Forests, Classifier Evaluation¶

DSC 80, Spring 2026¶

Agenda¶

Recap: Decision Trees¶

Example: Predicting diabetes¶

Exploring the dataset¶

Building a decision tree¶

Visualizing decision trees¶

Hyperparameters for decision trees¶

Brief aside: Thinking about bias and variance¶

Grid search¶

Grid search¶

Choosing possible hyperparameter values¶

Key takeaways¶

Decision tree pros and cons¶

Random Forests¶

Another idea:¶

Idea 1: Bootstrap the training data¶

Idea 2: Only use a subset of features¶

Question 🤔 (Answer at dsc80.com/q)

Example¶

Example: Modeling using text features¶

Example: Fake news¶

Aside: CountVectorizer¶

Creating an initial Pipeline¶

Choosing tree depth via GridSearchCV¶

Training and validation accuracy vs. depth¶

Question 🤔 (Answer at dsc80.com/q)

Classifier Evaluation¶

Accuracy isn't everything!¶

The Boy Who Cried Wolf 👦😭🐺¶

The wolf classifier¶

The wolf classifier¶

Outcomes in binary classification¶

Example: COVID testing 🦠¶

Accuracy of COVID tests¶

Recall¶

Recall isn't everything, either!¶

Precision¶

Precision and recall¶

Precision and recall¶

Question 🤔 (Answer at dsc80.com/q)

Question 🤔 (Answer at dsc80.com/q)

Summary, next time¶

Summary¶

Aside: `CountVectorizer`¶

Creating an initial `Pipeline`¶

Choosing tree depth via `GridSearchCV`¶