In [1]:
from dsc80_utils import *

import plotly.io as pio
pio.renderers.default = "png"

Lecture 18 – Classifier Evaluation and Model Fairness¶

DSC 80, Winter 2026¶

Final Exam 📝¶

  • The exam is on Saturday, March 14th from 8-11AM.
  • You may bring two 8.5"x11" handwritten note sheets of your own creation. You must write them yourself on paper (not printed out from a tablet).
  • The exam covers every lecture, lab, and project.
  • Similar format to the midterm: mix of fill-in-the-blank, multiple choice, and free response.
  • Questions on the final covering pre-midterm material will be marked as "M". If you do better on these questions than you did on the midterm exam, we'll replace your score via the redemption policy.
  • Practice by reviewing lectures and assignments, and working through old exams at practice.dsc80.com.

Agenda 📆¶

  • Review: random forests.
  • Classifier evaluation.
  • Model fairness.

Aside: MLU Explain is a great resource with visual explanations of many of our recent topics (cross-validation, random forests, linear regression, etc.).

Review: random forests¶

Random forests¶

Random forests are a collection of decision trees that vote on a prediction.

  • Problem: If you use the same training data, you will always get the same tree.
  • Solution: Introduce randomness into training procedure to get different trees.
    1. Bootstrap the training data so each tree is trained on slightly different data.
    2. At each split, use only a subset of the available features so that each tree uses different splitting criteria.
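The two sources of randomness above correspond directly to `RandomForestClassifier` hyperparameters. A minimal sketch on made-up toy data (the arrays below are hypothetical, just to show the knobs):

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Hypothetical toy data: one feature, cleanly separable classes.
X = np.array([[0], [1], [2], [3]])
y = np.array([0, 0, 1, 1])

clf = RandomForestClassifier(
    n_estimators=100,     # number of trees that vote
    bootstrap=True,       # 1. each tree trains on a bootstrap resample
    max_features='sqrt',  # 2. each split considers a random subset of features
    random_state=42,
)
clf.fit(X, y)
```

Both defaults (`bootstrap=True`, `max_features='sqrt'` for classification) are already what we want; they're spelled out here only to connect them to the two bullet points above.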

Example: Fake news¶

We have a dataset (source) containing news articles and labels for whether the article was deemed "fake" or "real".

In [2]:
news = pd.read_csv('data/fake_news_training.csv')
news
Out[2]:
baseurl content label
0 twitter.com \njavascript is not available.\n\nwe’ve detect... real
1 whitehouse.gov remarks by the president at campaign event -- ... real
2 web.archive.org the committee on energy and commerce\nbarton: ... real
... ... ... ...
658 politico.com full text: jeff flake on trump speech transcri... fake
659 pol.moveon.org moveon.org political action: 10 things to know... real
660 uspostman.com uspostman.com is for sale\nyes, you can transf... fake

661 rows × 3 columns

Goal: Use an article's content to predict its label.

In [3]:
news['label'].value_counts(normalize=True)
Out[3]:
label
real    0.55
fake    0.45
Name: proportion, dtype: float64

Question: What is the worst possible accuracy we should expect from a classifier, given the above distribution?
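One way to make this baseline concrete: sklearn's `DummyClassifier` implements the always-predict-the-majority-class strategy. A sketch on toy data that recreates the 55/45 split above:

```python
import numpy as np
from sklearn.dummy import DummyClassifier

# Toy data recreating the 55% real / 45% fake class balance.
y = np.array(['real'] * 55 + ['fake'] * 45)
X = np.zeros((100, 1))  # features are irrelevant to this baseline

baseline = DummyClassifier(strategy='most_frequent').fit(X, y)
baseline.score(X, y)    # accuracy of always predicting 'real'
```

Any classifier we train should beat this score; otherwise, it's worse than ignoring the features entirely.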

Aside: CountVectorizer¶

Entries in the 'content' column are not quantitative! We can use the bag of words encoding to create quantitative features from each article's 'content'.

Instead of performing a bag of words encoding manually as we did before, we can rely on sklearn's CountVectorizer. (There is also a TfidfVectorizer.)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
In [5]:
nursery_rhymes = ['Jack be nimble, Jack be quick, Jack jump over the candlestick.', 
                  'Jack and Jill went up the hill to fetch a pail of water.',
                  'Little Jack Horner sat in the corner eating a Christmas pie.']
In [6]:
count_vec = CountVectorizer()
count_vec.fit(nursery_rhymes)
Out[6]:
CountVectorizer()

count_vec learned a vocabulary from the corpus we fit it on.

In [7]:
count_vec.vocabulary_
Out[7]:
{'jack': 10,
 'be': 1,
 'nimble': 14,
 'quick': 19,
 'jump': 12,
 'over': 16,
 'the': 21,
 'candlestick': 2,
 'and': 0,
 'jill': 11,
 'went': 25,
 'up': 23,
 'hill': 7,
 'to': 22,
 'fetch': 6,
 'pail': 17,
 'of': 15,
 'water': 24,
 'little': 13,
 'horner': 8,
 'sat': 20,
 'in': 9,
 'corner': 4,
 'eating': 5,
 'christmas': 3,
 'pie': 18}
In [8]:
count_vec.transform(nursery_rhymes).toarray()
Out[8]:
array([[0, 2, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 0]])

Note that count_vec.vocabulary_ is a dictionary that maps each word to the associated column in the array above. For example, the first column corresponds to 'and'.

In [9]:
nursery_rhymes
Out[9]:
['Jack be nimble, Jack be quick, Jack jump over the candlestick.',
 'Jack and Jill went up the hill to fetch a pail of water.',
 'Little Jack Horner sat in the corner eating a Christmas pie.']
In [10]:
pd.DataFrame(count_vec.transform(nursery_rhymes).toarray(),
             columns=pd.Series(count_vec.vocabulary_).sort_values().index)
Out[10]:
and be candlestick christmas ... to up water went
0 0 2 1 0 ... 0 0 0 0
1 1 0 0 0 ... 1 1 1 1
2 0 0 0 1 ... 0 0 0 0

3 rows × 26 columns

Creating an initial Pipeline¶

Let's build a Pipeline that takes in news article contents and labels and:

  • Uses CountVectorizer to quantitatively encode article contents.

  • Fits a RandomForestClassifier to the data to predict the label.

But first, a train-test split (like always).

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
In [12]:
X = news['content']
y = news['label']
X_train, X_test, y_train, y_test = train_test_split(X, y)

To start, we'll create a random forest with 100 trees (n_estimators) each of which has a maximum depth of 3 (max_depth).

In [13]:
pl = Pipeline([
    ('bag-of-words', CountVectorizer()), 
    ('forest', RandomForestClassifier(
        max_depth=3,
        n_estimators=100, # Uses 100 separate decision trees!
        random_state=42   # For reproducibility
    )) 
])
In [14]:
pl.fit(X_train, y_train)
Out[14]:
Pipeline(steps=[('bag-of-words', CountVectorizer()),
                ('forest',
                 RandomForestClassifier(max_depth=3, random_state=42))])
In [15]:
# Training accuracy.
pl.score(X_train, y_train)
Out[15]:
0.7737373737373737
In [16]:
# Testing accuracy.
pl.score(X_test, y_test)
Out[16]:
0.7168674698795181

The accuracy of our random forest is about 72% on the test set. How much better does it do compared to a classifier that predicts "real" every time?

In [17]:
y_train.value_counts(normalize=True)
Out[17]:
label
real    0.56
fake    0.44
Name: proportion, dtype: float64
In [18]:
# Distribution of predicted ys in the training set:
pd.Series(pl.predict(X_train)).value_counts(normalize=True)
Out[18]:
fake    0.59
real    0.41
Name: proportion, dtype: float64

Choosing tree depth via GridSearchCV¶

We arbitrarily chose max_depth=3 before, but it seems like that isn't working well. Let's perform a grid search to find the max_depth with the best generalization performance.

In [19]:
# Note that we've used the key forest__max_depth, not max_depth
# because max_depth is a hyperparameter of the step we called "forest".
# It is not a hyperparameter of the pipeline, pl.

hyperparameters = {
    'forest__max_depth': np.arange(2, 200, 20)
}

Note that while pl has already been fit, we can still give it to GridSearchCV, which will repeatedly re-fit it during cross-validation.

In [20]:
%%time

# Takes a few seconds to run – how many trees are being trained?
from sklearn.model_selection import GridSearchCV
grids = GridSearchCV(
    pl,
    n_jobs=-1, # Use multiple processors to parallelize
    param_grid=hyperparameters,
    return_train_score=True
)
grids.fit(X_train, y_train)
CPU times: user 1.12 s, sys: 259 ms, total: 1.38 s
Wall time: 6.84 s
Out[20]:
GridSearchCV(estimator=Pipeline(steps=[('bag-of-words', CountVectorizer()),
                                       ('forest',
                                        RandomForestClassifier(max_depth=3,
                                                               random_state=42))]),
             n_jobs=-1,
             param_grid={'forest__max_depth': array([  2,  22,  42,  62,  82, 102, 122, 142, 162, 182])},
             return_train_score=True)
In [21]:
grids.best_params_
Out[21]:
{'forest__max_depth': 22}

Recall that fitted GridSearchCV objects are estimators in their own right. This means we can compute the training and testing accuracies of the "best" random forest directly:

In [22]:
# Training accuracy.
grids.score(X_train, y_train)
Out[22]:
0.9696969696969697
In [23]:
# Testing accuracy.
grids.score(X_test, y_test)
Out[23]:
0.8433734939759037
In [24]:
# Compare to our original model with max_depth = 3.
pl.score(X_test, y_test)
Out[24]:
0.7168674698795181

Around 13 percentage points better test set accuracy!

Training and validation accuracy vs. depth¶

Below, we plot how training and validation accuracy varied with tree depth. Note that the $y$-axis here is accuracy, and that larger accuracies are better (unlike with RMSE, where smaller was better).

In [25]:
index = grids.param_grid['forest__max_depth']
train = grids.cv_results_['mean_train_score']
valid = grids.cv_results_['mean_test_score']
In [26]:
pd.DataFrame({'train': train, 'valid': valid}, index=index).plot().update_layout(
    xaxis_title='max_depth', yaxis_title='Accuracy'
)

Classifier Evaluation¶

Accuracy isn't everything!¶

$$ \text{accuracy} = \frac{\text{\# data points classified correctly}}{\text{\# data points}} $$

  • Accuracy is defined as the proportion of predictions that are correct.

  • It weighs all correct predictions the same, and weighs all incorrect predictions the same.

  • But some incorrect predictions may be worse than others!

    • Example: diagnosing a disease when a person doesn't have it vs. not diagnosing a disease when a person does have it.

The Boy Who Cried Wolf 👦😭🐺¶

(source)

The tale concerns a shepherd boy who repeatedly tricks villagers into believing a wolf is attacking his flock. When a real wolf appears and the boy cries for help, the villagers dismiss it as another false alarm, allowing the wolf to devour the sheep.

The wolf classifier¶

  • Predictor: Shepherd boy.
  • Positive prediction: "There is a wolf."
  • Negative prediction: "There is no wolf."

Some questions to think about:

  • What is an example of an incorrect, positive prediction?
  • Was there a correct, negative prediction?
  • There are four possibilities. What are the consequences of each?
    • (predict yes, predict no) x (actually yes, actually no).

Outcomes in binary classification¶

When performing binary classification, there are four possible outcomes.

Outcome of Prediction Definition True Class
True positive (TP) ✅ The predictor correctly predicts the positive class. P
False negative (FN) ❌ The predictor incorrectly predicts the negative class. P
True negative (TN) ✅ The predictor correctly predicts the negative class. N
False positive (FP) ❌ The predictor incorrectly predicts the positive class. N
⬇️
Predicted Negative Predicted Positive
Actually Negative TN ✅ FP ❌
Actually Positive FN ❌ TP ✅

The confusion matrix above summarizes the four possibilities.

Note that in the four acronyms – TP, FN, TN, FP – the first letter is whether the prediction is correct, and the second letter is what the prediction is.
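sklearn can build this table for us. `metrics.confusion_matrix` lays out rows by actual class and columns by predicted class, negative class first, matching the confusion matrix above. A sketch on made-up labels:

```python
from sklearn.metrics import confusion_matrix

# Hypothetical labels: 1 = positive, 0 = negative.
y_true = [1, 1, 0, 1, 0, 0]
y_pred = [1, 0, 0, 1, 0, 1]

# Rows are actual classes, columns are predicted classes:
# [[TN, FP],
#  [FN, TP]]
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
```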

Example: Measles outbreak 🔴¶

  • Measles is a highly contagious disease that can cause severe illness. The number of measles cases in the US has surged in recent months.
  • Tests exist to identify active measles infections. Tests can come back
    • positive, indicating that the individual has measles, or
    • negative, indicating that the individual does not have measles.

Question 🤔

What is a TP in this scenario? FP? TN? FN?

Accuracy of measles tests¶

The results of 100 measles tests are given below.

Predicted Negative Predicted Positive
Actually Negative TN = 90 ✅ FP = 1 ❌
Actually Positive FN = 8 ❌ TP = 1 ✅

🤔 Question: What is the accuracy of the test?

🙋 Answer: $$\text{accuracy} = \frac{TP + TN}{TP + FP + FN + TN} = \frac{1 + 90}{100} = 0.91$$

  • Followup: At first, the test seems good. But, suppose we build a classifier that predicts that nobody has measles. What would its accuracy be?

  • Answer to followup: Also 0.91! There is severe class imbalance in the dataset, meaning that most of the data points are in the same class (no measles). Accuracy doesn't tell the full story.

Recall¶

Predicted Negative Predicted Positive
Actually Negative TN = 90 ✅ FP = 1 ❌
Actually Positive FN = 8 ❌ TP = 1 ✅

🤔 Question: What proportion of individuals who actually have measles did the test identify?

🙋 Answer: $\frac{1}{1 + 8} = \frac{1}{9} \approx 0.11$

More generally, the recall of a binary classifier is the proportion of actually positive instances that are correctly classified. We'd like this number to be as close to 1 (100%) as possible.

$$\text{recall} = \frac{TP}{\text{\# actually positive}} = \frac{TP}{TP + FN}$$

To compute recall, look at the bottom (positive) row of the above confusion matrix.

Recall isn't everything, either!¶

$$\text{recall} = \frac{TP}{TP + FN}$$

🤔 Question: Can you design a "measles test" with perfect recall?

🙋 Answer: Yes – just predict that everyone has measles!

Predicted Negative Predicted Positive
Actually Negative TN = 0 ✅ FP = 91 ❌
Actually Positive FN = 0 ❌ TP = 9 ✅
everyone-has-measles classifier

$$\text{recall} = \frac{TP}{TP + FN} = \frac{9}{9 + 0} = 1$$

Like accuracy, recall on its own is not a perfect metric. Even though the classifier we just created has perfect recall, it has 91 false positives!

Precision¶

Predicted Negative Predicted Positive
Actually Negative TN = 0 ✅ FP = 91 ❌
Actually Positive FN = 0 ❌ TP = 9 ✅
everyone-has-measles classifier

The precision of a binary classifier is the proportion of predicted positive instances that are correctly classified. We'd like this number to be as close to 1 (100%) as possible.

$$\text{precision} = \frac{TP}{\text{\# predicted positive}} = \frac{TP}{TP + FP}$$

To compute precision, look at the right (positive) column of the above confusion matrix.

  • Tip: A good way to remember the difference between precision and recall is that in the denominator for 🅿️recision, both terms have 🅿️ in them (TP and FP).

  • Note that the "everyone-has-measles" classifier has perfect recall, but a precision of $\frac{9}{9 + 91} = 0.09$, which is quite low.

  • 🚨 Key idea: There is a "tradeoff" between precision and recall. Ideally, you want both to be high. For a particular prediction task, one may be more important than the other.

Precision and recall¶

$$\text{precision} = \frac{TP}{TP + FP} \: \: \: \: \: \: \: \: \text{recall} = \frac{TP}{TP + FN}$$

Question

  • When might high precision be more important than high recall?

  • When might high recall be more important than high precision?

Precision and recall¶

[figure: visual illustration of precision vs. recall]
(source)

Combining precision and recall¶

If we care equally about a model's precision $PR$ and recall $RE$, we can combine the two using a single metric called the F1-score:

$$\text{F1-score} = \text{harmonic mean}(PR, RE) = 2\frac{PR \cdot RE}{PR + RE}$$

Both F1-score and accuracy are overall measures of a binary classifier's performance. But remember, accuracy is misleading in the presence of class imbalance, and doesn't take into account the kinds of errors the classifier makes.
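To make the formulas concrete, here's a minimal sketch that computes precision, recall, and F1-score from the counts for the everyone-has-measles classifier above (TP = 9, FP = 91, FN = 0):

```python
def precision_recall_f1(tp, fp, fn):
    # Precision: of the predicted positives, how many were right?
    precision = tp / (tp + fp)
    # Recall: of the actual positives, how many did we find?
    recall = tp / (tp + fn)
    # F1-score: harmonic mean of precision and recall.
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1

# The everyone-has-measles classifier from the tables above.
pr, re, f1 = precision_recall_f1(tp=9, fp=91, fn=0)
```

Note that even with perfect recall, the tiny precision drags the harmonic mean down; that's exactly why F1-score is a stricter summary than either metric alone.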

Other evaluation metrics for binary classifiers¶

We just scratched the surface! This excellent table from Wikipedia summarizes the many other metrics that exist.

[table: summary of evaluation metrics derived from the confusion matrix, from Wikipedia]

If you're interested in exploring further, a good next metric to look at is true negative rate (i.e. specificity), which is the analogue of recall for true negatives.

Model fairness¶

Fairness: why do we care?¶

  • Sometimes, a model performs better for certain groups than others; in such cases we say the model is unfair.
  • Since ML models are now used in processes that significantly affect human lives, it is important that they are fair!
    • Job applications and college admissions.
    • Criminal sentencing and parole grants.
    • Predictive policing.
    • Credit and loans.

Model fairness¶

  • We'd like to build a model that is fair, meaning that it performs the same for individuals within a group and individuals outside of the group.
  • What do we mean by "perform"? What do we mean by "the same"?

Parity measures for classifiers¶

Suppose $C$ is a classifier we've already trained, and $A$ is some binary attribute that denotes whether an individual is a member of a sensitive group – that is, a group we want to protect from discrimination (e.g. $A = \text{age is less than 25}$).

  • $C$ achieves accuracy parity if $C$ has the same accuracy for individuals in $A$ and individuals not in $A$.
    • Example: $C$ is a binary classifier that determines whether someone receives a loan.
      • If the classifier predicts correctly, then either $C$ approves the loan and it is paid off, or $C$ denies the loan and it would have defaulted.
      • If $C$ achieves accuracy parity, then the proportion of correctly classified loans should be the same for those under 25 and those 25 or older.
  • $C$ achieves precision (or recall) parity if $C$ has the same precision (or recall) for individuals in $A$ and individuals not in $A$.
    • Recall parity is often called "true positive rate parity."
  • $C$ achieves demographic parity if the proportion of predictions that are positive is equal for individuals in $A$ and individuals not in $A$.
  • With the exception of demographic parity, the parity measures above all involve checking whether some evaluation metric (accuracy, precision, or recall) is equal across two groups.
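As a sketch, checking demographic parity amounts to comparing the positive-prediction rate across the two groups. The tiny DataFrame below is hypothetical:

```python
import pandas as pd

# Hypothetical predictions for six individuals; in_A marks the sensitive group.
toy = pd.DataFrame({
    'in_A': [True, True, False, False, False, False],
    'pred': [1, 0, 1, 1, 0, 1],
})

# Demographic parity holds if these two rates are (approximately) equal.
rates = toy.groupby('in_A')['pred'].mean()
```

Swapping `'pred'` and `.mean()` for an evaluation metric computed per group turns this same pattern into an accuracy, precision, or recall parity check.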

More on parity measures¶

  • Which parity metric should you care about? It depends on your specific dataset and what types of errors are important!
  • Many of these parity measures are impossible to satisfy simultaneously!
  • The classifier parity metrics mentioned on the previous slide are only a few of the many possible parity metrics. See these DSC 167 notes for more details, including more formal explanations.
  • These don't apply for regression models; for those, we may care about RMSE parity or $R^2$ parity. There is also a notion of demographic parity for regression models, but it is outside of the scope of DSC 80.
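For instance, a sketch of an RMSE parity check for a regression model might look like the following (the DataFrame and its columns are hypothetical):

```python
import numpy as np
import pandas as pd

# Hypothetical regression results; in_A marks the sensitive group.
df = pd.DataFrame({
    'y':    [3.0, 5.0, 2.0, 8.0],
    'pred': [2.0, 5.0, 4.0, 7.0],
    'in_A': [True, True, False, False],
})

def rmse(group):
    return np.sqrt(np.mean((group['y'] - group['pred']) ** 2))

# RMSE parity holds if these two values are (approximately) equal.
by_group = df.groupby('in_A')[['y', 'pred']].apply(rmse)
```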

Example: Loan approval¶

As you know from Project 2, LendingClub was a "peer-to-peer lending company"; they used to publish a dataset describing the loans that they approved.

  • 'tag': whether loan was repaid in full (1.0) or defaulted (0.0).
  • 'loan_amnt': amount of the loan in dollars.
  • 'emp_length': number of years employed.
  • 'home_ownership': whether borrower owns (1.0) or rents (0.0).
  • 'inq_last_6mths': number of credit inquiries in last six months.
  • 'revol_bal': revolving balance on the borrower's accounts.
  • 'age': age in years of the borrower (protected attribute).
In [27]:
loans = pd.read_csv(Path('data') / 'loan_vars1.csv', index_col=0)
loans.head()
Out[27]:
loan_amnt emp_length home_ownership inq_last_6mths revol_bal age tag
268309 6400.0 0.0 1.0 1.0 899.0 22.0 0.0
301093 10700.0 10.0 1.0 0.0 29411.0 19.0 0.0
1379211 15000.0 10.0 1.0 2.0 9911.0 48.0 0.0
486795 15000.0 10.0 1.0 2.0 15883.0 35.0 0.0
1481134 22775.0 3.0 1.0 0.0 17008.0 39.0 0.0

The total amount of money loaned was over 5 billion dollars!

In [28]:
loans['loan_amnt'].sum()
Out[28]:
5706507225.0
In [29]:
loans.shape[0]
Out[29]:
386772

Predicting 'tag'¶

Let's build a classifier that predicts whether or not a loan was paid in full. If we were a bank, we could use our trained classifier to determine whether to approve someone for a loan!

In [30]:
X = loans.drop('tag', axis=1)
y = loans.tag
X_train, X_test, y_train, y_test = train_test_split(X, y)
In [31]:
clf = RandomForestClassifier(n_estimators=50)
clf.fit(X_train, y_train)
Out[31]:
RandomForestClassifier(n_estimators=50)

Recall, a prediction of 1 means that we predict that the loan will be paid in full.

In [32]:
y_pred = clf.predict(X_test)
y_pred
Out[32]:
array([0., 1., 0., ..., 0., 1., 0.])
In [33]:
clf.score(X_test, y_test)
Out[33]:
0.714095125810555
In [34]:
# Import to calculate various metrics, like precision and recall, with sklearn
from sklearn import metrics
metrics.ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test);
plt.grid(False)

Precision¶

$$\text{precision} = \frac{TP}{TP+FP}$$

Precision describes the proportion of loans that were approved that would have been paid back.

In [35]:
metrics.precision_score(y_test, y_pred)
Out[35]:
0.7723825362305597

If we subtract the precision from 1, we get the proportion of loans that were approved that would not have been paid back. This is known as the false discovery rate.

$$\frac{FP}{TP + FP} = 1 - \text{precision}$$

In [36]:
1 - metrics.precision_score(y_test, y_pred)
Out[36]:
0.22761746376944025

Recall¶

$$\text{recall} = \frac{TP}{TP + FN}$$

Recall describes the proportion of loans that would have been paid back that were actually approved.

In [37]:
metrics.recall_score(y_test, y_pred)
Out[37]:
0.7356145786194173

If we subtract the recall from 1, we get the proportion of loans that would have been paid back that were denied. This is known as the false negative rate.

$$\frac{FN}{TP + FN} = 1 - \text{recall}$$

In [38]:
1 - metrics.recall_score(y_test, y_pred)
Out[38]:
0.26438542138058274

From both the perspective of the bank and the lendee, a high false negative rate is bad!

  • The bank left money on the table – the lendee would have paid back the loan, but they weren't approved for a loan.
  • The lendee deserved the loan but wasn't given one.

False negative rate by age¶

In [39]:
results = X_test.copy()  # Copy so we don't modify X_test itself.
results['age_bracket'] = results['age'].apply(lambda x: 5 * (x // 5 + 1))
results['prediction'] = y_pred
results['tag'] = y_test

fig = (
    results
    .groupby('age_bracket')
    [['tag', 'prediction']]
    .apply(lambda x: 1 - metrics.recall_score(x['tag'], x['prediction']))
    .plot(kind='bar', title='False Negative Rate by Age Group')
)

fig.show()

Computing parity measures¶

  • $C$: Our random forest classifier (1 if we approved the loan, 0 if we denied it).
  • $A$: Whether or not they were under 25 (1 if under 25, 0 otherwise).
In [40]:
results['is_young'] = (results['age'] < 25).replace({True: 'young', False: 'old'})

First, let's compute the proportion of loans that were approved in each group. If these two numbers are the same, $C$ achieves demographic parity.

In [41]:
results.groupby('is_young')['prediction'].mean()
Out[41]:
is_young
old      0.69
young    0.30
Name: prediction, dtype: float64

$C$ evidently does not achieve demographic parity – older people are approved for loans far more often! Note that this doesn't factor in whether they were correctly approved or incorrectly approved.

Now, let's compute the accuracy of $C$ in each group. If these two numbers are the same, $C$ achieves accuracy parity.

In [42]:
compute_accuracy = lambda x: metrics.accuracy_score(x['tag'], x['prediction'])
In [43]:
(
    results
    .groupby('is_young')
    [['tag', 'prediction']]
    .apply(compute_accuracy)
    .rename('accuracy')
)
Out[43]:
is_young
old      0.73
young    0.68
Name: accuracy, dtype: float64

Hmm... These numbers look much more similar than before!

Is this difference in accuracy significant?¶

Let's run a permutation test to see if the difference in accuracy is significant.

  • Null Hypothesis: The classifier's accuracy is the same for both young people and old people, and any differences are due to chance.
  • Alternative Hypothesis: The classifier's accuracy is higher for old people.
  • Test statistic: Difference in accuracy (young minus old).
  • Significance level: 0.01.
In [44]:
obs = (results
       .groupby('is_young')
       [['tag', 'prediction']]
       .apply(compute_accuracy)
       .diff()
       .iloc[-1])
obs
Out[44]:
-0.0525446464651409
In [45]:
diff_in_acc = []
for _ in range(500):
    s = (
        results[['is_young', 'prediction', 'tag']]
        .assign(is_young=np.random.permutation(results['is_young']))
        .groupby('is_young')
        [['tag', 'prediction']]
        .apply(compute_accuracy)
        .diff()
        .iloc[-1]
    )
    
    diff_in_acc.append(s)
In [46]:
fig = pd.Series(diff_in_acc).plot(kind='hist', histnorm='probability', nbins=20,
                            title='Difference in Accuracy (Young - Old)')
fig.add_vline(x=obs, line_color='red', line_width=3)
fig.update_layout(xaxis_range=[-0.06, 0.06], showlegend=False)

fig.show()

It seems like the difference in accuracy across the two groups is significant, despite being only ~5%. Thus, $C$ likely does not achieve accuracy parity.

Ethical questions of fairness¶

  • Is it "fair" to deny loans to younger people at a higher rate?
  • This is not a data science question, it is a political and philosophical question.
    • But that doesn't mean that data scientists can ignore it!
  • Federal law prevents age from being used as a determining factor in denying a loan.

Not only should we avoid using 'age' to determine whether or not to approve a loan, but we also need to be careful to avoid using other features that are strongly correlated with 'age', like 'emp_length'.

In [47]:
loans
Out[47]:
loan_amnt emp_length home_ownership inq_last_6mths revol_bal age tag
268309 6400.0 0.0 1.0 1.0 899.0 22.0 0.0
301093 10700.0 10.0 1.0 0.0 29411.0 19.0 0.0
1379211 15000.0 10.0 1.0 2.0 9911.0 48.0 0.0
... ... ... ... ... ... ... ...
1150493 5000.0 1.0 1.0 0.0 3842.0 52.0 1.0
686485 6000.0 10.0 0.0 0.0 6529.0 36.0 1.0
342901 15000.0 8.0 1.0 1.0 16060.0 39.0 1.0

386772 rows × 7 columns

Summary¶

Summary¶

  • Accuracy alone is not always a meaningful representation of a classifier's quality, particularly when the classes are imbalanced.
    • Precision and recall are classifier evaluation metrics that consider the types of errors being made.
    • There is a "tradeoff" between precision and recall. One may be more important than the other, depending on the task.
  • To assess the parity of your model:
    • Choose an evaluation metric, e.g. precision, recall, or accuracy for classifiers, or RMSE or $R^2$ for regressors.
    • Choose a sensitive binary attribute, e.g. "age < 25" or "is data science major", that divides your data into two groups.
    • Conduct a permutation test to assess whether your model's evaluation metric is similar for individuals in both groups.