import numpy as np
import pandas as pd
import os
import plotly.express as px
# Use plotly as the backend for pandas' built-in plotting.
pd.options.plotting.backend = 'plotly'

# Read the baby dataset and keep only the two columns needed for the
# permutation test: the group label and the measured outcome.
baby_fp = os.path.join('data', 'baby.csv')
columns_of_interest = ['Maternal Smoker', 'Birth Weight']
baby = pd.read_csv(baby_fp)[columns_of_interest]
baby.head()
| | Maternal Smoker | Birth Weight |
---|---|---|
0 | False | 120 |
1 | False | 113 |
2 | True | 128 |
3 | True | 108 |
4 | False | 136 |
Recall our permutation test from last class:
We'll use 3000 repetitions instead of 500.
%%time
n_repetitions = 3000
differences = []

for _ in range(n_repetitions):
    # Step 1: Shuffle the birth weights and attach them as a new column,
    # leaving the 'Maternal Smoker' labels in their original order.
    shuffled_weights = np.random.permutation(baby['Birth Weight'])
    with_shuffled = baby.assign(Shuffled_Weights=shuffled_weights)

    # Step 2: Compute the test statistic.
    # groupby sorts its keys, so the False group comes before the True
    # group; the last entry of diff() is therefore mean(True) - mean(False).
    means_by_group = with_shuffled.groupby('Maternal Smoker').mean()
    group_means = means_by_group['Shuffled_Weights']
    difference = group_means.diff().iloc[-1]

    # Step 3: Record this repetition's simulated difference.
    differences.append(difference)
CPU times: user 2.46 s, sys: 1.71 ms, total: 2.46 s Wall time: 2.46 s
# Wrap the simulated differences in a one-column DataFrame (column label 0)
# so plotly express can reference the column by name.
differences_df = pd.DataFrame(differences)
fig = px.histogram(
    differences_df,
    x=0,
    nbins=50,
    histnorm='probability',
    title='Empirical Distribution of the Test Statistic, Original Approach',
)
# Fix the x-axis range to [-5, 5].
fig.update_layout(xaxis_range=[-5, 5])