from dsc80_utils import *

# Load the array of names stored in data/names.npy.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, and
# unpickling can execute arbitrary code, so only load files from a trusted source.
names = np.load(Path('data') / 'names.npy', allow_pickle=True)

# Draw 10 names at random.
# By default, the sampling is done WITH replacement.
np.random.choice(names, 10)

# To sample WITHOUT replacement, set replace=False.
# This is known as "simple random sampling."
np.random.choice(names, 10, replace=False)

# DataFrame.sample draws rows at random.
# Samples WITHOUT replacement by default (the opposite of np.random.choice).
pd.DataFrame(names, columns=['name']).sample(10)

# Draws 100 elements from a population in which 50% are group 0 and 50% are group 1.
# This sampling is done WITH replacement.
# In other words, each sampled element has a 50% chance of being group 0 and a 50% chance of being group 1.
# The result is the number of draws that landed in each group.
np.random.multinomial(100, [0.5, 0.5])

# Proportion of each ethnic group in California and at UCSD.
# Each of the two columns sums to 1, so each column is a categorical distribution.
eth = pd.DataFrame(
    [['Asian', 0.15, 0.51],
     ['Black', 0.05, 0.02],
     ['Latino', 0.39, 0.16],
     ['White', 0.35, 0.2],
     ['Other', 0.06, 0.11]],
    columns=['Ethnicity', 'California', 'UCSD']
).set_index('Ethnicity')

eth

# Horizontal bar chart with the California and UCSD bars side by side.
# NOTE(review): barmode is a plotly argument, so this presumably relies on pandas'
# plotting backend being set to plotly (e.g. by dsc80_utils) -- confirm.
eth.plot(kind='barh', title='Ethnic Distribution of California and UCSD', barmode='group')

def tvd(dist1, dist2):
    '''Return the total variation distance between two distributions.

    dist1, dist2: array-likes that support elementwise subtraction
        (e.g. NumPy arrays or pandas Series) holding the proportions
        of each category.

    The TVD is half the sum of the absolute differences between
    corresponding proportions.
    '''
    gaps = np.abs(dist1 - dist2)
    return gaps.sum() / 2

# The diff method finds the differences of consecutive elements in a Series.
# The first element of the result is NaN, since it has no predecessor.
pd.Series([4, 5, -2]).diff()

# Observed test statistic: the TVD between the California and UCSD distributions.
# diff(axis=1) subtracts each column from the column to its right, so the
# 'California' column becomes all NaN and the 'UCSD' column holds UCSD - California.
# .abs().sum() adds up the absolute differences in each column, .iloc[1] picks
# the 'UCSD' column's total, and dividing by 2 gives the TVD.
observed_tvd = eth.diff(axis=1).abs().sum().iloc[1] / 2
observed_tvd

# Number of students at UCSD in this example.
N_STUDENTS = 30_000

# The California proportions, used below as the category probabilities when simulating.
eth['California']

Ethnicity
Asian     0.15
Black     0.05
Latino    0.39
White     0.35
Other     0.06
Name: California, dtype: float64

# One simulated sample: the number of students in each ethnic group when
# N_STUDENTS students are drawn using California's proportions as probabilities.
np.random.multinomial(N_STUDENTS, eth['California'])

array([ 4512,  1481, 11670, 10590,  1747])

# Dividing the counts by the sample size turns them into proportions.
np.random.multinomial(N_STUDENTS, eth['California']) / N_STUDENTS

array([0.15, 0.05, 0.39, 0.35, 0.06])

# size=100_000 draws 100,000 such samples in a single call.
# Each row of eth_draws is one simulated distribution over the 5 groups.
eth_draws = np.random.multinomial(N_STUDENTS, eth['California'], size=100_000) / N_STUDENTS
eth_draws

array([[0.15, 0.05, 0.39, 0.34, 0.06],
       [0.15, 0.05, 0.39, 0.35, 0.06],
       [0.15, 0.05, 0.39, 0.35, 0.06],
       ...,
       [0.15, 0.05, 0.39, 0.35, 0.06],
       [0.15, 0.05, 0.39, 0.35, 0.06],
       [0.15, 0.05, 0.39, 0.35, 0.06]])

eth_draws.shape

(100000, 5)

# TVD between each simulated distribution (each row of eth_draws) and the
# California distribution. Broadcasting subtracts the California proportions
# from every row, and sum(axis=1) adds up the absolute differences within each row.
# The values here appear rounded.
# NOTE(review): presumably only the display is rounded, not the stored values -- confirm.
tvds = np.abs(eth_draws - eth['California'].to_numpy()).sum(axis=1) / 2
tvds

array([0.01, 0.  , 0.  , ..., 0.  , 0.  , 0.  ])

observed_tvd

np.float64(0.41000000000000003)

# Histogram of the simulated TVDs. Wrapping tvds in a DataFrame gives a single
# column labeled 0, which is what x=0 refers to.
fig = px.histogram(pd.DataFrame(tvds), x=0, nbins=20, histnorm='probability', 
                   title='Empirical Distribution of the TVD')
fig

# p-value: the proportion of simulated TVDs that are at least as large as the observed TVD.
# tvds is already a NumPy array here, so np.array(tvds) just makes a copy.
(np.array(tvds) >= observed_tvd).mean()

np.float64(0.0)

eth

With 2 students, the p-value is 0.72723.
With 4 students, the p-value is 0.30721.
With 8 students, the p-value is 0.08161.
With 16 students, the p-value is 0.00382.
With 32 students, the p-value is 2e-05.
With 64 students, the p-value is 0.0.
With 128 students, the p-value is 0.0.
With 256 students, the p-value is 0.0.

Maternal Smoker
False    123.09
True     113.82
Name: Birth Weight, dtype: float64

np.float64(-9.266142572024918)

[np.float64(1.5836586071880134),
 np.float64(-0.555537273184342),
 np.float64(-1.4570044334750207),
 np.float64(1.190161037219852),
 np.float64(1.3618690677514138),
 np.float64(0.22788061611591104),
 np.float64(-0.4088699971052989),
 np.float64(0.635687188628367),
 np.float64(1.6265856148208968),
 np.float64(0.22430336547984098)]

np.float64(-9.266142572024918)

count
2    1033
1       2
Name: count, dtype: int64

mar_status
married      1484
unmarried     584
dtype: int64

np.float64(0.1269754089281099)

np.float64(0.1269754089281099)

eth

def ethnicity_test(N_STUDENTS):
    '''Simulate the ethnicity hypothesis test for a sample of N_STUDENTS students.

    Draws 100,000 samples of size N_STUDENTS using the 'California' column of
    the module-level `eth` DataFrame as category probabilities, computes the
    TVD between each simulated distribution and the California distribution,
    and returns the proportion of simulated TVDs that are at least as large as
    the module-level `observed_tvd` (i.e. the p-value).
    '''
    california = eth['California']
    # Each row is one simulated sample, converted from counts to proportions.
    simulated = np.random.multinomial(N_STUDENTS, california, size=100_000) / N_STUDENTS
    # One TVD per simulated sample.
    distances = np.abs(simulated - california.to_numpy()).sum(axis=1) / 2
    at_least_as_extreme = distances >= observed_tvd
    return at_least_as_extreme.mean()

# Run the test for sample sizes 2, 4, 8, ..., 256 to show how the p-value
# depends on the sample size.
# A separate loop variable is used (instead of reassigning N_STUDENTS) so that
# the module-level N_STUDENTS constant keeps its value after this loop runs.
for exponent in range(1, 9):
    n_students = 2 ** exponent
    print(f'With {n_students} students, the p-value is {ethnicity_test(n_students)}.')

With 2 students, the p-value is 0.72723.
With 4 students, the p-value is 0.30721.
With 8 students, the p-value is 0.08161.
With 16 students, the p-value is 0.00382.
With 32 students, the p-value is 2e-05.
With 64 students, the p-value is 0.0.
With 128 students, the p-value is 0.0.
With 256 students, the p-value is 0.0.

# Load the birth weights dataset.
baby = pd.read_csv(Path('data') / 'babyweights.csv')
baby

# Keep only the two columns needed for this example.
baby = baby[['Maternal Smoker', 'Birth Weight']]
baby.head()

# Mean birth weight and number of rows in each group (Maternal Smoker False / True).
baby.groupby('Maternal Smoker')['Birth Weight'].agg(['mean', 'count'])

# Overlaid histograms, one per value of 'Maternal Smoker', with box plots in the margin.
fig = px.histogram(baby, color='Maternal Smoker', histnorm='probability', marginal='box', 
                   title="Birth Weight by Mother's Smoking Status", barmode='overlay', opacity=0.7)
fig

# NOTE(review): this cell repeats the previous histogram verbatim.
fig = px.histogram(baby, color='Maternal Smoker', histnorm='probability', marginal='box', 
                   title="Birth Weight by Mother's Smoking Status", barmode='overlay', opacity=0.7)
fig

# Mean birth weight in each group, as a Series indexed by False / True.
group_means = baby.groupby('Maternal Smoker')['Birth Weight'].mean()
group_means

Maternal Smoker
False    123.09
True     113.82
Name: Birth Weight, dtype: float64

# Difference in group means: the mean for Maternal Smoker == True minus the mean for False.
group_means.loc[True] - group_means.loc[False]

np.float64(-9.266142572024918)

baby.head()

# np.random.permutation returns a shuffled copy of the birth weights.
np.random.permutation(baby['Birth Weight'])

# Add the shuffled weights as a new column. The 'Maternal Smoker' column is left
# as-is, so each shuffled weight is paired with a randomly chosen group label.
with_shuffled = baby.assign(Shuffled_Weights=np.random.permutation(baby['Birth Weight']))
with_shuffled.head()

# Group means of both the original and the shuffled weights.
group_means = with_shuffled.groupby('Maternal Smoker').mean()
group_means

# For each weights column, plot the two groups' distributions and report the
# difference in group means (True minus False) in the title.
for x in ['Birth Weight', 'Shuffled_Weights']:
    diff = group_means.loc[True, x] - group_means.loc[False, x]
    fig = px.histogram(
        with_shuffled, x=x, color='Maternal Smoker', histnorm='probability', marginal='box', 
        title=f"Using the {x} column <br>(difference in means = {diff:.2f})",
        barmode='overlay', opacity=0.7)
    fig.update_layout(margin=dict(t=60))
    fig.show()

# Number of shuffles to simulate.
n_repetitions = 500

# One simulated test statistic per shuffle.
differences = []
for _ in range(n_repetitions):
    
    # Step 1: Shuffle the weights and store them in a DataFrame.
    with_shuffled = baby.assign(Shuffled_Weights=np.random.permutation(baby['Birth Weight']))

    # Step 2: Compute the test statistic.
    # The group means are looked up by label below, so the statistic is
    # (mean for True) - (mean for False), regardless of the order of the groups.
    group_means = (
        with_shuffled
        .groupby('Maternal Smoker')
        .mean()
        .loc[:, 'Shuffled_Weights']
    )
    difference = group_means.loc[True] - group_means.loc[False]
    
    # Step 3: Store the result.
    differences.append(difference)
    
differences[:10]

[np.float64(1.5836586071880134),
 np.float64(-0.555537273184342),
 np.float64(-1.4570044334750207),
 np.float64(1.190161037219852),
 np.float64(1.3618690677514138),
 np.float64(0.22788061611591104),
 np.float64(-0.4088699971052989),
 np.float64(0.635687188628367),
 np.float64(1.6265856148208968),
 np.float64(0.22430336547984098)]

# Observed test statistic: the mean birth weight for Maternal Smoker == True
# minus the mean for Maternal Smoker == False.
# The group means are looked up with .loc (label-based), matching the other
# True / False lookups in this file.
mean_weights = baby.groupby('Maternal Smoker')['Birth Weight'].mean()
observed_difference = mean_weights.loc[True] - mean_weights.loc[False]
observed_difference

np.float64(-9.266142572024918)

# Histogram of the simulated differences in means. Wrapping the list in a
# DataFrame gives a single column labeled 0, which is what x=0 refers to.
fig = px.histogram(
    pd.DataFrame(differences), x=0, nbins=50, histnorm='probability', 
    title='Empirical Distribution of the Mean Differences <br> in Birth Weights (Smoker - Non-Smoker)')
# Mark the observed statistic with a red vertical line.
fig.add_vline(x=observed_difference, line_color='red')
# Fix the x-axis range to [-10, 10] and leave room at the top for the two-line title.
fig.update_layout(xaxis_range=[-10, 10], margin=dict(t=60))

# Load the couples dataset.
couples = pd.read_csv(Path('data') / 'married_couples.csv')
couples.head()

# What does this expression compute?
# The inner value_counts() counts the number of rows for each hh_id; the outer
# value_counts() then counts how many hh_id values have each of those row counts.
couples['hh_id'].value_counts().value_counts()

count
2    1033
1       2
Name: count, dtype: int64

# Keep only the columns used in this example.
couples = couples[['mar_status', 'empl_status', 'gender', 'age']]
couples.head()

couples.head()

# Employment status labels, listed in code order: code k (starting at 1)
# corresponds to empl[k - 1].
empl = [
    'Working as paid employee',
    'Working, self-employed',
    'Not working - on a temporary layoff from a job',
    'Not working - looking for work',
    'Not working - retired',
    'Not working - disabled',
    'Not working - other'
]

# Replace the numeric codes in each column with readable labels.
# The outer keys are column names; each inner dict maps old value -> new value.
couples = couples.replace({
    'mar_status': {1: 'married', 2: 'unmarried'},
    'gender': {1: 'M', 2: 'F'},
    # enumerate(..., start=1) pairs each label with its 1-based code.
    'empl_status': dict(enumerate(empl, start=1)),
})

couples.head()

# For categorical columns, this shows the 10 most common values and their frequencies.
# For numerical columns, this shows the result of calling the .describe() method.
# A column is treated as categorical when its dtype is 'object'; normalize=True
# makes value_counts return proportions rather than counts.
for col in couples:
    if couples[col].dtype == 'object':
        empr = couples[col].value_counts(normalize=True).to_frame().iloc[:10]
    else:
        empr = couples[col].describe().to_frame()
    display(empr)

# Overlaid age histograms, one per marital status, with box plots in the margin.
px.histogram(couples, x='age', color='mar_status', histnorm='probability', marginal='box',
             barmode='overlay', opacity=0.7)

# Look at 5 randomly chosen rows (sample(5) already returns 5 rows, so .head() keeps all of them).
couples.sample(5).head()

# Count the rows for each combination of employment status (rows) and marital status (columns).
# Note that this is a shortcut to picking a column for values and using aggfunc='count'.
empl_cnts = couples.pivot_table(index='empl_status', columns='mar_status', aggfunc='size')
empl_cnts

# Column totals: the number of rows in each marital status group.
empl_cnts.sum()

mar_status
married      1484
unmarried     584
dtype: int64

# Divide each column by its total, so each marital status column becomes the
# distribution of employment status within that group.
cond_distr = empl_cnts / empl_cnts.sum()
cond_distr

# Horizontal bar chart with the two groups' bars side by side.
# NOTE(review): barmode is a plotly argument, so this presumably relies on pandas'
# plotting backend being set to plotly (e.g. by dsc80_utils) -- confirm.
cond_distr.plot(kind='barh', title='Distribution of Employment Status, Conditional on Household Type', barmode='group')

cond_distr

# TVD between the two conditional distributions: half the sum of the absolute differences.
(cond_distr['unmarried'] - cond_distr['married']).abs().sum() / 2

np.float64(0.1269754089281099)

def tvd_of_groups(df, groups, cats):
    '''Return the TVD between the two groups' distributions of a categorical column.

    df: the DataFrame holding both columns.
    groups: the binary column (e.g. married vs. unmarried).
    cats: the categorical column (e.g. employment status).

    The two groups are whatever two distinct values appear in the `groups`
    column; their labels are not assumed.

    Raises ValueError if the `groups` column does not contain exactly two
    distinct values.
    '''
    # Count the rows for each (category, group) pair. A pair that never occurs
    # shows up as NaN, which .sum() would skip and so silently shrink the TVD;
    # it really is a count of 0.
    cnts = df.pivot_table(index=cats, columns=groups, aggfunc='size').fillna(0)
    if cnts.shape[1] != 2:
        raise ValueError(
            f'expected exactly 2 distinct values in {groups!r}, found {cnts.shape[1]}'
        )
    # Normalize each column.
    distr = cnts / cnts.sum()
    # Compute and return the TVD. The absolute difference is symmetric, so the
    # order of the two group columns does not matter.
    return (distr.iloc[:, 0] - distr.iloc[:, 1]).abs().sum() / 2

# Same result as above.
# NOTE(review): this rebinds the module-level observed_tvd, which earlier held the
# California-vs-UCSD TVD and which ethnicity_test reads each time it is called.
observed_tvd = tvd_of_groups(couples, groups='mar_status', cats='empl_status')
observed_tvd

np.float64(0.1269754089281099)

couples.head()

# One shuffle: add a column holding a random permutation of the marital statuses.
couples.assign(shuffled_mar=np.random.permutation(couples['mar_status']))

# Number of shuffles to simulate.
N = 1000
tvds = []

for _ in range(N):
    # Shuffle marital statuses.
    with_shuffled = couples.assign(shuffled_mar=np.random.permutation(couples['mar_status']))

    # Compute and store the TVD.
    # The result is not named `tvd`, so the tvd function defined earlier in
    # this file is not overwritten.
    shuffled_tvd = tvd_of_groups(with_shuffled, groups='shuffled_mar', cats='empl_status')
    tvds.append(shuffled_tvd)

# Histogram of the simulated TVDs.
fig = px.histogram(tvds, x=0, nbins=50, histnorm='probability', 
                   title='Empirical Distribution of the TVD')
fig.update_layout(xaxis_range=[0, 0.2])

	age
count	2068.00
mean	43.17
std	11.91
...	...
50%	44.00
75%	53.00
max	64.00

	California	UCSD
Ethnicity
Asian	0.15	0.51
Black	0.05	0.02
Latino	0.39	0.16
White	0.35	0.20
Other	0.06	0.11

	Birth Weight	Gestational Days	Maternal Age	Maternal Height	Maternal Pregnancy Weight	Maternal Smoker
0	120	284	27	62	100	False
1	113	282	33	64	135	False
2	128	279	28	64	115	True
...	...	...	...	...	...	...
1171	130	291	30	65	150	True
1172	125	281	21	65	110	False
1173	117	297	38	65	129	False

	hh_id	gender	mar_status	rel_rating	...	education	hh_income	empl_status	hh_internet
0	0	1	1	1	...	12	14	1	1
1	0	2	1	1	...	9	14	1	1
2	1	1	1	1	...	11	15	1	1
3	1	2	1	1	...	9	15	1	1
4	2	1	1	1	...	12	14	1	1

	mar_status	empl_status	gender	age
0	married	Working as paid employee	M	51
1	married	Working as paid employee	F	53
2	married	Working as paid employee	M	57
3	married	Working as paid employee	F	57
4	married	Working as paid employee	M	60

	proportion
empl_status
Working as paid employee	0.61
Not working - other	0.10
Working, self-employed	0.10
Not working - looking for work	0.07
Not working - disabled	0.06
Not working - retired	0.05
Not working - on a temporary layoff from a job	0.02

	mar_status	empl_status	gender	age
1598	married	Not working - on a temporary layoff from a job	M	33
1132	married	Working as paid employee	M	54
495	married	Working, self-employed	F	57
1650	married	Not working - disabled	M	49
1279	married	Working as paid employee	F	45

	proportion
gender
M	0.5
F	0.5

	hh_id	gender	mar_status	rel_rating	...	education	hh_income	empl_status	hh_internet
0	0	1	1	1	...	12	14	1	1
1	0	2	1	1	...	9	14	1	1
2	1	1	1	1	...	11	15	1	1
3	1	2	1	1	...	9	15	1	1
4	2	1	1	1	...	12	14	1	1

Lecture 6 – EDA Part 2, Hypothesis Testing¶

DSC 80, Winter 2025¶

Announcements 📣¶

Agenda 📆¶

Hypothesis Testing¶

Why are we learning hypothesis testing again?¶

Data scope¶

Where are we in the data science lifecycle?¶

Data scope¶

Example: Wikipedia awards¶

Example: Who will win the election?¶

🔑 Key Idea: Random samples look like the access frame they were sampled from!¶

Sampling in practice¶

Overview of hypothesis testing¶

What problem does hypothesis testing solve?¶

Why hypothesis testing is difficult to learn¶

The hypothesis testing "recipe"¶

Question 🤔 (Answer at dsc80.com/q)

Example: Total variation distance¶

Ethnic distribution of California vs. UCSD¶

Is the difference between the two distributions significant?¶

Total variation distance¶

The plan¶

Generating one random sample¶

Generating many random samples and computing TVDs, without a for-loop¶

Visualizing the empirical distribution of the test statistic¶

Conclusion¶

Summary of the method¶

Aside¶

Permutation testing¶

Hypothesis testing vs. permutation testing¶

Hypothesis testing vs. permutation testing¶

Example: Birth weight and smoking 🚬¶

Exploratory data analysis¶

Visualizing birth weight distributions¶

Null hypothesis: birth weights come from the same distribution¶

Alternative hypothesis: birth weights come from different distributions¶

Choosing a test statistic¶

Difference in group means¶

Hypothesis test setup¶

Implications of the null hypothesis¶

Permutation tests¶

Shuffling¶

How close are the means of the shuffled groups?¶

Simulating the empirical distribution of the test statistic¶

Conclusion of the test¶

⚠️ Caution!¶

Hypothesis testing vs. permutation testing¶

Question 🤔 (Answer at dsc80.com/q)

Question 🤔 (Answer at dsc80.com/q)

Permutation testing meets TVD¶

Note: This section has another hypothesis testing example. We might not have time to cover it in lecture, but you should understand it. For a walkthrough, you can also watch this podcast, starting at 4:43.

Example: Married vs. unmarried couples¶

Cleaning the dataset¶

Understanding the couples dataset¶

Understanding employment status in households¶

Differences in the distributions¶

Permutation test for household composition¶

Total variation distance¶

Simulation¶

Conclusion of the test¶

Summary, next time¶

Summary¶

Next time¶

Generating many random samples and computing TVDs, without a `for`-loop¶

Note: This section has another hypothesis testing example. We might not have time to cover it in lecture, but you should understand it.

For a walkthrough, you can also watch this podcast, starting at 4:43.

Understanding the `couples` dataset¶

	hh_id	gender	mar_status	rel_rating	...	education	hh_income	empl_status	hh_internet
0	0	1	1	1	...	12	14	1	1
1	0	2	1	1	...	9	14	1	1
2	1	1	1	1	...	11	15	1	1
3	1	2	1	1	...	9	15	1	1
4	2	1	1	1	...	12	14	1	1