from dsc80_utils import *

# Load the names data used by the sampling demonstrations below.
# NOTE: allow_pickle=True unpickles arbitrary objects, so it is only
# appropriate for files that come from a trusted source.
names = np.load(Path('data') / 'names.npy', allow_pickle=True)

# np.random.choice draws WITH replacement unless told otherwise,
# so the same element may be picked more than once.
np.random.choice(names, size=10)

# Passing replace=False draws WITHOUT replacement instead.
# This is known as "simple random sampling."
np.random.choice(names, size=10, replace=False)

# DataFrame.sample has the opposite default from np.random.choice:
# it draws WITHOUT replacement.
pd.DataFrame(names, columns=['name']).sample(n=10)

# One draw of 100 elements from a population that is 50% group 0 and
# 50% group 1; the result is the number of elements landing in each group.
# The draw is WITH replacement: every sampled element independently has a
# 50% chance of being group 0 and a 50% chance of being group 1.
np.random.multinomial(100, [0.5, 0.5])

# Proportion of each ethnicity in California and at UCSD, indexed by
# ethnicity. Each column is one distribution.
eth = pd.DataFrame(
    {
        'California': [0.15, 0.05, 0.39, 0.35, 0.06],
        'UCSD': [0.51, 0.02, 0.16, 0.2, 0.11],
    },
    index=pd.Index(['Asian', 'Black', 'Latino', 'White', 'Other'], name='Ethnicity'),
)

eth

# Horizontal grouped bars: one pair of bars per ethnicity.
# NOTE(review): `barmode` is a plotly argument, so this presumably relies on
# a plotly plotting backend being configured elsewhere — confirm.
eth.plot(kind='barh', title='Ethnic Distribution of California and UCSD', barmode='group')

def tvd(dist1, dist2):
    """Return the total variation distance between two distributions.

    The TVD is half of the sum of the absolute element-wise differences
    between ``dist1`` and ``dist2``.
    """
    abs_gaps = np.abs(dist1 - dist2)
    return abs_gaps.sum() / 2

# Series.diff subtracts from each element the one before it; the first
# element has no predecessor, so its entry is NaN.
pd.Series([4, 5, -2]).diff()

# Differencing across the columns leaves UCSD - California in the second
# column (the first column is all NaN). Half of that column's summed
# absolute values is the TVD between the two distributions.
observed_tvd = (
    eth
    .diff(axis=1)
    .abs()
    .sum()
    .iloc[1]
    / 2
)
observed_tvd

# Size of each simulated sample: the number of students at UCSD
# in this example.
N_STUDENTS = 30_000

# The distribution the simulated samples are drawn from.
eth['California']

# A single simulated sample, as a count for each ethnicity.
np.random.multinomial(N_STUDENTS, eth['California'])

# Dividing by the sample size turns the counts into proportions.
np.random.multinomial(N_STUDENTS, eth['California']) / N_STUDENTS

# 100,000 simulated samples in one call: each row is one sample's
# proportions.
eth_draws = (
    np.random.multinomial(N_STUDENTS, eth['California'], size=100_000)
    / N_STUDENTS
)
eth_draws

eth_draws.shape

# TVD between every simulated sample (one per row of eth_draws) and the
# California distribution, computed for all rows at once.
# The values here appear rounded.
expected_diffs = None  # placeholder removed below; see note
del expected_diffs
tvds = np.sum(np.abs(eth_draws - eth['California'].to_numpy()), axis=1) / 2
tvds

observed_tvd

fig = px.histogram(
    pd.DataFrame(tvds),
    x=0,
    nbins=20,
    histnorm='probability',
    title='Empirical Distribution of the TVD',
)
fig

# Fraction of simulated TVDs that are at least as large as the observed one.
np.mean(tvds >= observed_tvd)

eth

eth

def ethnicity_test(N_STUDENTS, n_repetitions=100_000):
    """Estimate by simulation the p-value of the ethnicity hypothesis test.

    Draws ``n_repetitions`` samples of ``N_STUDENTS`` people each from the
    California distribution in the module-level ``eth`` table, computes the
    TVD between each sample's proportions and that distribution, and returns
    the fraction of simulated TVDs that are at least as large as the
    module-level ``observed_tvd`` (whatever it is bound to at call time).

    Parameters
    ----------
    N_STUDENTS : int
        Number of people in each simulated sample.
    n_repetitions : int, optional
        Number of simulated samples. Defaults to 100,000.

    Returns
    -------
    float
        The proportion of simulated TVDs >= ``observed_tvd``.
    """
    expected = eth['California'].to_numpy()
    # Each row is one simulated sample, converted from counts to proportions.
    sample_props = np.random.multinomial(N_STUDENTS, expected, size=n_repetitions) / N_STUDENTS
    simulated_tvds = np.abs(sample_props - expected).sum(axis=1) / 2
    return (simulated_tvds >= observed_tvd).mean()

# Re-run the test for sample sizes 2, 4, ..., 256 to see how the p-value
# changes with the number of students. The loop deliberately does not
# assign to N_STUDENTS, so that module-level constant keeps its value
# after the loop finishes.
for exponent in range(1, 9):
    n_students = 2 ** exponent
    print(f'With {n_students} students, the p-value is {ethnicity_test(n_students)}.')

# Read the birth-weight data.
baby = pd.read_csv(Path('data') / 'babyweights.csv')
baby

# Keep only the two columns used in the rest of the analysis.
baby = baby.loc[:, ['Maternal Smoker', 'Birth Weight']]
baby.head()

# Mean birth weight and number of rows for each smoking status.
(
    baby
    .groupby('Maternal Smoker')['Birth Weight']
    .agg(['mean', 'count'])
)

# Overlaid, semi-transparent histograms colored by smoking status, with a
# box plot for each group in the margin.
# NOTE(review): no `x` is passed, so plotly chooses the plotted values
# itself — presumably 'Birth Weight', the only other column; confirm.
fig = px.histogram(
    baby,
    color='Maternal Smoker',
    histnorm='probability',
    marginal='box',
    title="Birth Weight by Mother's Smoking Status",
    barmode='overlay',
    opacity=0.7,
)
fig

# The same figure, built a second time.
fig = px.histogram(
    baby,
    color='Maternal Smoker',
    histnorm='probability',
    marginal='box',
    title="Birth Weight by Mother's Smoking Status",
    barmode='overlay',
    opacity=0.7,
)
fig

# Mean birth weight for each smoking status.
group_means = (
    baby
    .groupby('Maternal Smoker')['Birth Weight']
    .mean()
)
group_means

# Smokers' mean minus non-smokers' mean.
group_means.loc[True] - group_means.loc[False]

baby.head()

# A random reordering of the birth weights.
np.random.permutation(baby['Birth Weight'])

# Attach a freshly shuffled copy of the weights as a new column; the
# smoking-status column is left in its original order.
with_shuffled = baby.assign(
    Shuffled_Weights=np.random.permutation(baby['Birth Weight'])
)
with_shuffled.head()

# Group means of both the original and the shuffled weights.
group_means = (
    with_shuffled
    .groupby('Maternal Smoker')
    .mean()
)
group_means

# Draw the same overlaid histogram twice — once for the original weights
# and once for the shuffled ones — with each title reporting that column's
# difference in group means (smokers minus non-smokers).
for col in ['Birth Weight', 'Shuffled_Weights']:
    mean_gap = group_means.loc[True, col] - group_means.loc[False, col]
    fig = px.histogram(
        with_shuffled,
        x=col,
        color='Maternal Smoker',
        histnorm='probability',
        marginal='box',
        title=f"Using the {col} column <br>(difference in means = {mean_gap:.2f})",
        barmode='overlay',
        opacity=0.7,
    )
    # Presumably the larger top margin makes room for the two-line title.
    fig.update_layout(margin={'t': 60})
    fig.show()

# Number of shuffles used to build the empirical distribution.
n_repetitions = 500

differences = []
for _ in range(n_repetitions):

    # Step 1: Shuffle the weights and store them in a DataFrame.
    with_shuffled = baby.assign(Shuffled_Weights=np.random.permutation(baby['Birth Weight']))

    # Step 2: Compute the test statistic: the mean shuffled weight of
    # smokers (True) minus that of non-smokers (False). Selecting the
    # column before aggregating avoids averaging columns that would only
    # be thrown away.
    group_means = with_shuffled.groupby('Maternal Smoker')['Shuffled_Weights'].mean()
    difference = group_means.loc[True] - group_means.loc[False]

    # Step 3: Store the result.
    differences.append(difference)

differences[:10]

# The test statistic on the real (unshuffled) data: smokers' mean birth
# weight minus non-smokers' mean birth weight.
mean_weights = baby.groupby('Maternal Smoker')['Birth Weight'].mean()
observed_difference = mean_weights.loc[True] - mean_weights.loc[False]
observed_difference

# Histogram of the simulated differences, with the observed difference
# marked by a red vertical line.
fig = px.histogram(
    pd.DataFrame(differences),
    x=0,
    nbins=50,
    histnorm='probability',
    title='Empirical Distribution of the Mean Differences <br> in Birth Weights (Smoker - Non-Smoker)',
)
fig.add_vline(x=observed_difference, line_color='red')
fig.update_layout(xaxis_range=[-10, 10], margin={'t': 60})

# Read the couples data.
couples = pd.read_csv(Path('data') / 'married_couples.csv')
couples.head()

# What does this expression compute?
# The inner value_counts gives the number of rows for each hh_id; the outer
# one then counts how many hh_ids have each of those row counts.
couples['hh_id'].value_counts().value_counts()

# Keep only the four columns used below.
couples = couples.loc[:, ['mar_status', 'empl_status', 'gender', 'age']]
couples.head()

couples.head()

# Employment-status labels, listed in code order: the first label
# corresponds to code 1, the second to code 2, and so on.
empl = [
    'Working as paid employee',
    'Working, self-employed',
    'Not working - on a temporary layoff from a job',
    'Not working - looking for work',
    'Not working - retired',
    'Not working - disabled',
    'Not working - other'
]

# Replace the numeric codes in each column with readable labels.
# enumerate(..., start=1) pairs every label with its 1-based code.
couples = couples.replace({
    'mar_status': {1: 'married', 2: 'unmarried'},
    'gender': {1: 'M', 2: 'F'},
    'empl_status': dict(enumerate(empl, start=1)),
})

couples.head()

# For categorical columns, this shows the 10 most common values and their frequencies.
# For numerical columns, this shows the result of calling the .describe() method.
# A column counts as categorical here if it holds Python objects or uses a
# dedicated string dtype; comparing the dtype to 'object' alone would miss
# the latter.
for col in couples:
    column = couples[col]
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        empr = column.value_counts(normalize=True).to_frame().iloc[:10]
    else:
        empr = column.describe().to_frame()
    display(empr)

# Overlaid age histograms, one per marital status, with marginal box plots.
px.histogram(couples, x='age', color='mar_status', histnorm='probability', marginal='box',
             barmode='overlay', opacity=0.7)

couples.sample(5).head()

# Note that this is a shortcut to picking a column for values and using aggfunc='count'.
# A status that never occurs for one marital status would come back as a
# missing value; it is really a count of 0, so fill it in. Otherwise the
# sums below would silently skip that status and understate the TVD.
empl_cnts = (
    couples
    .pivot_table(index='empl_status', columns='mar_status', aggfunc='size')
    .fillna(0)
)
empl_cnts

# Number of people with each marital status.
empl_cnts.sum()

# Divide each column by its total, giving the distribution of employment
# status within each marital status.
cond_distr = empl_cnts / empl_cnts.sum()
cond_distr

cond_distr.plot(kind='barh', title='Distribution of Employment Status, Conditional on Household Type', barmode='group')

cond_distr

# TVD between the two conditional distributions.
(cond_distr['unmarried'] - cond_distr['married']).abs().sum() / 2

def tvd_of_groups(df, groups, cats):
    '''Return the TVD between the distributions of `cats` in the two groups.

       df: the DataFrame containing both columns.
       groups: the binary column (e.g. married vs. unmarried).
       cats: the categorical column (e.g. employment status).

       Raises ValueError if `groups` does not take exactly two distinct
       values in df.
    '''
    # One row per category, one column per group. A category that never
    # occurs in a group has a count of 0 there, not a missing value; leaving
    # it missing would make the sum below skip it and understate the TVD.
    cnts = df.pivot_table(index=cats, columns=groups, aggfunc='size').fillna(0)
    if cnts.shape[1] != 2:
        raise ValueError(
            f'expected exactly 2 groups in column {groups!r}, found {cnts.shape[1]}'
        )
    # Normalize each column.
    distr = cnts / cnts.sum()
    # Compute and return the TVD. It is symmetric, so it does not matter
    # which of the two group columns comes first or what they are called.
    return (distr.iloc[:, 0] - distr.iloc[:, 1]).abs().sum() / 2

# Same result as above.
# Note that this rebinds observed_tvd, which earlier held the TVD from the
# ethnicity example.
observed_tvd = tvd_of_groups(df=couples, groups='mar_status', cats='empl_status')
observed_tvd

couples.head()

# One shuffle: the marital statuses are randomly reordered and attached
# as a new column, leaving the original columns untouched.
couples.assign(
    shuffled_mar=np.random.permutation(couples['mar_status'])
)

# Number of shuffles in the permutation test.
N = 1000
tvds = []

for _ in range(N):
    # Shuffle marital statuses.
    with_shuffled = couples.assign(shuffled_mar=np.random.permutation(couples['mar_status']))

    # Compute and store the TVD. The result is deliberately not bound to
    # the name `tvd`, which would overwrite the `tvd` function defined
    # earlier in this file.
    shuffled_tvd = tvd_of_groups(with_shuffled, groups='shuffled_mar', cats='empl_status')
    tvds.append(shuffled_tvd)

# Wrap the list in a DataFrame (its single column is named 0), as the other
# histograms of simulated statistics in this file do.
fig = px.histogram(pd.DataFrame(tvds), x=0, nbins=50, histnorm='probability',
                   title='Empirical Distribution of the TVD')
fig.update_layout(xaxis_range=[0, 0.2])

Lecture 6 – EDA Part 2, Hypothesis Testing

DSC 80, Spring 2025

Agenda 📆

Hypothesis Testing

Why are we learning hypothesis testing again?

Data scope

Where are we in the data science lifecycle?

Data scope

Example: Wikipedia awards

Example: Who will win the election?

🔑 Key Idea: Random samples look like the access frame they were sampled from!

Sampling in practice

Overview of hypothesis testing

What problem does hypothesis testing solve?

Why hypothesis testing is difficult to learn

The hypothesis testing "recipe"

Question 🤔 (Answer at dsc80.com/q)

Example: Total variation distance

Ethnic distribution of California vs. UCSD

Is the difference between the two distributions significant?

Total variation distance

The plan

Generating one random sample

Generating many random samples and computing TVDs, without a for-loop

Visualizing the empirical distribution of the test statistic

Conclusion

Summary of the method

Aside

Permutation testing

Hypothesis testing vs. permutation testing

Hypothesis testing vs. permutation testing

Example: Birth weight and smoking 🚬

Exploratory data analysis

Visualizing birth weight distributions

Null hypothesis: birth weights come from the same distribution

Alternative hypothesis: birth weights come from different distributions

Choosing a test statistic

Difference in group means

Hypothesis test setup

Implications of the null hypothesis

Permutation tests

Shuffling

How close are the means of the shuffled groups?

Simulating the empirical distribution of the test statistic

Conclusion of the test

⚠️ Caution!

Hypothesis testing vs. permutation testing

Question 🤔 (Answer at dsc80.com/q)

Question 🤔 (Answer at dsc80.com/q)

Permutation testing meets TVD

Note: This section has another hypothesis testing example. We might not have time to cover the example in lecture, but you should understand it. You can also watch this podcast, starting from 4:43, for a walkthrough.

Example: Married vs. unmarried couples

Cleaning the dataset

Understanding the couples dataset

Understanding employment status in households

Differences in the distributions

Permutation test for household composition

Total variation distance

Simulation

Conclusion of the test

Summary, next time

Summary

Next time

Generating many random samples and computing TVDs, without a `for`-loop

Note: This section has another hypothesis testing example. We might not have time to cover the example in lecture, but you should understand it.

You can also watch this podcast, starting from 4:43, for a walkthrough.

Understanding the `couples` dataset