from dsc80_utils import *

from lec08_utils import *

# Load the heights data, keeping only the four columns used below
# ('childHeight' is renamed to 'child').
heights_path = Path('data') / 'midparent.csv'
heights = pd.read_csv(heights_path).rename(columns={'childHeight': 'child'})[['father', 'mother', 'gender', 'child']]
heights.head()

np.random.seed(42) # So that we get the same results each time (for lecture).

# Simulate MCAR missingness on a copy: `sample` picks 30% of the rows without
# looking at any column's values, and 'child' is blanked out in those rows.
heights_mcar = heights.copy()
idx = heights_mcar.sample(frac=0.3).index
heights_mcar.loc[idx, 'child'] = np.nan

heights_mcar.head(10)

# Proportion of missing values in each column.
heights_mcar.isna().mean()

father    0.0
mother    0.0
gender    0.0
child     0.3
dtype: float64

# Boolean indicator column: True where 'child' is missing.
heights_mcar['child_missing'] = heights_mcar['child'].isna()
heights_mcar.head()

# Count the rows for each (gender, child_missing) combination.
gender_dist = (
    heights_mcar
    .assign(child_missing=heights_mcar['child'].isna())
    .pivot_table(index='gender', columns='child_missing', aggfunc='size')
)

# Added just to make the resulting pivot table easier to read.
# Requires the pivot to have exactly two columns (False, then True).
gender_dist.columns = ['child_missing = False', 'child_missing = True']

# Divide each column by its total, turning counts into the distribution of
# gender within the non-missing rows and within the missing rows.
gender_dist = gender_dist / gender_dist.sum()
gender_dist

# NOTE(review): `barmode` is a plotly option, so this presumably relies on the
# pandas plotting backend being set to plotly by dsc80_utils — confirm.
gender_dist.plot(kind='barh', title='Gender by Missingness of Child Height (MCAR Example)', barmode='group')

# create_kde_plotly is defined outside this file; presumably it overlays the
# density of the given column for child_missing == True vs. False — confirm.
create_kde_plotly(heights_mcar, 'child_missing', True, False, 'father', 
                  "Father's Height by Missingness of Child Height (MCAR Example)")

create_kde_plotly(heights_mcar, 'child_missing', True, False, 'mother', 
                  "Mother's Height by Missingness of Child Height (MCAR Example)")

np.random.seed(42) # So that we get the same results each time (for lecture).

def make_missing(r):
    """Return the row's 'child' value, or NaN if the row is randomly hidden.

    One uniform draw on [0, 1) is made per call. The value is hidden when
    the father is taller than 72 and the draw is below 0.5, or otherwise
    when the gender is 'female' and the draw is below 0.3.

    Parameters
    ----------
    r : mapping with keys 'father', 'gender' and 'child'
        A single row, e.g. as passed by ``DataFrame.apply(..., axis=1)``.
    """
    draw = np.random.uniform()
    # `or` short-circuits, so 'gender' is only consulted when the
    # tall-father rule did not already hide the value.
    hide = (r['father'] > 72 and draw < 0.5) or (r['gender'] == 'female' and draw < 0.3)
    return np.nan if hide else r['child']
    
# Simulate MAR missingness on a copy: make_missing decides row by row, based
# on 'father' and 'gender', whether to hide the child's height.
heights_mar = heights.copy()
heights_mar['child'] = heights_mar.apply(make_missing, axis=1)
heights_mar['child_missing'] = heights_mar['child'].isna()

heights_mar.head()

# Count the rows for each (gender, child_missing) combination.
gender_dist = (
    heights_mar
    .assign(child_missing=heights_mar['child'].isna())
    .pivot_table(index='gender', columns='child_missing', aggfunc='size')
)

# Added just to make the resulting pivot table easier to read.
# Requires the pivot to have exactly two columns (False, then True).
gender_dist.columns = ['child_missing = False', 'child_missing = True']

# Divide each column by its total, turning counts into the distribution of
# gender within the non-missing rows and within the missing rows.
gender_dist = gender_dist / gender_dist.sum()
gender_dist

# NOTE(review): `barmode` is a plotly option, so this presumably relies on the
# pandas plotting backend being set to plotly by dsc80_utils — confirm.
gender_dist.plot(kind='barh', title='Gender by Missingness of Child Height (MAR Example)', barmode='group')

create_kde_plotly(heights_mar, 'child_missing', True, False, 'father', 
                  "Father's Height by Missingness of Child Height (MAR Example)")

# Difference in mean father height between the two missingness groups.
# groupby sorts its keys (False, True), so the last entry of `diff` is
# mean(child missing) - mean(child not missing).
(
    heights_mar
    .groupby('child_missing')
    ['father']
    .mean()
    .diff()
    .iloc[-1]
)

np.float64(1.0055466604787853)

np.random.seed(42) # So that we get the same results each time (for lecture).

N = 1000 # Total number of samples; each distribution below gets N // 2.

# Distribution 'A'.
distr1 = pd.Series(np.random.normal(0, 1, size=N // 2))

# Distribution 'B'.
distr2 = pd.Series(np.random.normal(3, 1, size=N // 2))

# Reshape to long format: one row per sample, with its label ('A' or 'B') in
# 'group' and its value in 'data'.
data = pd.concat([distr1, distr2], axis=1, keys=['A', 'B']).unstack().reset_index().drop('level_1', axis=1)
data = data.rename(columns={'level_0': 'group', 0: 'data'})

# groupby sorts its keys, so the means come back in the order A, B.
meanA, meanB = data.groupby('group')['data'].mean().round(7).tolist()
create_kde_plotly(data, 'group', 'A', 'B', 'data', f'mean of A: {meanA}<br>mean of B: {meanB}')

np.random.seed(42) # So that we get the same results each time (for lecture).

N = 1000 # Number of samples for each distribution.

# Distribution 'A': N // 2 draws centered at 0 stacked with N // 2 draws
# centered at 4 (N values in total).
a = pd.Series(np.random.normal(0, 1, size=N//2))
b = pd.Series(np.random.normal(4, 1, size=N//2))
distr1 = pd.concat([a,b], ignore_index=True)

# Distribution 'B': a single normal that uses A's sample mean and sample
# standard deviation as its parameters.
distr2 = pd.Series(np.random.normal(distr1.mean(), distr1.std(), size=N))

# Reshape to long format: one row per sample, with its label ('A' or 'B') in
# 'group' and its value in 'data'.
data = pd.concat([distr1, distr2], axis=1, keys=['A', 'B']).unstack().reset_index().drop('level_1', axis=1)
data = data.rename(columns={'level_0': 'group', 0: 'data'})

# groupby sorts its keys, so the means come back in the order A, B.
meanA, meanB = data.groupby('group')['data'].mean().round(7).tolist()
create_kde_plotly(data, 'group', 'A', 'B', 'data', f'mean of A: {meanA}<br>mean of B: {meanB}')

# Permutation test using the absolute difference in group means as the test
# statistic. No seed is set here, so results depend on the current state of
# NumPy's global random generator.
n_repetitions = 500
shuffled = data.copy()

diff_means = []
for _ in range(n_repetitions):
    
    # Shuffling the values, while keeping the group labels in place.
    shuffled['data'] = np.random.permutation(shuffled['data'])
    
    # Computing and storing the absolute difference in means.
    diff_mean = shuffled.groupby('group')['data'].mean().diff().abs().iloc[-1]
    diff_means.append(diff_mean)

# The same statistic, computed on the unshuffled data.
observed_diff = data.groupby('group')['data'].mean().diff().abs().iloc[-1]
fig = px.histogram(pd.DataFrame(diff_means), x=0, nbins=50, histnorm='probability', 
                   title='Empirical Distribution of the Absolute Difference in Means')
fig.add_vline(x=observed_diff, line_color='red', line_width=1, opacity=1)
fig.add_annotation(text=f'<span style="color:red">Observed Absolute Difference in Means = {round(observed_diff, 2)}</span>',
                   x=2 * observed_diff, showarrow=False, y=0.07)

# p-value: proportion of simulated statistics at least as large as the
# observed one.
# The computed p-value is fairly large.
np.mean(np.array(diff_means) >= observed_diff)

# Show the two distributions again (displayed as the cell's output).
create_kde_plotly(data, 'group', 'A', 'B', 'data', f'mean of A: {meanA}<br>mean of B: {meanB}')

# Keep the returned figure so its traces can be reused in a later subplot.
fig1 = create_kde_plotly(data, 'group', 'A', 'B', 'data', f'Distributions of A and B')

# Think about what this function is doing!
def create_cdf(group):
    """Return the empirical CDF of the 'data' values belonging to `group`.

    Reads the global DataFrame `data`. The result is a Series indexed by the
    distinct values in ascending order; each entry is the proportion of the
    group's values that are less than or equal to that index value.
    """
    in_group = data['group'] == group
    proportions = data.loc[in_group, 'data'].value_counts(normalize=True)
    return proportions.sort_index().cumsum()

# Line plot of the two empirical CDFs.
fig2 = go.Figure()

fig2.add_trace(
    go.Scatter(x=create_cdf('A').index, y=create_cdf('A'), name='CDF of A')
)

fig2.add_trace(
    go.Scatter(x=create_cdf('B').index, y=create_cdf('B'), name='CDF of B')
)

fig2.update_layout(title='CDFs of A and B')

from plotly.subplots import make_subplots

# Give each CDF trace the marker color of the matching trace in fig1 and hide
# its legend entry. Assumes fig1's first two traces are A and B, in that
# order — TODO confirm against create_kde_plotly.
for i in range(2):
    fig2.data[i]['marker']['color'] = fig1.data[i]['marker']['color']
    fig2.data[i]['showlegend'] = False
    
# Side by side: densities on the left, CDFs on the right.
fig = make_subplots(rows=1, cols=2, subplot_titles=['Distributions', 'CDFs'])
fig.add_trace(fig1.data[0], row=1, col=1)
fig.add_trace(fig1.data[1], row=1, col=1)
fig.add_trace(fig2.data[0], row=1, col=2)
fig.add_trace(fig2.data[1], row=1, col=2)
# The trailing semicolon keeps the notebook from echoing this call's result.
fig.update_layout(width=1000, height=400);

fig

from scipy.stats import ks_2samp

ks_2samp?

Signature:
ks_2samp(
    data1,
    data2,
    alternative='two-sided',
    method='auto',
    *,
    axis=0,
    nan_policy='propagate',
    keepdims=False,
)
Docstring:
Performs the two-sample Kolmogorov-Smirnov test for goodness of fit.

This test compares the underlying continuous distributions F(x) and G(x)
of two independent samples.  See Notes for a description of the available
null and alternative hypotheses.

Parameters
----------
data1, data2 : array_like, 1-Dimensional
    Two arrays of sample observations assumed to be drawn from a continuous
    distribution, sample sizes can be different.
alternative : {'two-sided', 'less', 'greater'}, optional
    Defines the null and alternative hypotheses. Default is 'two-sided'.
    Please see explanations in the Notes below.
method : {'auto', 'exact', 'asymp'}, optional
    Defines the method used for calculating the p-value.
    The following options are available (default is 'auto'):
    
      * 'auto' : use 'exact' for small size arrays, 'asymp' for large
      * 'exact' : use exact distribution of test statistic
      * 'asymp' : use asymptotic distribution of test statistic
axis : int or None, default: 0
    If an int, the axis of the input along which to compute the statistic.
    The statistic of each axis-slice (e.g. row) of the input will appear in a
    corresponding element of the output.
    If ``None``, the input will be raveled before computing the statistic.
nan_policy : {'propagate', 'omit', 'raise'}
    Defines how to handle input NaNs.
    
    - ``propagate``: if a NaN is present in the axis slice (e.g. row) along
      which the  statistic is computed, the corresponding entry of the output
      will be NaN.
    - ``omit``: NaNs will be omitted when performing the calculation.
      If insufficient data remains in the axis slice along which the
      statistic is computed, the corresponding entry of the output will be
      NaN.
    - ``raise``: if a NaN is present, a ``ValueError`` will be raised.
keepdims : bool, default: False
    If this is set to True, the axes which are reduced are left
    in the result as dimensions with size one. With this option,
    the result will broadcast correctly against the input array.

Returns
-------
res: KstestResult
    An object containing attributes:
    
    statistic : float
        KS test statistic.
    pvalue : float
        One-tailed or two-tailed p-value.
    statistic_location : float
        Value from `data1` or `data2` corresponding with the KS statistic;
        i.e., the distance between the empirical distribution functions is
        measured at this observation.
    statistic_sign : int
        +1 if the empirical distribution function of `data1` exceeds
        the empirical distribution function of `data2` at
        `statistic_location`, otherwise -1.

See Also
--------

:func:`kstest`, :func:`ks_1samp`, :func:`epps_singleton_2samp`, :func:`anderson_ksamp`
    ..

Notes
-----
There are three options for the null and corresponding alternative
hypothesis that can be selected using the `alternative` parameter.

- `less`: The null hypothesis is that F(x) >= G(x) for all x; the
  alternative is that F(x) < G(x) for at least one x. The statistic
  is the magnitude of the minimum (most negative) difference between the
  empirical distribution functions of the samples.

- `greater`: The null hypothesis is that F(x) <= G(x) for all x; the
  alternative is that F(x) > G(x) for at least one x. The statistic
  is the maximum (most positive) difference between the empirical
  distribution functions of the samples.

- `two-sided`: The null hypothesis is that the two distributions are
  identical, F(x)=G(x) for all x; the alternative is that they are not
  identical. The statistic is the maximum absolute difference between the
  empirical distribution functions of the samples.

Note that the alternative hypotheses describe the *CDFs* of the
underlying distributions, not the observed values of the data. For example,
suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in
x1 tend to be less than those in x2.

If the KS statistic is large, then the p-value will be small, and this may
be taken as evidence against the null hypothesis in favor of the
alternative.

If ``method='exact'``, `ks_2samp` attempts to compute an exact p-value,
that is, the probability under the null hypothesis of obtaining a test
statistic value as extreme as the value computed from the data.
If ``method='asymp'``, the asymptotic Kolmogorov-Smirnov distribution is
used to compute an approximate p-value.
If ``method='auto'``, an exact p-value computation is attempted if both
sample sizes are less than 10000; otherwise, the asymptotic method is used.
In any case, if an exact p-value calculation is attempted and fails, a
warning will be emitted, and the asymptotic p-value will be returned.

The 'two-sided' 'exact' computation computes the complementary probability
and then subtracts from 1.  As such, the minimum probability it can return
is about 1e-16.  While the algorithm itself is exact, numerical
errors may accumulate for large sample sizes.   It is most suited to
situations in which one of the sample sizes is only a few thousand.

We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk [1]_.

Beginning in SciPy 1.9, ``np.matrix`` inputs (not recommended for new
code) are converted to ``np.ndarray`` before the calculation is performed. In
this case, the output will be a scalar or ``np.ndarray`` of appropriate shape
rather than a 2D ``np.matrix``. Similarly, while masked elements of masked
arrays are ignored, the output will be a scalar or ``np.ndarray`` rather than a
masked array with ``mask=False``.

References
----------
.. [1] Hodges, J.L. Jr.,  "The Significance Probability of the Smirnov
       Two-Sample Test," Arkiv fiur Matematik, 3, No. 43 (1958), 469-486.

Examples
--------
Suppose we wish to test the null hypothesis that two samples were drawn
from the same distribution.
We choose a confidence level of 95%; that is, we will reject the null
hypothesis in favor of the alternative if the p-value is less than 0.05.

If the first sample were drawn from a uniform distribution and the second
were drawn from the standard normal, we would expect the null hypothesis
to be rejected.

>>> import numpy as np
>>> from scipy import stats
>>> rng = np.random.default_rng()
>>> sample1 = stats.uniform.rvs(size=100, random_state=rng)
>>> sample2 = stats.norm.rvs(size=110, random_state=rng)
>>> stats.ks_2samp(sample1, sample2)
KstestResult(statistic=0.5454545454545454,
             pvalue=7.37417839555191e-15,
             statistic_location=-0.014071496412861274,
             statistic_sign=-1)

Indeed, the p-value is lower than our threshold of 0.05, so we reject the
null hypothesis in favor of the default "two-sided" alternative: the data
were *not* drawn from the same distribution.

When both samples are drawn from the same distribution, we expect the data
to be consistent with the null hypothesis most of the time.

>>> sample1 = stats.norm.rvs(size=105, random_state=rng)
>>> sample2 = stats.norm.rvs(size=95, random_state=rng)
>>> stats.ks_2samp(sample1, sample2)
KstestResult(statistic=0.10927318295739348,
             pvalue=0.5438289009927495,
             statistic_location=-0.1670157701848795,
             statistic_sign=-1)

As expected, the p-value of 0.54 is not below our threshold of 0.05, so
we cannot reject the null hypothesis.

Suppose, however, that the first sample were drawn from
a normal distribution shifted toward greater values. In this case,
the cumulative density function (CDF) of the underlying distribution tends
to be *less* than the CDF underlying the second sample. Therefore, we would
expect the null hypothesis to be rejected with ``alternative='less'``:

>>> sample1 = stats.norm.rvs(size=105, loc=0.5, random_state=rng)
>>> stats.ks_2samp(sample1, sample2, alternative='less')
KstestResult(statistic=0.4055137844611529,
             pvalue=3.5474563068855554e-08,
             statistic_location=-0.13249370614972575,
             statistic_sign=-1)

and indeed, with p-value smaller than our threshold, we reject the null
hypothesis in favor of the alternative.
File:      ~/miniconda3/envs/dsc80/lib/python3.12/site-packages/scipy/stats/_stats_py.py
Type:      function

# Observed K-S statistic between group A's and group B's values (two-sided
# by default: the maximum absolute difference between the empirical CDFs).
observed_ks = ks_2samp(data.loc[data['group'] == 'A', 'data'], data.loc[data['group'] == 'B', 'data']).statistic
observed_ks

np.float64(0.14)

# Permutation test using the K-S statistic as the test statistic. No seed is
# set here, so results depend on the current state of NumPy's global random
# generator.
n_repetitions = 500
shuffled = data.copy()

ks_stats = []
for _ in range(n_repetitions):
    
    # Shuffling the data.
    shuffled['data'] = np.random.permutation(shuffled['data'])
    
    # Computing and storing the K-S statistic.
    groups = shuffled.groupby('group')['data']
    ks_stat = ks_2samp(groups.get_group('A'), groups.get_group('B')).statistic
    ks_stats.append(ks_stat)
    
# Peek at the first few simulated statistics.
ks_stats[:10]

[np.float64(0.037),
 np.float64(0.048),
 np.float64(0.04),
 np.float64(0.068),
 np.float64(0.045),
 np.float64(0.04),
 np.float64(0.042),
 np.float64(0.052),
 np.float64(0.019),
 np.float64(0.029)]

# Empirical distribution of the simulated K-S statistics, with the observed
# statistic marked by a red vertical line.
fig = px.histogram(pd.DataFrame(ks_stats), x=0, nbins=50, histnorm='probability', 
                   title='Empirical Distribution of the K-S Statistic')
fig.add_vline(x=observed_ks, line_color='red', line_width=1, opacity=1)
fig.add_annotation(text=f'<span style="color:red">Observed KS = {round(observed_ks, 2)}</span>',
                   x=0.8 * observed_ks, showarrow=False, y=0.16)

# Fixed axis ranges, hard-coded for this example's values.
fig.update_layout(xaxis_range=[0, 0.2])
fig.update_layout(yaxis_range=[0, 0.2])

# p-value: proportion of simulated statistics at least as large as the
# observed one.
np.mean(np.array(ks_stats) >= observed_ks)

np.float64(0.0)

# Full ks_2samp result, including the p-value scipy computes itself (no
# permutation loop needed).
ks_2samp(data.loc[data['group'] == 'A', 'data'], data.loc[data['group'] == 'B', 'data'])

KstestResult(statistic=np.float64(0.14), pvalue=np.float64(5.822752148022591e-09), statistic_location=np.float64(0.9755451271223592), statistic_sign=np.int8(1))

# Recompute the missingness indicator and compare the distribution of father
# heights between rows where 'child' is missing and rows where it is not.
heights_mar['child_missing'] = heights_mar['child'].isna()
create_kde_plotly(heights_mar[['child_missing', 'father']], 'child_missing', True, False, 'father',
                       "Father's Height by Missingness of Child Height (MAR example)")

heights_mar

# K-S test: father heights where 'child' is missing vs. where it is present.
ks_2samp(heights_mar.query('child_missing')['father'], heights_mar.query('not child_missing')['father'])

KstestResult(statistic=np.float64(0.20676025834396874), pvalue=np.float64(1.1424922868036869e-05), statistic_location=np.float64(72.0), statistic_sign=np.int8(-1))

np.random.seed(42) # So that we get the same results each time (for lecture).
# make_mcar and make_mar_on_cat are defined outside this file.
# NOTE(review): presumably each returns a copy of `heights` with roughly `pct`
# of 'child' set to NaN (for MAR, depending on 'gender') — confirm in the
# helper module. These assignments replace the earlier heights_mcar /
# heights_mar frames.
heights_mcar = make_mcar(heights, 'child', pct=0.5)
heights_mar = make_mar_on_cat(heights, 'child', 'gender', pct=0.5)

multiple_describe({
    'Original': heights,
    'MCAR': heights_mcar,
    'MAR': heights_mar
})

# multiple_kdes is not defined in this file; presumably it comes from one of
# the helper modules star-imported at the top (dsc80_utils / lec08_utils).
multiple_kdes({'Original': heights, 'MCAR, Unfilled': heights_mcar})

heights_mcar['child'].head()

0    73.2
1    69.2
2     NaN
3     NaN
4    73.5
Name: child, dtype: float64

# Mean imputation: replace missing 'child' values with the mean of the
# observed ones. The dict restricts the fill to the 'child' column; a bare
# scalar would also overwrite NaNs in every other column with the mean child
# height.
heights_mcar_mfilled = heights_mcar.fillna({'child': heights_mcar['child'].mean()})
heights_mcar_mfilled['child'].head()

0    73.20
1    69.20
2    66.64
3    66.64
4    73.50
Name: child, dtype: float64

# Compare the original data, the MCAR data, and its mean-imputed version.
df_map = {'Original': heights, 'MCAR, Unfilled': heights_mcar, 'MCAR, Mean Imputed': heights_mcar_mfilled}
multiple_describe(df_map)

multiple_kdes(df_map)

# Same comparison, starting with the MAR data before any imputation.
multiple_kdes({'Original': heights, 'MAR, Unfilled': heights_mar})

heights_mar['child'].head()

# Mean imputation: replace missing 'child' values with the mean of the
# observed ones. The dict restricts the fill to the 'child' column; a bare
# scalar would also overwrite NaNs in every other column with the mean child
# height.
heights_mar_mfilled = heights_mar.fillna({'child': heights_mar['child'].mean()})
heights_mar_mfilled['child'].head()

0    73.20
1    69.20
2    68.52
3    68.52
4    73.50
Name: child, dtype: float64

# Compare the original data, the MAR data, and its mean-imputed version.
df_map = {'Original': heights, 'MAR, Unfilled': heights_mar, 'MAR, Mean Imputed': heights_mar_mfilled}
multiple_describe(df_map)

multiple_kdes(df_map)

# Mean child height per gender in each dataset: one row per dataset, one
# column per gender.
pd.concat([
    heights.groupby('gender')['child'].mean().rename('Original'),
    heights_mar.groupby('gender')['child'].mean().rename('MAR, Unfilled'),
    heights_mar_mfilled.groupby('gender')['child'].mean().rename('MAR, Mean Imputed')
], axis=1).T

def mean_impute(s):
    """Return a copy of Series `s` with missing values replaced by its mean.

    The mean is computed from the non-missing values of `s` itself, so when
    this is used with ``groupby(...).transform`` each group is filled with
    its own mean.
    """
    observed_mean = s.mean()
    return s.fillna(observed_mean)

# Conditional mean imputation: fill each missing 'child' value with the mean
# of the observed 'child' values for the same gender. `to_frame` turns the
# result into a DataFrame whose only column is 'child'.
heights_mar_cond = heights_mar.groupby('gender')['child'].transform(mean_impute).to_frame()
heights_mar_cond['child'].head()

0    73.20
1    69.20
2    64.22
3    64.22
4    73.50
Name: child, dtype: float64

# Add the conditionally mean-imputed data to the comparison and redraw.
df_map['MAR, Conditional Mean Imputed'] = heights_mar_cond
multiple_kdes(df_map)

def prob_impute(s):
    """Fill missing values in `s` by sampling from its observed values.

    Each missing entry is replaced by a value drawn at random, with
    replacement, from the non-missing entries of `s`. Sampling uses
    ``np.random.choice``, so results depend on NumPy's global random state.

    Parameters
    ----------
    s : pd.Series
        Series that may contain missing values. It is not modified.

    Returns
    -------
    pd.Series
        A copy of `s` with its missing entries filled in. If `s` has missing
        entries but no observed values to sample from, the copy is returned
        with those entries still missing.
    """
    s = s.copy()

    # Step 1: Find which values are missing, and how many there are.
    is_missing = s.isna()
    num_null = is_missing.sum()

    # Step 2: Sample num_null of the observed values. np.random.choice
    # raises ValueError when asked to draw from an empty pool, so a Series
    # with nothing observed is returned as-is instead.
    observed = s.dropna()
    if num_null > 0 and observed.empty:
        return s
    fill_values = np.random.choice(observed, num_null)

    # Step 3: Fill in missing values and return s.
    s[is_missing] = fill_values
    return s

# Conditional probabilistic imputation: within each gender, fill missing
# 'child' values with values sampled from that gender's observed heights.
# No seed is set here, so the filled values change from run to run.
heights_mar_pfilled = heights_mar.copy()
heights_mar_pfilled['child'] = (
    heights_mar
    .groupby('gender')
    ['child']
    .transform(prob_impute)
)
heights_mar_pfilled['child'].head()

0    73.2
1    69.2
2    62.0
3    62.5
4    73.5
Name: child, dtype: float64

# Add the probabilistically imputed data to the comparison and redraw.
df_map['MAR, Conditionally Probabilistically Imputed'] = heights_mar_pfilled
multiple_kdes(df_map)

# NOTE(review): `df`, 'c1' and 'c2' are not defined anywhere in the visible
# source; this looks like a stand-alone discussion/quiz snippet rather than
# part of the runnable notebook — confirm before executing it.
# `means` is a nested dict of the form {column: {c2 value: group mean}}.
means = df.groupby('c2').mean().to_dict()
# For a missing entry, the lambda looks up `means[x]` with x being the missing
# value itself (NaN), not the row's 'c2' group.
imputed = df['c1'].apply(lambda x: means[x] if np.isnan(x) else x)

	father	mother	gender	child
0	78.5	67.0	male	73.2
1	78.5	67.0	female	69.2
2	78.5	67.0	female	NaN
...	...	...	...	...
7	75.5	66.5	female	NaN
8	75.0	64.0	male	71.0
9	75.0	64.0	female	68.0

	Mean	Standard Deviation
Dataset
Original	66.75	3.58
MCAR, Unfilled	66.64	3.56
MCAR, Mean Imputed	66.64	2.52

gender	female	male
Original	64.10	69.23
MAR, Unfilled	64.22	69.28
MAR, Mean Imputed	67.85	69.14

	child_missing = False	child_missing = True
gender
female	0.49	0.48
male	0.51	0.52

	child_missing = False	child_missing = True
gender
female	0.4	0.88
male	0.6	0.12

Lecture 8 – Imputation¶

DSC 80, Winter 2025¶

Announcements 📣¶

Agenda 📆¶

Review: Missingness mechanisms¶

Flowchart¶

Question 🤔 (Answer at dsc80.com/q)

Question 🤔 (Answer at dsc80.com/q)

Question 🤔 (Answer at dsc80.com/q)

Identifying missingness mechanisms in data¶

Example: Heights¶

Simulating MCAR data¶

Verifying that child heights are MCAR in heights_mcar¶

Concluding that 'child' is MCAR¶

Simulating MAR data¶

Comparing null and non-null 'child' distributions for 'gender', again¶

Comparing null and non-null 'child' distributions for 'father', again¶

The Kolmogorov-Smirnov test statistic¶

Recap: Permutation tests¶

Difference in means¶

Different distributions with the same mean¶

Telling numerical distributions apart¶

The Kolmogorov-Smirnov test statistic¶

Aside: Cumulative distribution functions¶

Aside: Cumulative distribution functions¶

The K-S statistic in Python¶

ks_2samp¶

Difference in means vs. K-S statistic¶

Back to our Example: Missingness of 'child' heights on 'father''s heights (MAR)¶

Performing the test¶

Handling missing values¶

What do we do with missing data?¶

Solution 1: Dropping missing values¶

Listwise deletion¶

Listwise deletion¶

Solution 2: Imputation¶

Kinds of imputation¶

Mean imputation¶

Mean imputation¶

Example: Mean imputation in the MCAR heights dataset¶

Mean imputation of MCAR data¶

Mean imputation of MCAR data¶

Example: Mean imputation in the MAR heights dataset¶

Mean imputation of MAR data¶

Mean imputation of MAR data¶

Within-group (conditional) mean imputation¶

transform returns!¶

Conclusion: Imputation with single values¶

Probabilistic imputation¶

Imputing missing values using distributions¶

Example: Probabilistic imputation in the MAR heights dataset¶

Observations¶

Randomness¶

Question 🤔 (Answer at dsc80.com/q)

Summary, next time¶

Summary of imputation techniques¶

Summary: Listwise deletion¶

Summary: Mean imputation¶

Summary: Conditional mean imputation¶

Summary: Probabilistic imputation¶

Summary: Multiple imputation¶

Next time¶

Question 🤔 (Answer at dsc80.com/q)

Verifying that child heights are MCAR in `heights_mcar`¶

Concluding that `'child'` is MCAR¶

Comparing null and non-null `'child'` distributions for `'gender'`, again¶

Comparing null and non-null `'child'` distributions for `'father'`, again¶

`ks_2samp`¶

Back to our Example: Missingness of `'child'` heights on `'father'`'s heights (MAR)¶

Example: Mean imputation in the MCAR `heights` dataset¶

Example: Mean imputation in the MAR `heights` dataset¶

`transform` returns!¶

Example: Probabilistic imputation in the MAR `heights` dataset¶