This notebook serves to provide more examples of how to identify missingness mechanisms through data.
import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly.figure_factory as ff
pd.options.plotting.backend = 'plotly'
from scipy.stats import ks_2samp
# Used for plotting examples.
def create_kde_plotly(df, group_col, group1, group2, vals_col, title=''):
    """Draw KDE curves of `vals_col` for two groups of rows in `df`.

    Parameters
    ----------
    df : DataFrame holding both the grouping column and the values column.
    group_col : name of the column used to split rows into groups.
    group1, group2 : the two values of `group_col` to compare; they are
        also used as the trace labels.
    vals_col : name of the column whose values are plotted.
    title : figure title (defaults to an empty string).

    Returns
    -------
    The result of `fig.update_layout(title=title)` on the figure built by
    `ff.create_distplot` (curves only: rug and histogram are turned off).
    """
    def group_values(group):
        # Drop missing entries so NaNs are not passed into the KDE estimation;
        # columns in this notebook (e.g. 'car_year') do contain NaN.
        return df.loc[df[group_col] == group, vals_col].dropna()

    fig = ff.create_distplot(
        hist_data=[group_values(group1), group_values(group2)],
        group_labels=[group1, group2],
        show_rug=False, show_hist=False,
        # Colors are listed in the same order as the groups.
        colors=['#ef553b', '#636efb'],
    )
    return fig.update_layout(title=title)
Each row of the dataset has a `'vin'` number, `'car_make'`, `'car_year'`, and `'car_color'`.

- Is `'car_color'` missing at random, dependent on `'car_year'`?
- Is the distribution of `'car_year'` similar when color is missing vs. not missing? Let's use a permutation test!
# Read the cars data from the local `data` folder and preview the first rows.
cars_path = os.path.join('data', 'cars.csv')
cars = pd.read_csv(cars_path)
cars.head()
vin | car_make | car_year | car_color | |
---|---|---|---|---|
0 | 3D7TT2CT8BG121773 | Audi | 2008.0 | Teal |
1 | SCBZB25E62C073475 | Audi | 1996.0 | Mauv |
2 | 1FT7W2A69EE682086 | NaN | NaN | Turquoise |
3 | 1B3AZ6JZ7AV582128 | Ford | 2010.0 | Goldenrod |
4 | 1GYUCGEF4AR632425 | Mazda | 1996.0 | Purple |
# Fraction of rows whose 'car_color' is missing: the mean of a boolean mask
# is the proportion of True entries.
color_is_null = cars['car_color'].isnull()
color_is_null.mean()
0.1542
# Flag each row by whether its color is missing; the plotting cell further
# down groups on this 'color_missing' column.
color_missing_flags = pd.isna(cars['car_color'])
cars['color_missing'] = color_missing_flags
cars.head()
vin | car_make | car_year | car_color | color_missing | |
---|---|---|---|---|---|
0 | 3D7TT2CT8BG121773 | Audi | 2008.0 | Teal | False |
1 | SCBZB25E62C073475 | Audi | 1996.0 | Mauv | False |
2 | 1FT7W2A69EE682086 | NaN | NaN | Turquoise | False |
3 | 1B3AZ6JZ7AV582128 | Ford | 2010.0 | Goldenrod | False |
4 | 1GYUCGEF4AR632425 | Mazda | 1996.0 | Purple | False |
# Count cars per (car_year, color_missing) pair; year/flag combinations that
# never occur come back as NaN and are treated as zero counts.
year_counts = cars.pivot_table(index='car_year', columns='color_missing', aggfunc='size')
year_counts_filled = year_counts.fillna(0)

# Normalize each column so it sums to 1, i.e. the distribution of car years
# within the "color missing" group and within the "color present" group.
year_dists = year_counts_filled.apply(lambda col: col / col.sum())

year_dists.plot(title='Distribution of Car Years by Missingness of Color')