import pandas as pd
import numpy as np
import os
import util
import plotly.express as px
import plotly.figure_factory as ff
# Make DataFrame/Series `.plot` calls render with plotly.
pd.options.plotting.backend = 'plotly'

# Load the height data, shorten the child-related column names, and discard
# the 'midparentHeight' column.
midparent_path = os.path.join('data', 'midparent.csv')
heights = pd.read_csv(midparent_path)
heights = heights.rename(columns={'childHeight': 'child', 'childNum': 'number'})
heights = heights.drop(columns='midparentHeight')
heights.head()
| | family | father | mother | children | number | gender | child |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 78.5 | 67.0 | 4 | 1 | male | 73.2 |
| 1 | 1 | 78.5 | 67.0 | 4 | 2 | female | 69.2 |
| 2 | 1 | 78.5 | 67.0 | 4 | 3 | female | 69.0 |
| 3 | 1 | 78.5 | 67.0 | 4 | 4 | female | 69.0 |
| 4 | 2 | 75.5 | 66.5 | 4 | 1 | male | 73.5 |
# Seed NumPy's global RNG before calling the two util helpers below.
# NOTE(review): the helpers presumably draw from np.random, so reordering these
# lines would change which values end up missing -- confirm against util.
np.random.seed(42) # So that we get the same results each time (for lecture).
# NOTE(review): util.make_mcar is not visible here; presumably returns a copy of
# `heights` with a `pct` fraction of 'child' set to NaN completely at random.
heights_mcar = util.make_mcar(heights, 'child', pct=0.5)
# NOTE(review): util.make_mar_on_cat is not visible here; presumably removes
# 'child' values with a probability that depends on 'gender' (i.e. MAR).
heights_mar = util.make_mar_on_cat(heights, 'child', 'gender', pct=0.5)
Suppose the 'child' column has missing values.

- If 'child' is MCAR, then fill in each of the missing values using the mean of the observed values.
- If 'child' is MAR dependent on a categorical column, then fill in each of the missing values using the mean of the observed values in each category. For instance, if 'child' is MAR dependent on 'gender', we can fill in:
    - missing female 'child' heights with the observed mean for female children, and
    - missing male 'child' heights with the observed mean for male children.
- If 'child'
# is MAR dependent on a numerical column, then bin the numerical column to make it categorical, then follow the procedure above. See Lab 5, Question 5!


def mean_impute(ser):
    """Return ``ser`` with its missing values replaced by its observed mean.

    Parameters
    ----------
    ser : pandas.Series
        Numeric Series that may contain missing values (NaN).

    Returns
    -------
    pandas.Series
        A new Series with the same index as ``ser``. Every missing entry is
        replaced by ``ser.mean()``, which skips missing values when averaging.
        ``ser`` itself is not modified.
    """
    return ser.fillna(ser.mean())
# Conditional mean imputation (good, since MAR): within each 'gender' group,
# missing 'child' values are filled with that group's observed mean. The
# result is a one-column DataFrame holding only 'child'.
child_by_gender = heights_mar.groupby('gender')['child']
heights_mar_cond = child_by_gender.transform(mean_impute).to_frame()

# Single mean imputation (bad, since MAR): every NaN in the frame is replaced
# by the overall observed mean of 'child', ignoring 'gender'.
overall_child_mean = heights_mar['child'].mean()
heights_mar_mfilled = heights_mar.fillna(overall_child_mean)

# Label -> DataFrame for each variant to compare.
df_map = {
    'Original': heights,
    'MAR, Unfilled': heights_mar,
    'MAR, Mean Imputed': heights_mar_mfilled,
    'MAR, Conditional Mean Imputed': heights_mar_cond,
}
# NOTE(review): util.multiple_kdes is not visible here; presumably it overlays
# one KDE curve per entry of df_map -- confirm against util.
util.multiple_kdes(df_map)