import pandas as pd
import numpy as np
import os
import seaborn as sns
import plotly.express as px
pd.options.plotting.backend = 'plotly'
Consider the `penguins` dataset from a few lectures ago.
# Load the 'penguins' example dataset through seaborn and discard every row
# that contains a missing value.
penguins = (
    sns.load_dataset('penguins')
    .dropna()
)
# Preview the first few rows of the cleaned table.
penguins.head()
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
5 | Adelie | Torgersen | 39.3 | 20.6 | 190.0 | 3650.0 | Male |
# For each island, report the mean bill length and the number of penguins
# contributing to that mean.
(
    penguins
    .groupby('island')['bill_length_mm']
    .agg(['mean', 'count'])
)
mean | count | |
---|---|---|
island | ||
Biscoe | 45.248466 | 163 |
Dream | 44.221951 | 123 |
Torgersen | 39.038298 | 47 |
It appears that penguins on Torgersen Island have shorter bills on average than penguins on other islands.
Again, while you could do this with a `for`-loop (and you can use a `for`-loop for hypothesis tests in labs and projects), we'll use the faster `size` approach here.
Instead of using `np.random.multinomial`, which samples from a categorical distribution, we'll use `np.random.choice`, which samples from a known sequence of values.
# Two samples of 47 bill lengths each, drawn from penguins['bill_length_mm']
# and returned as the two rows of a 2 x 47 array.
# Question: Why must the sampling be done with replacement here (or, more
# specifically, in the next cell)?
np.random.choice(
    penguins['bill_length_mm'],
    size=(2, 47),
)
array([[39. , 43.8, 51.5, 52. , 37.6, 50.5, 41.6, 36.2, 51. , 37.6, 36.4, 37.2, 52. , 43.2, 39.2, 38.5, 40.3, 42.7, 50.2, 48.5, 49.5, 41.1, 45.2, 46.1, 35.5, 37.6, 52.5, 49.5, 39.6, 51.5, 50.4, 40.5, 44. , 44.5, 45.1, 50.2, 45.2, 41.4, 48.8, 40.5, 50.6, 50. , 40.8, 39.2, 47.2, 51.3, 46.8], [42.5, 44.5, 36.6, 45.1, 54.2, 49.4, 36.2, 37.9, 45.2, 49.3, 38.1, 36.4, 52.2, 39.7, 37.7, 44.9, 52. , 37. , 50.8, 45.1, 47.5, 42.3, 41.3, 46.8, 42.3, 41. , 55.8, 42.2, 38.8, 48.2, 34.6, 52.7, 39. , 45.4, 41.1, 39.8, 36.4, 46.4, 40.9, 37.7, 37.2, 45.5, 42.8, 37.8, 46.4, 52. , 45. ]])
# Size of each simulated sample: the number of Torgersen penguins (47 in this
# dataset), derived from the data rather than hard-coded so that the simulated
# averages stay comparable to the observed Torgersen mean.
sample_size = int((penguins['island'] == 'Torgersen').sum())

# Draws 100000 samples of that size from penguins['bill_length_mm'].
# Each row of the (num_reps, sample_size) array is one sample; .mean(axis=1)
# collapses every row to its average, giving one simulated statistic per sample.
num_reps = 100_000
averages = np.random.choice(penguins['bill_length_mm'], size=(num_reps, sample_size)).mean(axis=1)
averages
array([43.9893617 , 43.31914894, 44.25319149, ..., 43.76382979, 43.5787234 , 43.04680851])
# Observed statistic: the mean bill length among penguins on Torgersen Island.
is_torgersen = penguins['island'] == 'Torgersen'
torgersen_mean = penguins.loc[is_torgersen, 'bill_length_mm'].mean()

# Histogram of the simulated sample averages, normalised to proportions.
fig = px.histogram(
    pd.DataFrame(averages),
    x=0,
    nbins=50,
    histnorm='probability',
    title='Empirical Distribution of the Average Bill Length in Samples of Size 47',
)
# Mark the observed Torgersen mean with a red vertical line for comparison
# against the simulated distribution.
fig.add_vline(x=torgersen_mean, line_color='red')