from dsc80_utils import *

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import seaborn as sns
penguins = sns.load_dataset('penguins').dropna()
penguins

penguins['body_mass_g'].mean()

np.float64(4207.057057057057)

# ???

# Naive approach: visit each unique species, pull out just its rows,
# and record that subset's mean body mass under the species' name.
species_map = pd.Series([], dtype=float)

for species in penguins['species'].unique():
    # Boolean mask that is True only for rows of the current species.
    is_this_species = penguins['species'] == species
    species_only = penguins[is_this_species]
    species_map.loc[species] = species_only['body_mass_g'].mean()

species_map

Adelie       3706.16
Chinstrap    3733.09
Gentoo       5092.44
dtype: float64

# Before:
penguins['body_mass_g'].mean()

np.float64(4207.057057057057)

# After:
penguins.groupby('species')['body_mass_g'].mean()

species
Adelie       3706.16
Chinstrap    3733.09
Gentoo       5092.44
Name: body_mass_g, dtype: float64

# Fill this in, then respond on dsc80.com/q

penguins.groupby('species')['bill_length_mm'].mean()

species
Adelie       38.82
Chinstrap    48.83
Gentoo       47.57
Name: bill_length_mm, dtype: float64

penguins.groupby('species')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x29099d3d0>

# Simplified DataFrame for demonstration:
penguins_small = penguins.iloc[[0, 150, 300, 1, 251, 151, 301], [0, 5, 6]]
penguins_small

# Creates one group for each unique value in the species column.
penguin_groups = penguins_small.groupby('species')
penguin_groups

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x29099f590>

penguin_groups.groups

{'Adelie': [0, 1], 'Chinstrap': [156, 157], 'Gentoo': [308, 258, 309]}

penguin_groups.get_group('Chinstrap')

# Same as the above!
penguins_small.query('species == "Chinstrap"')

penguins_small

penguins_small.groupby('species')['body_mass_g'].mean()

species
Adelie       3775.0
Chinstrap    3837.5
Gentoo       4925.0
Name: body_mass_g, dtype: float64

# Whoa, what happened in the sex column?
penguins_small.groupby('species').sum()

penguins_small.groupby('species').last()

penguins_small.groupby('species').max()

penguins_small.groupby('species').max()

# This penguin is Female!
# The mask is built from penguins_small itself, so the boolean Series has
# exactly the same index as the DataFrame it filters (no reliance on pandas
# aligning a mask taken from the larger penguins DataFrame).
penguins_small.loc[
    (penguins_small['species'] == 'Adelie')
    & (penguins_small['body_mass_g'] == 3800.0)
]

# Your code goes here.

# Back to the big penguins dataset!
penguins

# Works, but involves wasted effort since the other columns had to be aggregated for no reason.
penguins.groupby('species').sum()['bill_length_mm']

# This is a SeriesGroupBy object!
penguins.groupby('species')['bill_length_mm']

# Saves time!
penguins.groupby('species')['bill_length_mm'].sum()

%%timeit
penguins.groupby('species').sum()['bill_length_mm']

%%timeit
penguins.groupby('species')['bill_length_mm'].sum()

%%timeit
species_map = pd.Series([], dtype=float)

for species in penguins['species'].unique():
    species_only = penguins.loc[penguins['species'] == species]
    species_map.loc[species] = species_only['body_mass_g'].mean()

species_map

# Slower
penguins.groupby('species').sum()['bill_length_mm']

# Faster
penguins.groupby('species')['bill_length_mm'].sum()

(penguins
 .groupby('species')
 ['body_mass_g']
 .aggregate(['count', 'mean'])
)

(penguins
 .groupby('species')
 .aggregate({'bill_length_mm': 'max', 'island': 'unique'})
)

# Here, the argument to agg is a function,
# which takes in a pd.Series and returns a scalar.

def iqr(s):
    """Return the interquartile range of ``s``.

    The result is the 75th percentile minus the 25th percentile, each
    computed with ``np.percentile``.
    """
    upper_quartile = np.percentile(s, 75)
    lower_quartile = np.percentile(s, 25)
    return upper_quartile - lower_quartile

(penguins
 .groupby('species')
 ['body_mass_g']
 .agg(iqr)
)

def z_score(x):
    """Standardize ``x``: subtract its mean, then divide by its standard deviation.

    The standard deviation is taken with ``ddof=0`` (the version that
    divides by the number of values rather than one less).
    """
    center = x.mean()
    spread = x.std(ddof=0)
    deviations = x - center
    return deviations / spread

z_score(penguins['body_mass_g'])

0     -0.57
1     -0.51
2     -1.19
       ... 
341    1.92
342    1.23
343    1.48
Name: body_mass_g, Length: 333, dtype: float64

z_mass = (penguins
          .groupby('species')
          ['body_mass_g']
          .transform(z_score))
z_mass

0      0.10
1      0.21
2     -1.00
       ... 
341    1.32
342    0.22
343    0.62
Name: body_mass_g, Length: 333, dtype: float64

penguins.assign(z_mass=z_mass)

display_df(penguins.assign(z_mass=z_mass), rows=8)

penguins.groupby('species')['body_mass_g'].mean()

species
Adelie       3706.16
Chinstrap    3733.09
Gentoo       5092.44
Name: body_mass_g, dtype: float64

(penguins
 .groupby('species')
 .filter(lambda df: df['bill_length_mm'].mean() > 39)
)

(penguins
 .groupby('species')
 .filter(lambda df: df.shape[0] > 100)
)

penguins

species_and_island = (
    penguins
    .groupby(['species', 'island'])
    [['bill_length_mm', 'body_mass_g']]
    .mean()
)
species_and_island

species_and_island

species_and_island['body_mass_g']

species    island   
Adelie     Biscoe       3709.66
           Dream        3701.36
           Torgersen    3708.51
Chinstrap  Dream        3733.09
Gentoo     Biscoe       5092.44
Name: body_mass_g, dtype: float64

species_and_island.loc['Adelie']

species_and_island.loc[('Adelie', 'Torgersen')]

bill_length_mm      39.04
body_mass_g       3708.51
Name: (Adelie, Torgersen), dtype: float64

species_and_island.reset_index()

(penguins
 .groupby(['species', 'island'], as_index=False)
 [['bill_length_mm', 'body_mass_g']]
 .mean()
)

baby_path = Path('data') / 'baby.csv'
baby = pd.read_csv(baby_path)
baby

# Your code goes here.

df.pivot_table(index=index_col,
               columns=columns_col,
               values=values_col,
               aggfunc=func)

last_5_years = baby.query('Year >= 2018')
last_5_years

last_5_years.pivot_table(
    index='Year',
    columns='Sex',
    values='Count',
    aggfunc='sum',
)

# Look at the similarity to the snippet above!
(last_5_years
 .groupby(['Year', 'Sex'])
 [['Count']]
 .sum()
)

penguins

penguins.pivot_table(
    index='species',
    columns='island',
    # 'count' tallies the non-null entries in each group, and penguins had its
    # rows with missing values dropped when it was loaded, so every column
    # yields the same counts.
    values='bill_length_mm', # Choice of column here doesn't actually matter!
    aggfunc='count',
)

penguins.pivot_table(
    index='species',
    columns='island',
    values='bill_length_mm',
    aggfunc='count',
    fill_value=0,
)

counts = penguins.pivot_table(
    index='species',
    columns='sex',
    values='body_mass_g',
    aggfunc='count',
    fill_value=0
)
counts

joint = counts / counts.sum().sum()
joint

joint

# Recall, joint.sum(axis=0) sums across the rows (i.e. down each column),
# so the result has one total per **column**.
joint.sum(axis=0)

joint.sum(axis=1)

counts

counts

counts.sum(axis=0)

counts / counts.sum(axis=0)

# Your code goes here.

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
152	Chinstrap	Dream	46.5	17.9	192.0	3500.0	Female
153	Chinstrap	Dream	50.0	19.5	196.0	3900.0	Male
154	Chinstrap	Dream	51.3	19.2	193.0	3650.0	Male
...	...	...	...	...	...	...	...
341	Gentoo	Biscoe	50.4	15.7	222.0	5750.0	Male
342	Gentoo	Biscoe	45.2	14.8	212.0	5200.0	Female
343	Gentoo	Biscoe	49.9	16.1	213.0	5400.0	Male

	Name	Sex	Count	Year
0	Liam	M	20456	2022
1	Noah	M	18621	2022
2	Olivia	F	16573	2022
...	...	...	...	...
2085155	Wright	M	5	1880
2085156	York	M	5	1880
2085157	Zachariah	M	5	1880

Sex	F	M
Year
2018	1698373	1813377
2019	1675139	1790682
2020	1612393	1721588
2021	1635800	1743913
2022	1628730	1733166

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
0	Adelie	Torgersen	39.1	18.7	181.0	3750.0	Male
1	Adelie	Torgersen	39.5	17.4	186.0	3800.0	Female
2	Adelie	Torgersen	40.3	18.0	195.0	3250.0	Female
...	...	...	...	...	...	...	...
341	Gentoo	Biscoe	50.4	15.7	222.0	5750.0	Male
342	Gentoo	Biscoe	45.2	14.8	212.0	5200.0	Female
343	Gentoo	Biscoe	49.9	16.1	213.0	5400.0	Male

	body_mass_g	sex
species
Adelie	7550.0	MaleFemale
Chinstrap	7675.0	MaleFemale
Gentoo	14775.0	FemaleFemaleMale

	bill_length_mm	body_mass_g
island
Biscoe	38.98	3709.66
Dream	38.52	3701.36
Torgersen	39.04	3708.51

Lecture 3 – Aggregating¶

DSC 80, Spring 2025¶

Announcements 📣¶

Agenda¶

Question 🤔 (Answer at dsc80.com/q)

Data granularity and the groupby method¶

Example: Palmer Penguins¶

Granularity¶

Aggregating¶

Naive approach: looping through unique values¶

Grouping¶

"Split-apply-combine" paradigm¶

More examples¶

Question 🤔 (Answer at dsc80.com/q)

DataFrameGroupBy objects and aggregation¶

DataFrameGroupBy objects¶

Peeking under the hood¶

Aggregation¶

Column independence¶

Question 🤔 (Answer at dsc80.com/q)

Column selection and performance implications¶

Takeaways¶

Beyond default aggregation methods¶

The aggregate method¶

Example¶

Example¶

Example¶

Other DataFrameGroupBy methods¶

Split-apply-combine, revisited¶

Transformations¶

Transformations within groups¶

Filtering groups¶

Question 🤔 (Answer at dsc80.com/q)

Grouping with multiple columns¶

Grouping and indexes¶

Question 🤔 (Answer at dsc80.com/q)

Pivot tables using the pivot_table method¶

Pivot tables: an extension of grouping¶

pivot_table¶

Example¶

Reshaping¶

We will most likely end lecture here.

Distributions¶

Joint distribution¶

Marginal probabilities¶

Conditional probabilities¶

Conditional probabilities¶

Question 🤔 (Answer at dsc80.com/q)

Summary, next time¶

Summary¶

Next time¶

Data granularity and the `groupby` method¶

`DataFrameGroupBy` objects and aggregation¶

`DataFrameGroupBy` objects¶

The `aggregate` method¶

Other `DataFrameGroupBy` methods¶

Pivot tables using the `pivot_table` method¶

`pivot_table`¶