from dsc80_utils import *

import seaborn as sns
# Load seaborn's penguins dataset and drop every row that contains a missing value.
penguins = sns.load_dataset('penguins').dropna()
penguins

# Mean body mass across all rows, regardless of species.
penguins['body_mass_g'].mean()

# Naive approach: loop over the unique species, filter the rows for each one,
# and record that subset's mean body mass in a Series keyed by species name.

# Start from an empty float Series; entries are added by label inside the loop.
species_map = pd.Series([], dtype=float)

for species in penguins['species'].unique():
    # Boolean mask keeps only the rows belonging to the current species.
    species_only = penguins.loc[penguins['species'] == species]
    species_map.loc[species] = species_only['body_mass_g'].mean()

species_map

# Mean body_mass_g for each species

# groupby performs the split / aggregate / combine steps of the loop in one expression.
# NOTE(review): the comment above names body_mass_g, but this cell aggregates
# bill_length_mm - confirm which column is intended here.
penguins.groupby('species')['bill_length_mm'].mean()

# groupby on its own returns a DataFrameGroupBy object rather than a DataFrame;
# an aggregation method has to be called on it to get values back.
penguins.groupby('species')

# Simplified DataFrame for demonstration:
# seven rows picked by position, and the columns at positions 0, 5 and 6
# (the cells below refer to them as species, body_mass_g and sex).
penguins_small = penguins.iloc[[0, 150, 300, 1, 251, 151, 301], [0, 5, 6]]
penguins_small

# Creates one group for each unique value in the species column.
penguin_groups = penguins_small.groupby('species')
penguin_groups

# Mapping from each group key to the row labels that fall in that group.
penguin_groups.groups

# Pull out the rows of a single group as a regular DataFrame.
penguin_groups.get_group('Chinstrap')

# Same as the above!
penguins_small.query('species == "Chinstrap"')

penguins_small

# Select one column of the grouped object, then aggregate it per species.
penguins_small.groupby('species')['body_mass_g'].mean()

# Whoa, what happened in the sex column?
# sum() is applied to every non-grouping column, not only the numeric one.
penguins_small.groupby('species').sum()

# last() takes the last non-null entry of each column within each group.
penguins_small.groupby('species').last()

# max() is computed separately for each column, so a row of the result
# does not have to match any single row of penguins_small.
penguins_small.groupby('species').max()

# Same output shown again.
penguins_small.groupby('species').max()

# The Adelie penguin with a body mass of 3800g is Female!
# The mask is built from penguins_small itself so that the boolean index has
# exactly the same row labels as the frame being filtered, rather than relying
# on pandas to realign a mask computed on the full penguins table.
penguins_small.loc[(penguins_small['species'] == 'Adelie') & (penguins_small['body_mass_g'] == 3800.0)]

# Back to the big penguins dataset!
penguins

# Works, but involves wasted effort since the other columns had to be aggregated for no reason.
# (The column is selected only after sum() has already run on the whole grouped frame.)
penguins.groupby('species').sum()['bill_length_mm']

# This is a SeriesGroupBy object!
# Selecting the column before aggregating restricts the work to that one column.
penguins.groupby('species')['bill_length_mm']

# Saves time!
penguins.groupby('species')['bill_length_mm'].sum()

%%timeit
# Aggregate every column, then select one.
penguins.groupby('species').sum()['bill_length_mm']

%%timeit
# Select the column first, then aggregate only that column.
penguins.groupby('species')['bill_length_mm'].sum()

%%timeit
# The naive per-species loop from earlier, timed for comparison.
species_map = pd.Series([], dtype=float)

for species in penguins['species'].unique():
    species_only = penguins.loc[penguins['species'] == species]
    species_map.loc[species] = species_only['body_mass_g'].mean()

species_map

# Slower
# (aggregates every column before selecting bill_length_mm)
penguins.groupby('species').sum()['bill_length_mm']

# Faster
# (selects bill_length_mm before aggregating)
penguins.groupby('species')['bill_length_mm'].sum()

# Passing a list of aggregation names to aggregate applies each of them
# to the selected column, giving one result column per name.
(penguins
 .groupby('species')
 ['body_mass_g']
 .aggregate(['count', 'mean'])
)

# Passing a dict applies a different aggregation to each named column:
# the max of bill_length_mm and the unique values of island, per species.
(penguins
 .groupby('species')
 .aggregate({'bill_length_mm': 'max', 'island': 'unique'})
)

# Here, the argument to agg is a function,
# which takes in a pd.Series and returns a scalar.

def iqr(s):
    """Return the interquartile range of ``s``.

    Computed as the 75th percentile of ``s`` minus its 25th percentile,
    using ``np.percentile`` for both.
    """
    lower_quartile, upper_quartile = np.percentile(s, [25, 75])
    return upper_quartile - lower_quartile

# Apply the custom iqr function to the body_mass_g values of each species;
# agg calls it once per group and collects the scalar results.
(penguins
 .groupby('species')
 ['body_mass_g']
 .agg(iqr)
)

def z_score(x):
    """Standardize ``x``.

    Subtracts the mean of ``x`` from every value and divides by the standard
    deviation of ``x`` computed with ``ddof=0``.
    """
    centered = x - x.mean()
    spread = x.std(ddof=0)
    return centered / spread

# z-scores of body mass computed against the mean and spread of the whole column.
z_score(penguins['body_mass_g'])

# transform applies z_score within each species separately and returns a
# result aligned to the original rows, so each value is standardized
# relative to its own species.
z_mass = (penguins
          .groupby('species')
          ['body_mass_g']
          .transform(z_score))
z_mass

# Attach the within-species z-scores as a new column for display.
# NOTE(review): display_df is presumably provided by dsc80_utils and rows=8
# presumably limits how many rows are shown - confirm against that module.
display_df(penguins.assign(z_mass=z_mass), rows=8)

penguins.groupby('species')['body_mass_g'].mean()

# filter keeps or drops whole groups: the lambda receives each species'
# sub-DataFrame and returns a single boolean for it.
# Here: keep the rows of species whose mean bill length exceeds 39.
(penguins
 .groupby('species')
 .filter(lambda df: df['bill_length_mm'].mean() > 39)
)

# Here: keep the rows of species that have more than 100 rows.
(penguins
 .groupby('species')
 .filter(lambda df: df.shape[0] > 100)
)

penguins

# Group on two columns at once: one group per (species, island) pair,
# then take the mean of the two selected columns within each group.
species_and_island = (
    penguins
    .groupby(['species', 'island'])
    [['bill_length_mm', 'body_mass_g']]
    .mean()
)
species_and_island

species_and_island

# Selecting a column keeps the two-level (species, island) index.
species_and_island['body_mass_g']

# Indexing with only the first level returns all islands for that species.
species_and_island.loc['Adelie']

# A tuple supplies both levels and selects a single (species, island) row.
species_and_island.loc[('Adelie', 'Torgersen')]

# Move both index levels back into ordinary columns.
species_and_island.reset_index()

# as_index=False leaves the grouping keys as regular columns in the result
# instead of placing them in the index.
(penguins
 .groupby(['species', 'island'], as_index=False)
 [['bill_length_mm', 'body_mass_g']]
 .mean()
)

# Read the baby names data from data/baby.csv.
baby_path = Path('data') / 'baby.csv'
baby = pd.read_csv(baby_path)
baby

# General shape of a pivot_table call.
# NOTE(review): df, index_col, columns_col, values_col and func are not
# assigned anywhere in the visible source, so this cell reads as an
# illustrative template rather than something runnable as-is - confirm.
df.pivot_table(index=index_col,
               columns=columns_col,
               values=values_col,
               aggfunc=func)

# Keep only the rows with Year 2018 or later.
last_5_years = baby.query('Year >= 2018')
last_5_years

# One row per Year, one column per Sex; each cell is the sum of Count
# over the rows with that (Year, Sex) combination.
last_5_years.pivot_table(
    index='Year',
    columns='Sex',
    values='Count',
    aggfunc='sum',
)

# Look at the similarity to the snippet above!
# Grouping on both keys produces the same sums, but with one row per
# (Year, Sex) pair instead of one column per Sex.
(last_5_years
 .groupby(['Year', 'Sex'])
 [['Count']]
 .sum()
)

penguins

# Count penguins for every (species, island) combination.
penguins.pivot_table(
    index='species',
    columns='island',
    values='bill_length_mm', # Choice of column here doesn't actually matter!
    aggfunc='count',
)

# Same table, but combinations with no rows are filled with 0.
penguins.pivot_table(
    index='species',
    columns='island',
    values='bill_length_mm',
    aggfunc='count',
    fill_value=0,
)

# Counts for every (species, sex) combination, with missing combinations set to 0.
counts = penguins.pivot_table(
    index='species',
    columns='sex',
    values='body_mass_g',
    aggfunc='count',
    fill_value=0
)
counts

# Divide every cell by the grand total of the table, so all cells together sum to 1.
joint = counts / counts.sum().sum()
joint

joint

# Recall, joint.sum(axis=0) sums across the rows,
# which computes the sum of the **columns**.
# The result has one total per sex.
joint.sum(axis=0)

# Summing along axis=1 instead gives one total per species.
joint.sum(axis=1)

counts

counts

# Column totals: one count per sex.
counts.sum(axis=0)

# Divide each column by its own total, so every sex column sums to 1
# (the share of each species within that sex).
counts / counts.sum(axis=0)

Sex	F	M
Year
2018	1698373	1813377
2019	1675139	1790682
2020	1612393	1721588
2021	1635800	1743913
2022	1628730	1733166

Lecture 3 – Aggregating

DSC 80, Winter 2026

Announcements 📣

Agenda

Data granularity and the groupby method

Example: Palmer Penguins

Granularity

Aggregating

Naive approach: looping through unique values

Grouping

"Split-apply-combine" paradigm

More examples

Question 🤔

DataFrameGroupBy objects and aggregation

DataFrameGroupBy objects

Peeking under the hood

Aggregation

Column independence

Question 🤔

Column selection and performance implications

Takeaways

Beyond default aggregation methods

The aggregate method

Example

Example

Example

Other DataFrameGroupBy methods

Split-apply-combine, revisited

Transformations

Transformations within groups

Filtering groups

Question 🤔

Grouping with multiple columns

MultiIndex

Question 🤔

Pivot tables using the pivot_table method

Pivot tables: an extension of grouping

pivot_table

Example

Reshaping

We will most likely end lecture here.

Distributions

Joint distribution

Marginal probabilities

Conditional probabilities

Conditional probabilities

Question 🤔

Summary, next time

Summary

Next time

Data granularity and the `groupby` method

`DataFrameGroupBy` objects and aggregation

`DataFrameGroupBy` objects

The `aggregate` method

Other `DataFrameGroupBy` methods

Pivot tables using the `pivot_table` method

`pivot_table`