from dsc80_utils import *

def show_paradox_slides():
    src = 'https://docs.google.com/presentation/d/e/2PACX-1vSbFSaxaYZ0NcgrgqZLvjhkjX-5MQzAITWAsEFZHnix3j1c0qN8Vd1rogTAQP7F7Nf5r-JWExnGey7h/embed?start=false&rm=minimal'
    width = 960
    height = 569
    display(IFrame(src, width, height))

# Pandas Tutor setup
!pip install pandas-tutor
%reload_ext pandas_tutor
%set_pandas_tutor_options {"maxDisplayCols": 8, "nohover": True, "projectorMode": True}
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

Requirement already satisfied: pandas-tutor in /Users/tauhidur/miniforge3/envs/dsc80/lib/python3.12/site-packages (2.0.4)
Requirement already satisfied: pandas>=1.3 in /Users/tauhidur/miniforge3/envs/dsc80/lib/python3.12/site-packages (from pandas-tutor) (2.2.3)
Requirement already satisfied: mypy-extensions==0.4.3 in /Users/tauhidur/miniforge3/envs/dsc80/lib/python3.12/site-packages (from pandas-tutor) (0.4.3)
Requirement already satisfied: typing-extensions<5.0,>=4.1 in /Users/tauhidur/miniforge3/envs/dsc80/lib/python3.12/site-packages (from pandas-tutor) (4.12.2)
Requirement already satisfied: numpy>=1.26.0 in /Users/tauhidur/miniforge3/envs/dsc80/lib/python3.12/site-packages (from pandas>=1.3->pandas-tutor) (2.1.1)
Requirement already satisfied: python-dateutil>=2.8.2 in /Users/tauhidur/miniforge3/envs/dsc80/lib/python3.12/site-packages (from pandas>=1.3->pandas-tutor) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /Users/tauhidur/miniforge3/envs/dsc80/lib/python3.12/site-packages (from pandas>=1.3->pandas-tutor) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in /Users/tauhidur/miniforge3/envs/dsc80/lib/python3.12/site-packages (from pandas>=1.3->pandas-tutor) (2025.1)
Requirement already satisfied: six>=1.5 in /Users/tauhidur/miniforge3/envs/dsc80/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas>=1.3->pandas-tutor) (1.17.0)

import seaborn as sns
penguins = sns.load_dataset('penguins').dropna()
penguins

species_and_island = (
    penguins
    .groupby(['species', 'island'])
    [['bill_length_mm', 'body_mass_g']]
    .mean()
)
species_and_island

species_and_island

species_and_island['body_mass_g']

species    island   
Adelie     Biscoe       3709.66
           Dream        3701.36
           Torgersen    3708.51
Chinstrap  Dream        3733.09
Gentoo     Biscoe       5092.44
Name: body_mass_g, dtype: float64

species_and_island.loc['Adelie']

species_and_island.loc[('Adelie', 'Torgersen')]

bill_length_mm      39.04
body_mass_g       3708.51
Name: (Adelie, Torgersen), dtype: float64

species_and_island.reset_index()

(penguins
 .groupby(['species', 'island'], as_index=False)
 [['bill_length_mm', 'body_mass_g']]
 .mean()
)

baby_path = Path('data') / 'baby.csv'
baby = pd.read_csv(baby_path)
baby

# Your code goes here.

df.pivot_table(index=index_col,
               columns=columns_col,
               values=values_col,
               aggfunc=func)

penguins.pivot_table(
    index='species',
    columns='island',
    values='bill_length_mm', # Choice of column here doesn't actually matter!
    aggfunc='count',
)

penguins.pivot_table(
    index='species',
    columns='island',
    values='bill_length_mm',
    aggfunc='count',
    fill_value=0,
)

# Look at the similarity to the snippet above!
(penguins
 .groupby(['species', 'island'])
 [['bill_length_mm']]
 .count()
)

penguins = sns.load_dataset('penguins').dropna()
penguins

counts = penguins.pivot_table(
    index='species',
    columns='sex',
    values='body_mass_g',
    aggfunc='size',
    fill_value=0,
)
counts

counts.sum()

sex
Female    165
Male      168
dtype: int64

joint = counts/counts.sum().sum()
joint

# Recall, joint.sum(axis=0) sums across the rows,
# which computes the sum of the **columns**.
joint.sum(axis=0)

sex
Female    0.5
Male      0.5
dtype: float64

joint.sum(axis=1)

species
Adelie       0.44
Chinstrap    0.20
Gentoo       0.36
dtype: float64

counts

counts.sum(axis=0)

sex
Female    165
Male      168
dtype: int64

counts / counts.sum(axis=0)

lisa = pd.DataFrame([[20, 46], [18, 54], [5, 20]],
    columns=['Units', 'Grade Points Earned'],
    index=['Fall', 'Winter', 'Spring'],
)
lisa.columns.name = 'Lisa' # This allows us to see the name "Lisa" in the top left of the DataFrame.

bart = pd.DataFrame([[5, 10], [5, 13.5], [22, 81.4]],
    columns=['Units', 'Grade Points Earned'],
    index=['Fall', 'Winter', 'Spring'],
)
bart.columns.name = 'Bart'

dfs_side_by_side(lisa, bart)

quarterly_gpas = pd.DataFrame({
    "Lisa's Quarter GPA": lisa['Grade Points Earned'] / lisa['Units'],
    "Bart's Quarter GPA": bart['Grade Points Earned'] / bart['Units'],
})

quarterly_gpas

# Helper function to show lisa and bart side-by-side to save screen space
dfs_side_by_side(lisa, bart)

(quarterly_gpas
 .assign(Lisa_Units=lisa['Units'],
         Bart_Units=bart['Units'])
 .iloc[:, [0, 2, 1, 3]]
)

show_paradox_slides()

IFrame('https://www.youtube-nocookie.com/embed/zeuW1Z2EtLs?si=l2Dl7P-5RCq3ODpo',
       width=800, height=450)

baby_path = Path('data') / 'baby.csv'
baby = pd.read_csv(baby_path)
baby

nyt_path = Path('data') / 'nyt_names.csv'
nyt = pd.read_csv(nyt_path)
nyt

nyt_small = nyt.iloc[[11, 12, 14]].reset_index(drop=True)

names_to_keep = ['Julius', 'Karen', 'Noah']
baby_small = (baby
 .query("Year == 2020 and Name in @names_to_keep")
 .reset_index(drop=True)
)

dfs_side_by_side(baby_small, nyt_small)

%%pt
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name')

%%pt
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name')

%%pt
# Note the NaNs!
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name', how='left')

%%pt
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name', how='right')

%%pt
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name', how='outer')

df1 = pd.DataFrame({'a': [1, 2, 3]}, index=['hello', 'dsc80', 'students'])
df2 = pd.DataFrame({'b': [10, 20, 30]}, index=['dsc80', 'is', 'awesome'])
dfs_side_by_side(df1, df2)

df1['a'] + df2['b']

awesome      NaN
dsc80       12.0
hello        NaN
is           NaN
students     NaN
dtype: float64

# Run this cell to set up the next example.
profs = pd.DataFrame(
[['Sam', 'UCB', 5],
 ['Sam', 'UCSD', 5],
 ['Janine', 'UCSD', 8],
 ['Marina', 'UIC', 7],
 ['Justin', 'OSU', 5],
 ['Soohyun', 'UCSD', 2],
 ['Suraj', 'UCB', 2]],
    columns=['Name', 'School', 'Years']
)

schools = pd.DataFrame({
    'Abr': ['UCSD', 'UCLA', 'UCB', 'UIC'],
    'Full': ['University of California San Diego', 'University of California, Los Angeles', 'University of California, Berkeley', 'University of Illinois Chicago']
})

programs = pd.DataFrame({
    'uni': ['UCSD', 'UCSD', 'UCSD', 'UCB', 'OSU', 'OSU'],
    'dept': ['Math', 'HDSI', 'COGS', 'CS', 'Math', 'CS'],
    'grad_students': [205, 54, 281, 439, 304, 193]
})

dfs_side_by_side(profs, schools)

%%pt
profs.merge(schools, left_on='School', right_on='Abr', how='left')

dfs_side_by_side(profs, programs)

%%pt
profs.merge(programs, left_on='School', right_on='uni', how='inner')

df = profs.merge(programs, left_on='School', right_on='uni')
df.shape[0] == (____).sum()

dfs_side_by_side(profs, programs)

# Your code goes here.

cate_counts = (
    baby
    .merge(nyt, left_on='Name', right_on='nyt_name')
    .groupby(['category', 'Year'])
    ['Count']
    .sum()
    .reset_index()
)
cate_counts

# We'll talk about plotting code soon!
import plotly.express as px
fig = px.line(cate_counts, x='Year', y='Count',
              facet_col='category', facet_col_wrap=3,
              facet_row_spacing=0.15,
              width=600, height=400)
fig.update_yaxes(matches=None, showticklabels=False)

%%timeit
baby['Name'].apply(number_of_vowels)

%%timeit
res = []
for name in baby['Name']:
    res.append(number_of_vowels(name))

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
0	Adelie	Torgersen	39.1	18.7	181.0	3750.0	Male
1	Adelie	Torgersen	39.5	17.4	186.0	3800.0	Female
2	Adelie	Torgersen	40.3	18.0	195.0	3250.0	Female
...	...	...	...	...	...	...	...
341	Gentoo	Biscoe	50.4	15.7	222.0	5750.0	Male
342	Gentoo	Biscoe	45.2	14.8	212.0	5200.0	Female
343	Gentoo	Biscoe	49.9	16.1	213.0	5400.0	Male

	Name	Sex	Count	Year
0	Liam	M	20456	2022
1	Noah	M	18621	2022
2	Olivia	F	16573	2022
...	...	...	...	...
2085155	Wright	M	5	1880
2085156	York	M	5	1880
2085157	Zachariah	M	5	1880

Sex	F	M
Year
2018	1698373	1813377
2019	1675139	1790682
2020	1612393	1721588
2021	1635800	1743913
2022	1628730	1733166

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
0	Adelie	Torgersen	39.1	18.7	181.0	3750.0	Male
1	Adelie	Torgersen	39.5	17.4	186.0	3800.0	Female
2	Adelie	Torgersen	40.3	18.0	195.0	3250.0	Female
...	...	...	...	...	...	...	...
341	Gentoo	Biscoe	50.4	15.7	222.0	5750.0	Male
342	Gentoo	Biscoe	45.2	14.8	212.0	5200.0	Female
343	Gentoo	Biscoe	49.9	16.1	213.0	5400.0	Male

Phone Type	Stars for Dirty Birds	Stars for The Loft
Android	4.24	4.0
iPhone	2.99	2.79
All	3.32	3.37

	bill_length_mm	body_mass_g
island
Biscoe	38.98	3709.66
Dream	38.52	3701.36
Torgersen	39.04	3708.51

island	Biscoe	Dream	Torgersen
species
Adelie	44.0	55.0	47.0
Chinstrap	NaN	68.0	NaN
Gentoo	119.0	NaN	NaN

	nyt_name	category
0	Lucifer	forbidden
1	Lilith	forbidden
2	Danger	forbidden
...	...	...
20	Venus	celestial
21	Celestia	celestial
22	Skye	celestial

	Name	Sex	Count	Year
0	Noah	M	18407	2020
1	Julius	M	966	2020
2	Karen	F	330	2020
3	Noah	F	306	2020
4	Karen	M	6	2020

	Name	School	Years
0	Sam	UCB	5
1	Sam	UCSD	5
2	Janine	UCSD	8
3	Marina	UIC	7
4	Justin	OSU	5
5	Soohyun	UCSD	2
6	Suraj	UCB	2

	Abr	Full
0	UCSD	University of California San Diego
1	UCLA	University of California, Los Angeles
2	UCB	University of California, Berkeley
3	UIC	University of Illinois Chicago

	uni	dept	grad_students
0	UCSD	Math	205
1	UCSD	HDSI	54
2	UCSD	COGS	281
3	UCB	CS	439
4	OSU	Math	304
5	OSU	CS	193

	category	Year	Count
0	boomer	1880	292
1	boomer	1881	298
2	boomer	1882	326
...	...	...	...
659	mythology	2020	3516
660	mythology	2021	3895
661	mythology	2022	4049

Lisa	Units	Grade Points Earned
Fall	20	46
Winter	18	54
Spring	5	20

Bart	Units	Grade Points Earned
Fall	5	10.0
Winter	5	13.5
Spring	22	81.4

	Lisa's Quarter GPA	Bart's Quarter GPA
Fall	2.3	2.0
Winter	3.0	2.7
Spring	4.0	3.7

Lisa	Units	Grade Points Earned
Fall	20	46
Winter	18	54
Spring	5	20

Lecture 4 – Simpson's Paradox, Joining, and Transforming¶

DSC 80, Spring 2025¶

Announcements 📣¶

Agenda¶

Grouping with multiple columns¶

Grouping and indexes¶

Pivot tables using the pivot_table method¶

Pivot tables: an extension of grouping¶

pivot_table¶

Distributions¶

Example: Palmer Penguins¶

Joint distribution¶

Marginal probabilities¶

Conditional probabilities¶

Conditional probabilities¶

Question 🤔 (Answer at dsc80.com/q)

Simpson's paradox¶

Example: Grades¶

Quarter-specific vs. overall GPAs¶

Question 🤔 (Answer at dsc80.com/q)

What happened?¶

Simpson's paradox¶

Example: How Berkeley was almost sued for gender discrimination (1973)¶

What happened?¶

Example: Restaurant reviews and phone types¶

Rule of thumb 👍¶

Takeaways¶

Really?¶

Further reading¶

Merging¶

Example: Name categories¶

Loading in the data¶

Merging¶

Example merge¶

The merge method¶

Join types: inner joins¶

Different join types¶

Different join types handle mismatches differently¶

Notes on the merge method¶

Lots of pandas operations do an implicit outer join!¶

Many-to-one & many-to-many joins¶

One-to-one joins¶

Many-to-one joins¶

Many-to-many joins¶

Question 🤔 (Answer at dsc80.com/q)

Returning back to our original question¶

Questions? 🤔

Other data representations¶

Representations of tabular data¶

DataFrames vs. spreadsheets¶

DataFrames vs. matrices¶

DataFrames vs. relations¶

Summary¶

Next time¶

Pivot tables using the `pivot_table` method¶

`pivot_table`¶

The `merge` method¶

Notes on the `merge` method¶

Lots of `pandas` operations do an implicit outer join!¶