# You'll start seeing this cell in most lectures.
# It exists to hide all of the import statements and other setup
# code we need in lecture notebooks.

from dsc80_utils import *

dogs = pd.read_csv('data/dogs43.csv')
dogs.head(2)

whoa = np.random.choice([True, False], size=len(dogs))
(dogs[whoa]
 .groupby('size')
 .max()
 .get('longevity')
)

size
large     11.92
medium    12.92
small     16.50
Name: longevity, dtype: float64

arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

# The shape (10,) means that the array only has a single dimension,
# of size 10.
arr.shape

(10,)

2 ** arr

array([  1,   2,   4,   8,  16,  32,  64, 128, 256, 512])

(2 ** arr).sum()

np.int64(1023)

(2 ** arr).mean()

np.float64(102.3)

(2 ** arr).max()

np.int64(512)

(2 ** arr).argmax()

np.int64(9)

%%timeit
squares = []
for i in range(1_000_000):
    squares.append(i * i)

25.8 ms ± 388 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)

%%timeit
squares = np.arange(1_000_000) ** 2

1.05 ms ± 43.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

nums = np.array([
    [5, 1, 9, 7],
    [9, 8, 2, 3],
    [2, 5, 0, 4]
])

nums

array([[5, 1, 9, 7],
       [9, 8, 2, 3],
       [2, 5, 0, 4]])

# nums has 3 rows and 4 columns.
nums.shape

(3, 4)

# Here, we're asking to reshape np.arange(1, 7)
# so that it has 2 rows and 3 columns.
a = np.arange(1, 7).reshape((2, 3))
a

array([[1, 2, 3],
       [4, 5, 6]])

a

array([[1, 2, 3],
       [4, 5, 6]])

a.sum(axis=0)

array([5, 7, 9])

a.sum(axis=1)

array([ 6, 15])

a

array([[1, 2, 3],
       [4, 5, 6]])

# Accesses row 0 and all columns.
a[0, :]

array([1, 2, 3])

# Same as the above.
a[0]

array([1, 2, 3])

# Accesses all rows and column 1.
a[:, 1]

array([2, 5])

# Accesses row 0 and columns 1 and onwards.
a[0, 1:]

array([2, 3])

s = (5, 3)
grid = np.ones(s) * 2 * np.arange(1, 16).reshape(s)
# grid[-1, 1:].sum()

from PIL import Image
img_path = Path('imgs') / 'bentley.jpg'
img = np.asarray(Image.open(img_path)) / 255

img

array([[[0.4 , 0.33, 0.24],
        [0.42, 0.35, 0.25],
        [0.43, 0.36, 0.26],
        ...,
        [0.5 , 0.44, 0.36],
        [0.51, 0.44, 0.36],
        [0.51, 0.44, 0.36]],

       [[0.39, 0.33, 0.23],
        [0.42, 0.36, 0.26],
        [0.44, 0.37, 0.27],
        ...,
        [0.51, 0.44, 0.36],
        [0.52, 0.45, 0.37],
        [0.52, 0.45, 0.38]],

       [[0.38, 0.31, 0.21],
        [0.41, 0.35, 0.24],
        [0.44, 0.37, 0.27],
        ...,
        [0.52, 0.45, 0.38],
        [0.53, 0.46, 0.39],
        [0.53, 0.47, 0.4 ]],

       ...,

       [[0.71, 0.64, 0.55],
        [0.71, 0.65, 0.55],
        [0.68, 0.62, 0.52],
        ...,
        [0.58, 0.49, 0.41],
        [0.56, 0.47, 0.39],
        [0.56, 0.47, 0.39]],

       [[0.5 , 0.44, 0.34],
        [0.42, 0.37, 0.26],
        [0.44, 0.38, 0.28],
        ...,
        [0.4 , 0.33, 0.25],
        [0.55, 0.48, 0.4 ],
        [0.58, 0.5 , 0.42]],

       [[0.38, 0.33, 0.22],
        [0.49, 0.44, 0.33],
        [0.56, 0.51, 0.4 ],
        ...,
        [0.15, 0.08, 0.  ],
        [0.28, 0.21, 0.13],
        [0.42, 0.35, 0.27]]])

img.shape

(200, 263, 3)

plt.imshow(img)
plt.axis('off');

# Averages over axis 2 (the 3 color channels of each pixel),
# leaving a 2D array with a single value per pixel.
mean_2d = img.mean(axis=2)
mean_2d

array([[0.32, 0.34, 0.35, ..., 0.43, 0.44, 0.44],
       [0.31, 0.35, 0.36, ..., 0.44, 0.45, 0.45],
       [0.3 , 0.33, 0.36, ..., 0.45, 0.46, 0.47],
       ...,
       [0.64, 0.64, 0.6 , ..., 0.49, 0.47, 0.47],
       [0.43, 0.35, 0.37, ..., 0.32, 0.48, 0.5 ],
       [0.31, 0.42, 0.49, ..., 0.07, 0.21, 0.34]])

plt.imshow(mean_2d)
plt.axis('off');

# np.newaxis is an alias for None.
# It helps us introduce an additional axis.
np.arange(5)[:, np.newaxis]

array([[0],
       [1],
       [2],
       [3],
       [4]])

np.repeat(np.arange(5)[:, np.newaxis], 3, axis=1)

array([[0, 0, 0],
       [1, 1, 1],
       [2, 2, 2],
       [3, 3, 3],
       [4, 4, 4]])

# mean_2d[:, :, np.newaxis] adds a trailing axis of size 1; repeating 3 times
# along that axis copies each pixel's mean into all 3 channel slots.
mean_3d = np.repeat(mean_2d[:, :, np.newaxis], 3, axis=2)

plt.imshow(mean_3d)
plt.axis('off');

# 3x3 matrix of weights: row i holds the weights used to compute
# output channel i from the three input channel values of a pixel.
sepia_filter = np.array([
    [0.393, 0.769, 0.189],
    [0.349, 0.686, 0.168],
    [0.272, 0.534, 0.131]
])

# Multiplies each pixel by the sepia_filter matrix.
# (The matrix product acts on the last axis of img, so using the
# transpose turns each pixel p into sepia_filter @ p.)
# Then, clips each RGB value to be between 0 and 1.
filtered = (img @ sepia_filter.T).clip(0, 1)
filtered

array([[[0.46, 0.41, 0.32],
        [0.48, 0.43, 0.33],
        [0.5 , 0.44, 0.35],
        ...,
        [0.6 , 0.53, 0.42],
        [0.6 , 0.54, 0.42],
        [0.61, 0.54, 0.42]],

       [[0.45, 0.4 , 0.31],
        [0.49, 0.43, 0.34],
        [0.5 , 0.45, 0.35],
        ...,
        [0.61, 0.54, 0.42],
        [0.62, 0.55, 0.43],
        [0.63, 0.56, 0.43]],

       [[0.43, 0.38, 0.3 ],
        [0.47, 0.42, 0.33],
        [0.51, 0.45, 0.35],
        ...,
        [0.63, 0.56, 0.44],
        [0.64, 0.57, 0.44],
        [0.64, 0.57, 0.45]],

       ...,

       [[0.88, 0.78, 0.61],
        [0.89, 0.79, 0.61],
        [0.84, 0.75, 0.58],
        ...,
        [0.68, 0.61, 0.47],
        [0.65, 0.58, 0.45],
        [0.65, 0.58, 0.45]],

       [[0.6 , 0.53, 0.42],
        [0.5 , 0.44, 0.35],
        [0.52, 0.46, 0.36],
        ...,
        [0.45, 0.4 , 0.31],
        [0.66, 0.59, 0.46],
        [0.69, 0.62, 0.48]],

       [[0.45, 0.4 , 0.31],
        [0.59, 0.53, 0.41],
        [0.69, 0.61, 0.48],
        ...,
        [0.12, 0.1 , 0.08],
        [0.3 , 0.26, 0.21],
        [0.48, 0.43, 0.33]]])

plt.imshow(filtered)
plt.axis('off');

import pandas as pd
import numpy as np

# You'll see the Path(...) / subpath syntax a lot.
# It creates the correct path to your file, 
# whether you're using Windows, macOS, or Linux.
dog_path = Path('data') / 'dogs43.csv'
dogs = pd.read_csv(dog_path)
dogs

dogs.head(3)

dogs.tail(2)

dogs.shape

(43, 7)

# The default index of a DataFrame is 0, 1, 2, 3, ...
dogs.index

RangeIndex(start=0, stop=43, step=1)

dogs.get('breed')

0                   Brittany
1              Cairn Terrier
2     English Cocker Spaniel
               ...          
40               Bullmastiff
41                   Mastiff
42             Saint Bernard
Name: breed, Length: 43, dtype: object

dogs.get(['breed', 'kind', 'longevity'])

# Note that the index is no longer 0, 1, 2, ...!
dogs.sort_values('height', ascending=False)

# This sorts by 'height', 
# then breaks ties by 'longevity'.
# Note the difference in the last three rows between
# this DataFrame and the one above.
dogs.sort_values(['height', 'longevity'],
                 ascending=False)

dogs

dogs.set_index('breed')

# The above cell didn't involve an assignment statement,
# so dogs was unchanged.
dogs

# By reassigning dogs, our changes will persist.
dogs = dogs.set_index('breed')
dogs

# There used to be 7 columns, but now there are only 6!
dogs.shape

(43, 6)

from IPython.display import display
def display_df(df, rows=pd.options.display.max_rows, cols=pd.options.display.max_columns):
    """Display df, showing at most `rows` rows and `cols` columns.

    The pandas display limits are overridden only for the duration of
    this call; pd.option_context restores the previous settings on exit.

    Parameters
    ----------
    df : the object handed to IPython's display (normally a DataFrame).
    rows : value used for the "display.max_rows" option.
    cols : value used for the "display.max_columns" option.

    Note: the default values are read from pd.options once, when this
    function is defined, not each time it is called.
    """
    temporary_options = (
        "display.max_rows", rows,
        "display.max_columns", cols,
    )
    with pd.option_context(*temporary_options):
        display(df)

display_df(dogs.sort_values('weight', ascending=False),
           rows=43)

dogs

dogs.get('size')

breed
Brittany                  medium
Cairn Terrier              small
English Cocker Spaniel    medium
                           ...  
Bullmastiff                large
Mastiff                    large
Saint Bernard              large
Name: size, Length: 43, dtype: object

# This doesn't error, but sometimes we'd like it to.
dogs.get('size oops!')

dogs

# Returns a Series.
dogs['kind']

breed
Brittany                  sporting
Cairn Terrier              terrier
English Cocker Spaniel    sporting
                            ...   
Bullmastiff                working
Mastiff                    working
Saint Bernard              working
Name: kind, Length: 43, dtype: object

# Returns a DataFrame.
dogs[['kind', 'size']]

# 🤔
dogs[['kind']]

# Breeds are stored in the index, which is not a column!
dogs['breed']

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/miniforge3/envs/dsc80/lib/python3.12/site-packages/pandas/core/indexes/base.py:3805, in Index.get_loc(self, key)
   3804 try:
-> 3805     return self._engine.get_loc(casted_key)
   3806 except KeyError as err:

File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7081, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7089, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'breed'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[85], line 2
      1 # Breeds are stored in the index, which is not a column!
----> 2 dogs['breed']

File ~/miniforge3/envs/dsc80/lib/python3.12/site-packages/pandas/core/frame.py:4102, in DataFrame.__getitem__(self, key)
   4100 if self.columns.nlevels > 1:
   4101     return self._getitem_multilevel(key)
-> 4102 indexer = self.columns.get_loc(key)
   4103 if is_integer(indexer):
   4104     indexer = [indexer]

File ~/miniforge3/envs/dsc80/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812, in Index.get_loc(self, key)
   3807     if isinstance(casted_key, slice) or (
   3808         isinstance(casted_key, abc.Iterable)
   3809         and any(isinstance(x, slice) for x in casted_key)
   3810     ):
   3811         raise InvalidIndexError(key)
-> 3812     raise KeyError(key) from err
   3813 except TypeError:
   3814     # If we have a listlike key, _check_indexing_error will raise
   3815     #  InvalidIndexError. Otherwise we fall through and re-raise
   3816     #  the TypeError.
   3817     self._check_indexing_error(key)

KeyError: 'breed'

dogs.index

Index(['Brittany', 'Cairn Terrier', 'English Cocker Spaniel', 'Cocker Spaniel',
       'Shetland Sheepdog', 'Siberian Husky', 'Lhasa Apso',
       'Miniature Schnauzer', 'Chihuahua', 'English Springer Spaniel',
       'German Shorthaired Pointer', 'Pointer', 'Tibetan Spaniel',
       'Labrador Retriever', 'Maltese', 'Shih Tzu', 'Irish Setter',
       'Golden Retriever', 'Chesapeake Bay Retriever', 'Tibetan Terrier',
       'Gordon Setter', 'Pug', 'Norfolk Terrier', 'English Toy Spaniel',
       'Cavalier King Charles Spaniel', 'Basenji',
       'Staffordshire Bull Terrier', 'Pembroke Welsh Corgi', 'Clumber Spaniel',
       'Dandie Dinmont Terrier', 'Giant Schnauzer', 'Scottish Terrier',
       'Kerry Blue Terrier', 'Afghan Hound', 'Newfoundland',
       'Rhodesian Ridgeback', 'Borzoi', 'Bull Terrier', 'Alaskan Malamute',
       'Bloodhound', 'Bullmastiff', 'Mastiff', 'Saint Bernard'],
      dtype='object', name='breed')

dogs

# What are the unique kinds of dogs?
dogs['kind'].unique()

array(['sporting', 'terrier', 'herding', 'working', 'non-sporting', 'toy',
       'hound'], dtype=object)

# How many unique kinds of dogs are there?
dogs['kind'].nunique()

7

# What's the distribution of kinds?
dogs['kind'].value_counts()

kind
sporting        12
terrier          8
working          7
toy              6
hound            5
non-sporting     3
herding          2
Name: count, dtype: int64

# What's the mean of the 'longevity' column?
dogs['longevity'].mean()

np.float64(11.340697674418605)

# Tell me more about the 'weight' column.
dogs['weight'].describe()

count     43.00
mean      49.35
std       39.42
          ...  
50%       36.50
75%       67.50
max      175.00
Name: weight, Length: 8, dtype: float64

# Sort the 'lifetime_cost' column. Note that here we're using sort_values on a Series, not a DataFrame!
dogs['lifetime_cost'].sort_values()

breed
Mastiff                       13581.0
Bloodhound                    13824.0
Bullmastiff                   13936.0
                               ...   
German Shorthaired Pointer    25842.0
Chihuahua                     26250.0
Giant Schnauzer               26686.0
Name: lifetime_cost, Length: 43, dtype: float64

# Gives us the index of the largest value, not the largest value itself.
dogs['lifetime_cost'].idxmax()

'Giant Schnauzer'

dogs.loc['Pug']

kind                 toy
lifetime_cost    18527.0
longevity           11.0
size              medium
weight              16.0
height              16.0
Name: Pug, dtype: object

# The first argument is the row label.
#        ↓
dogs.loc['Pug', 'longevity']
#                  ↑
# The second argument is the column label.

np.float64(11.0)

type(dogs.loc)

pandas.core.indexing._LocIndexer

type(dogs.sort_values)

method

dogs

dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], 'size']

breed
Cocker Spaniel         small
Labrador Retriever    medium
Name: size, dtype: object

dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], ['kind', 'size', 'height']]

# Note that the 'weight' column is included!
dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], 'lifetime_cost': 'weight']

dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], :]

# Shortcut for the line above.
dogs.loc[['Cocker Spaniel', 'Labrador Retriever']]

dogs

dogs.loc[dogs['weight'] < 10]

dogs.loc[dogs.index.str.contains('Retriever')]

# Because querying is so common, there's a shortcut:
dogs[dogs.index.str.contains('Retriever')]

# Empty DataFrame – not an error!
dogs.loc[dogs['kind'] == 'beaver']

dogs

# Series!
dogs.loc['Maltese']

dogs_reset = dogs.reset_index()
dogs_reset

# DataFrame!
dogs_reset[dogs_reset['breed'] == 'Maltese']

dogs

dogs[(dogs['weight'] < 20) & (dogs['kind'] == 'terrier')]

dogs

dogs.query('weight < 20 and kind == "terrier"')

dogs.query('kind in ["sporting", "terrier"] and lifetime_cost < 20000')

dogs

dogs.iloc[1:15, :-2]

# The 'weight' of the breed with the highest 'longevity':
# sort in descending order, then iloc[0] takes the first value by position.
dogs.sort_values('longevity', ascending=False)['weight'].iloc[0]

# Finding the breed itself involves sorting, but not iloc.
dogs.sort_values('longevity', ascending=False).index[0]

jack = pd.DataFrame({1: ['fee', 'fi'], 
                     '1': ['fo', 'fum']})
jack

# jack[1]

# jack[[1]]

# jack['1']

# jack[[1, 1]]

# jack.loc[1]

# jack.loc[jack[1] == 'fo']

# jack[1, ['1', 1]]

# jack.loc[1,1]

dogs.assign(cost_per_year=dogs['lifetime_cost'] / dogs['longevity'])

dogs

# Finds the rows corresponding to the five cheapest to own breeds on a per-year basis.
(dogs
 .assign(cost_per_year=dogs['lifetime_cost'] / dogs['longevity'])
 .sort_values('cost_per_year')
 .iloc[:5]
)

dogs.assign(**{'cost per year 💵': dogs['lifetime_cost'] / dogs['longevity']})

# By default, .copy() returns a deep copy of the object it is called on,
# meaning that if you change the copy the original remains unmodified.
dogs_copy = dogs.copy()
dogs_copy.head(2)

dogs_copy['cost_per_year'] = dogs_copy['lifetime_cost'] / dogs_copy['longevity']
dogs_copy

dogs_copy

def cost_in_thousands():
    """Divide the 'lifetime_cost' column of the global dogs_copy by 1000.

    Takes no arguments and returns None; the column in dogs_copy is
    overwritten. Every call divides again, so running this twice leaves
    the column divided by 1,000,000 rather than 1,000.
    """
    column = 'lifetime_cost'
    dogs_copy[column] = dogs_copy[column] / 1000

# What happens when we run this twice?
cost_in_thousands()

dogs_copy

dogs['lifetime_cost']

dogs['lifetime_cost'].to_numpy()

dogs

dogs.dtypes

dogs

# Gives the types as well as the space taken up by the DataFrame.
dogs.info()

dogs['lifetime_cost'] = dogs['lifetime_cost'].astype('uint32')

dogs.info()

dog_path

dogs = pd.read_csv(dog_path, dtype={'lifetime_cost': 'uint32'})
dogs

dogs.dtypes

dogs

# Max element in each column.
dogs.max()

# Max element in each row – throws an error since there are different types in each row.
# dogs.max(axis=1)

# The number of unique values in each column.
dogs.nunique()

# describe doesn't accept an axis argument; it works on every numeric column in the DataFrame it is called on.
dogs.describe()

all_dogs = pd.read_csv(Path('data') / 'all_dogs.csv')
all_dogs

# Your code goes here.

Pandas dtype	Python type	NumPy type	SQL type	Usage
int64	int	int_, int8,...,int64, uint8,...,uint64	INT, BIGINT	Integer numbers
float64	float	float16, float32, float64 (the float_ alias was removed in NumPy 2.0)	FLOAT	Floating point numbers
bool	bool	bool_	BOOL	True/False values
datetime64 or Timestamp	datetime.datetime	datetime64	DATETIME	Date and time values
timedelta64 or Timedelta	datetime.timedelta	timedelta64	NA	Differences between two datetimes
category	NA	NA	ENUM	Finite list of text values
object	str	string, unicode	NA	Text
object	NA	object	NA	Mixed types

	breed	kind	lifetime_cost	longevity	size	weight	height
0	Brittany	sporting	22589.0	12.92	medium	35.0	19.0
1	Cairn Terrier	terrier	21992.0	13.84	small	14.0	10.0

	breed	kind	lifetime_cost	longevity	size	weight	height
41	Mastiff	working	13581.0	6.50	large	175.0	30.0
42	Saint Bernard	working	20022.0	7.78	large	155.0	26.5

	kind	lifetime_cost	longevity	size	weight	height
breed
Brittany	sporting	22589.0	12.92	medium	35.0	19.0
Cairn Terrier	terrier	21992.0	13.84	small	14.0	10.0
English Cocker Spaniel	sporting	18993.0	11.66	medium	30.0	16.0
...	...	...	...	...	...	...
Bullmastiff	working	13936.0	7.57	large	115.0	25.5
Mastiff	working	13581.0	6.50	large	175.0	30.0
Saint Bernard	working	20022.0	7.78	large	155.0	26.5

	kind	lifetime_cost	longevity	size	weight	height
breed
Mastiff	working	13581.0	6.50	large	175.0	30.00
Saint Bernard	working	20022.0	7.78	large	155.0	26.50
Newfoundland	working	19351.0	9.32	large	125.0	27.00
Bullmastiff	working	13936.0	7.57	large	115.0	25.50
Bloodhound	hound	13824.0	6.75	large	85.0	25.00
Borzoi	hound	16176.0	9.08	large	82.5	28.00
Alaskan Malamute	working	21986.0	10.67	large	80.0	24.00
Rhodesian Ridgeback	hound	16530.0	9.10	large	77.5	25.50
Giant Schnauzer	working	26686.0	10.00	large	77.5	25.50
Clumber Spaniel	sporting	18084.0	10.00	medium	70.0	18.50
Labrador Retriever	sporting	21299.0	12.04	medium	67.5	23.00
Chesapeake Bay Retriever	sporting	16697.0	9.48	large	67.5	23.50
Irish Setter	sporting	20323.0	11.63	large	65.0	26.00
German Shorthaired Pointer	sporting	25842.0	11.46	large	62.5	24.00
Gordon Setter	sporting	19605.0	11.10	large	62.5	25.00
Bull Terrier	terrier	18490.0	10.21	medium	60.0	21.50
Golden Retriever	sporting	21447.0	12.04	medium	60.0	22.75
Pointer	sporting	24445.0	12.42	large	59.5	25.50
Afghan Hound	hound	24077.0	11.92	large	55.0	26.00
Siberian Husky	working	22049.0	12.58	medium	47.5	21.75
English Springer Spaniel	sporting	21946.0	12.54	medium	45.0	19.50
Kerry Blue Terrier	terrier	17240.0	9.40	medium	36.5	18.50
Brittany	sporting	22589.0	12.92	medium	35.0	19.00
Staffordshire Bull Terrier	terrier	21650.0	12.05	medium	31.0	15.00
English Cocker Spaniel	sporting	18993.0	11.66	medium	30.0	16.00
Pembroke Welsh Corgi	herding	23978.0	12.25	small	26.0	11.00
Cocker Spaniel	sporting	24330.0	12.50	small	25.0	14.50
Tibetan Terrier	non-sporting	20336.0	12.31	small	24.0	15.50
Basenji	hound	22096.0	13.58	medium	23.0	16.50
Shetland Sheepdog	herding	21006.0	12.53	small	22.0	14.50
Dandie Dinmont Terrier	terrier	21633.0	12.17	small	21.0	9.00
Scottish Terrier	terrier	17525.0	10.69	small	20.0	10.00
Pug	toy	18527.0	11.00	medium	16.0	16.00
Miniature Schnauzer	terrier	20087.0	11.81	small	15.5	13.00
Cavalier King Charles Spaniel	toy	18639.0	11.29	small	15.5	12.50
Lhasa Apso	non-sporting	22031.0	13.92	small	15.0	10.50
Cairn Terrier	terrier	21992.0	13.84	small	14.0	10.00
Shih Tzu	toy	21152.0	13.20	small	12.5	9.75
Tibetan Spaniel	non-sporting	25549.0	14.42	small	12.0	10.00

Lecture 2 – DataFrame Fundamentals¶

DSC 80, Spring 2025¶

Announcements 📣¶

Agenda¶

Question 🤔 (Answer at `lec02-dog`)

numpy arrays¶

numpy overview¶

⚠️ The dangers of for-loops¶

Multi-dimensional arrays¶

Operations along axes¶

Selecting rows and columns from 2D arrays¶

Question 🤔 (Answer at `lec02-grid`)

Example: Image processing¶

Applying a greyscale filter¶

Applying a sepia filter¶

Key takeaway: avoid for-loops whenever possible!¶

From babypandas to pandas 🐼¶

babypandas¶

pandas¶

pandas¶

pandas data structures¶

Importing pandas and related libraries¶

Example: Dog Breeds (woof!) 🐶¶

Review: head, tail, shape, index, get, and sort_values¶

Setting the index¶

Question 🤔 (Answer at `lec02-query`)

💡 Pro-Tip: Displaying more rows/columns¶

Selecting columns¶

Selecting columns in babypandas 👶🐼¶

Selecting columns with []¶

Useful Series methods¶

Selecting subsets of rows (and columns)¶

Use loc to slice rows and columns using labels¶

.loc is flexible 🧘¶

Review: Querying¶

Querying with multiple conditions¶

💡 Pro-Tip: Using .query¶

Question 🤔 (Answer at `lec02-index`)

Don't forget iloc!¶

More practice¶

Questions? 🤔

We will probably have to end lecture here.

Adding and modifying columns¶

Adding and modifying columns, using a copy¶

💡 Pro-Tip: Method chaining¶

💡 Pro-Tip: assign for column names with special characters¶

Adding and modifying columns, in-place¶

Mutability¶

⚠️ Avoid mutation when possible¶

pandas and numpy¶

pandas is built upon numpy!¶

pandas data types¶

pandas data types¶

Type conversion¶

💡 Pro-Tip: Setting dtypes in read_csv¶

Axes¶

DataFrame methods with axis¶

Exercise

Summary, next time¶

Summary¶

`numpy` arrays¶

`numpy` overview¶

⚠️ The dangers of `for`-loops¶

Key takeaway: avoid `for`-loops whenever possible!¶

From `babypandas` to `pandas` 🐼¶

`babypandas`¶

`pandas`¶

`pandas` data structures¶

Importing `pandas` and related libraries¶

Review: `head`, `tail`, `shape`, `index`, `get`, and `sort_values`¶

Selecting columns in `babypandas` 👶🐼¶

Selecting columns with `[]`¶

Use `loc` to slice rows and columns using labels¶

`.loc` is flexible 🧘¶

💡 Pro-Tip: Using `.query`¶

Don't forget `iloc`!¶

💡 Pro-Tip: `assign` for column names with special characters¶

`pandas` and `numpy`¶

`pandas` is built upon `numpy`!¶

`pandas` data types¶

`pandas` data types¶

💡 Pro-Tip: Setting `dtype`s in `read_csv`¶

DataFrame methods with `axis`¶