from dsc80_utils import *

# Locations of the three CSV files, all under the local `data` directory.
# NOTE(review): `Path` and `pd` are presumably supplied by the star import
# from dsc80_utils — confirm.
rest_path = Path('data') / 'restaurants.csv'
insp_path = Path('data') / 'inspections.csv'
viol_path = Path('data') / 'violations.csv'

# Load each file into its own DataFrame. These raw frames are re-read and
# trimmed down to fewer columns later in the file.
rest = pd.read_csv(rest_path)
insp = pd.read_csv(insp_path)
viol = pd.read_csv(viol_path)

# Peek at the first two rows and the column names of each raw table.
rest.head(2)

rest.columns

insp.head(2)

insp.columns

viol.head(2)

viol.columns

# Histogram of the inspection scores.
fig = px.histogram(insp['score'])
fig

# Mean score for each grade. Rows missing either the grade or the score are
# dropped before grouping; reset_index turns 'grade' back into a column.
scores = (
    insp[['grade', 'score']]
    .dropna()
    .groupby('grade')
    .mean()
    .reset_index()
)
# x= and y= are columns of scores. Convenient!
px.bar(scores, x='grade', y='score')

# Same as the above!
# NOTE(review): this only matches the px.bar output if the pandas plotting
# backend has been set to plotly (presumably done in dsc80_utils) — confirm.
scores.plot(kind='bar', x='grade', y='score')

# Your code goes here.

# pandas stores these as ints, but they're actually nominal.
rest['business_id']

# pandas stores these as strings, but they're actually numeric.
rest['opened_date']

# Five randomly chosen rows (different on every run, since no seed is given).
rest.sample(5)

# Number of inspections that received each grade.
insp['grade'].value_counts()

# Summary of insp: column dtypes and non-null counts.
insp.info()

# Are there multiple restaurants with the same address?
rest['address'].value_counts()

# Keeps all rows with duplicate addresses.
# Each address group is kept only if it contains at least two rows.
(
    rest
    .groupby('address')
    .filter(lambda df: df.shape[0] >= 2)
    .sort_values('address')
)

# Does the same thing as above!
# keep=False marks every member of a duplicated address, not just the repeats.
# NOTE(review): groupby drops rows whose key is missing by default, whereas
# duplicated() treats missing values as equal to each other, so the two
# results may differ if 'address' contains repeated NaNs — confirm.
(
    rest[rest.duplicated(subset=['address'], keep=False)]
    .sort_values('address')
)

# Look at related columns side by side.
rest[['address', 'zip']]

insp[['score', 'grade']]

def subset_rest(rest):
    """Return only the restaurant columns used in the rest of the analysis.

    The result holds 'business_id', 'name', 'address', 'zip' and
    'opened_date', in that order. A KeyError is raised if `rest` lacks any
    of them.
    """
    keep = ['business_id', 'name', 'address', 'zip', 'opened_date']
    return rest[keep]

# Re-read the restaurants file and keep only the columns chosen by
# subset_rest; this replaces the raw `rest` loaded earlier.
rest = (
    pd.read_csv(rest_path)
    .pipe(subset_rest)
)
rest

# Same as the above – but the above makes it easier to chain more .pipe calls afterwards.
# (This call's result is not assigned, so `rest` is left unchanged by it.)
subset_rest(pd.read_csv(rest_path))

def subset_insp(insp):
    """Return the inspection columns of interest, with a shorter date name.

    Keeps 'business_id', 'inspection_id', 'score', 'grade',
    'completed_date' and 'status' (in that order) and renames
    'completed_date' to 'date'.
    """
    wanted = ['business_id', 'inspection_id', 'score', 'grade', 'completed_date', 'status']
    trimmed = insp[wanted]
    return trimmed.rename(columns={'completed_date': 'date'})

# Re-read the inspections file and keep only the columns chosen by
# subset_insp; this replaces the raw `insp` loaded earlier.
insp = (
    pd.read_csv(insp_path)
    .pipe(subset_insp)
)

def subset_viol(viol):
    """Return the violation columns of interest under friendlier names.

    Keeps 'inspection_id' plus three columns that are renamed:
    'violation' -> 'kind', 'major_violation' -> 'is_major' and
    'violation_accela' -> 'violation'.
    """
    # All renames are applied in one call, so the original 'violation'
    # column becomes 'kind' while 'violation_accela' takes over its name.
    new_names = {
        'violation': 'kind',
        'major_violation': 'is_major',
        'violation_accela': 'violation',
    }
    picked = viol[['inspection_id', *new_names]]
    return picked.rename(columns=new_names)

# Re-read the violations file and keep/rename the columns chosen by
# subset_viol; this replaces the raw `viol` loaded earlier.
viol = (
    pd.read_csv(viol_path)
    .pipe(subset_viol)
)

def merge_all_restaurant_data(rest_df=None, insp_df=None, viol_df=None):
    """Left-join restaurants, inspections and violations into one DataFrame.

    Restaurants are joined to inspections on 'business_id', and the result
    is joined to violations on 'inspection_id'. Both joins are left joins,
    so every restaurant row is kept even when it has no matching inspection,
    and every inspection is kept even when it has no matching violation.

    Parameters
    ----------
    rest_df, insp_df, viol_df : DataFrame, optional
        The frames to combine. Any argument left as None falls back to the
        module-level `rest`, `insp` or `viol` respectively, which is what a
        zero-argument call uses.

    Returns
    -------
    DataFrame
        The combined table.
    """
    # Resolve the defaults at call time, so a zero-argument call always sees
    # the current module-level frames (e.g. after `insp` has been rebuilt).
    if rest_df is None:
        rest_df = rest
    if insp_df is None:
        insp_df = insp
    if viol_df is None:
        viol_df = viol
    return (
        rest_df
        .merge(insp_df, on='business_id', how='left')
        .merge(viol_df, on='inspection_id', how='left')
    )

# Build the combined table from the current rest, insp and viol frames.
df = merge_all_restaurant_data()
df

insp[['score', 'grade']]

# The proportion of values in each column that are missing.
insp.isna().mean()

# Why are there null values here?
# insp['inspection_id'] and viol['inspection_id'] don't have any null values...
# (Hint: merge_all_restaurant_data uses left joins starting from rest.)
df[df['inspection_id'].isna()]

# Look at the dtype!
insp['date']

# This magical string tells Python what format the date is in.
# For more info: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
date_format = '%Y-%m-%d'
# The parsed result is only displayed here; insp['date'] itself is not changed.
pd.to_datetime(insp['date'], format=date_format)

# Another advantage of defining functions is that we can reuse this function
# for the 'opened_date' column in `rest` if we wanted to.
def parse_dates(insp, col, date_format='%Y-%m-%d'):
    """Return a copy of `insp` with column `col` parsed into datetimes.

    Parameters
    ----------
    insp : DataFrame
        Any DataFrame containing `col`; it is not modified.
    col : str
        Name of the column holding date strings.
    date_format : str, optional
        strftime-style format of the strings in `col`. Defaults to
        '%Y-%m-%d', so existing two-argument calls behave as before.

    Returns
    -------
    DataFrame
        A new DataFrame in which `col` has been replaced by the parsed dates.
    """
    dates = pd.to_datetime(insp[col], format=date_format)
    # `col` is only known at run time, so it is passed to assign via **.
    return insp.assign(**{col: dates})

# Rebuild insp from the CSV once more, this time also parsing 'date'.
insp = (
    pd.read_csv(insp_path)
    .pipe(subset_insp)
    .pipe(parse_dates, 'date')
)

# We should also remake df, since it depends on insp.
# Note that the new insp is used to create df!
df = merge_all_restaurant_data()

# Look at the dtype now!
df['date']

# Mean score within consecutive two-week bins of 'date'.
insp.resample('2W', on='date')['score'].mean()

# Where are those numbers coming from?
# This selects one two-week window by hand: 2020-01-05 inclusive up to
# 2020-01-19 exclusive.
# NOTE(review): for weekly frequencies resample defaults to right-closed
# bins, so the inclusive/exclusive ends used here may not exactly match the
# bin above — confirm against the resample output.
insp[
    (insp['date'] >= pd.Timestamp('2020-01-05')) &
    (insp['date'] < pd.Timestamp('2020-01-19'))
]['score']

# Number of inspections in each two-week bin, plotted over time.
(insp.resample('2W', on='date')
 .size()
 .plot(title='Number of Inspections Over Time')
)

# The parsed dates; the .dt accessor below pulls individual parts out of them.
insp['date']

# Day of the month.
insp['date'].dt.day

# Day of the week as an integer, with Monday as 0 and Sunday as 6.
insp['date'].dt.dayofweek

# Count inspections per day of the week and relabel the numeric ticks 0-6
# with day names.
dow_counts = insp['date'].dt.dayofweek.value_counts()
fig = px.bar(dow_counts)
fig.update_xaxes(tickvals=np.arange(7), ticktext=['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])

# A small "wide" table: one row per year, one column per month.
wide_example = pd.DataFrame({
    'Year': [2001, 2002],
    'Jan': [10, 130],
    'Feb': [20, 200],
    'Mar': [30, 340]
}).set_index('Year')
wide_example

# Unpivot to "long" format: one row per (Year, month) pair.
# ignore_index=False keeps Year as the index instead of discarding it.
wide_example.melt(ignore_index=False)

Lecture 5 – Exploratory Data Analysis and Data Cleaning¶

DSC 80, Winter 2025¶

Announcements 📣¶

Agenda 📆¶

Dataset overview¶

San Diego food safety¶

99% Of San Diego Restaurants Earn ‘A’ Grades, Bringing Usefulness of System Into Question¶

The data¶

Question 🤔 (Answer at dsc80.com/q)

Introduction to plotly¶

plotly¶

Using plotly¶

Initial plots¶

Exploratory data analysis and feature types¶

The data science lifecycle, revisited¶

Exploratory data analysis (EDA)¶

Different feature types¶

Question 🤔 (Answer at dsc80.com/q)

Feature types vs. data types¶

Data cleaning¶

Four pillars of data cleaning¶

Data cleaning: Data quality checks¶

Data quality checks¶

Scope¶

Measurements and values¶

Relationships¶

Analysis¶

💡 Pro-Tip: Using pipe¶

Combining the restaurant data¶

Question 🤔 (Answer at dsc80.com/q)

Data cleaning: Missing values¶

Missing values¶

Data cleaning: Transformations and timestamps¶

Transformations and timestamps¶

Creating timestamps¶

Working with timestamps¶

The .dt accessor¶

Data cleaning: Modifying structure¶

Reshaping DataFrames¶

Using melt¶

Example usage of melt¶

Exploration¶

Question 🤔 (Answer at dsc80.com/q)

Example question: Can we rank restaurants by their number of violations? How about separately for each zip code?¶

Summary, next time¶

Summary¶

Next time¶

Introduction to `plotly`¶

`plotly`¶

Using `plotly`¶

💡 Pro-Tip: Using `pipe`¶

The `.dt` accessor¶

Using `melt`¶

Example usage of `melt`¶