from dsc80_utils import *

salaries = pd.read_csv('https://transcal.s3.amazonaws.com/public/export/san-diego-2022.csv')
salaries['Employee Name'] = salaries['Employee Name'].str.split().str[0] + ' Xxxx'

salaries.head()

jobtitles = salaries['Job Title']
jobtitles.head()

0                         City Attorney
1                                 Mayor
2                Assistant Police Chief
3                       Police Sergeant
4    Assistant Retirement Administrator
Name: Job Title, dtype: object

jobtitles.shape[0], jobtitles.nunique()

(12831, 611)

jobtitles.value_counts().iloc[:10]

Job Title
Police Officer Ii             1082
Police Sergeant                311
Fire Fighter Ii                306
                              ... 
Associate Engineer - Civil     254
Fire Captain                   225
Program Manager                216
Name: count, Length: 10, dtype: int64

jobtitles.isna().sum()

np.int64(0)

# Uses character class negation.
jobtitles.str.contains(r'[^A-Za-z0-9 ]', regex=True).sum()

np.int64(922)

jobtitles[jobtitles.str.contains(r'[^A-Za-z0-9 ]', regex=True)].head()

137          Park & Recreation Director
248     Associate Engineer - Mechanical
734     Associate Engineer - Electrical
882        Associate Engineer - Traffic
1045         Associate Engineer - Civil
Name: Job Title, dtype: object

# Why are we converting to lowercase?
jobtitles.str.lower().str.contains('to').sum()

np.int64(1577)

jobtitles[jobtitles.str.lower().str.contains(' to ')]

1638              Assistant To The Chief Operating Officer
2183                  Principal Assistant To City Attorney
2238                             Assistant To The Director
                               ...                        
6594     Confidential Secretary To Chief Operating Officer
6832                       Confidential Secretary To Mayor
11028                      Confidential Secretary To Mayor
Name: Job Title, Length: 10, dtype: object

jobtitles.str.lower().str.contains(r'\bto\b', regex=True).sum()

np.int64(10)

jobtitles[jobtitles.str.lower().str.contains(r'\bto\b', regex=True)]

1638              Assistant To The Chief Operating Officer
2183                  Principal Assistant To City Attorney
2238                             Assistant To The Director
                               ...                        
6594     Confidential Secretary To Chief Operating Officer
6832                       Confidential Secretary To Mayor
11028                      Confidential Secretary To Mayor
Name: Job Title, Length: 10, dtype: object

jobtitles[jobtitles.str.lower().str.contains(r'\bthe\b', regex=True)]

1638    Assistant To The Chief Operating Officer
2238                   Assistant To The Director
5609                   Assistant To The Director
6544                   Assistant To The Director
Name: Job Title, dtype: object

jobtitles[jobtitles.str.lower().str.contains(r'\bfor\b', regex=True)]

3449     Assistant For Community Outreach
6889     Assistant For Community Outreach
10810    Assistant For Community Outreach
Name: Job Title, dtype: object

jobtitles[jobtitles.str.lower().str.contains(r'\bi+v?\b', regex=True)]

5                   Police Officer Ii
10                  Police Officer Ii
48       Fire Prevention Inspector Ii
                     ...             
12822           Clerical Assistant Ii
12828               Police Officer Ii
12830                Police Officer I
Name: Job Title, Length: 6087, dtype: object

# Canonicalize the job titles so that titles differing only in case,
# punctuation, "glue" words, or roman-numeral rank compare as equal.
#
# NOTE: the glue-word pattern previously read r'\bto\b|\bthe\b\|bfor\b'. The
# misplaced backslash turned the second `|` into a literal pipe, so only "to"
# was ever removed ("the" and "for" were left in place). Outputs computed from
# `jobtitles` further down may change slightly now that all three are dropped.
jobtitles = (
    jobtitles
    .str.lower()
    # Remove the glue words "to", "the" and "for" (whole-word matches only).
    .str.replace(r'\b(?:to|the|for)\b', '', regex=True)
    # Replace anything that is not a letter, digit or space with a space.
    .str.replace(r'[^A-Za-z0-9 ]', ' ', regex=True)
    # Remove roman-numeral suffixes of the form i, ii, iii, iv.
    .str.replace(r'\bi+v?\b', '', regex=True)
    # ' +' matches 1 or more spaces; collapse each run to a single space.
    .str.replace(r' +', ' ', regex=True)
    # Removes leading/trailing spaces if present.
    .str.strip()
)

jobtitles.sample(5)

262                         police sergeant
8658                   police records clerk
6613                       plant technician
352                            fire captain
7860    collections investigator supervisor
Name: Job Title, dtype: object

(jobtitles == 'police officer').sum()

np.int64(1378)

jobtitles.str.split()

0                  [city, attorney]
1                           [mayor]
2        [assistant, police, chief]
                    ...            
12828             [police, officer]
12829          [police, dispatcher]
12830             [police, officer]
Name: Job Title, Length: 12831, dtype: object

# The .explode method gives every list element its own row; the original index label is repeated for each word.
all_words = jobtitles.str.split().explode()
all_words

0              city
0          attorney
1             mayor
            ...    
12829    dispatcher
12830        police
12830       officer
Name: Job Title, Length: 30064, dtype: object

unique_words = all_words.value_counts()
unique_words

Job Title
police          2299
officer         1608
assistant       1267
                ... 
gardener           1
governmental       1
dna                1
Name: count, Length: 332, dtype: int64

# Collect one column per word in a plain dict and build the DataFrame in a
# single step, which avoids a "DataFrame is highly fragmented" warning.
counts_dict = {}
for word in unique_words.index:
    re_pat = fr'\b{word}\b'  # \b on both sides => whole-word matches only
    occurrences = jobtitles.str.count(re_pat)
    counts_dict[word] = occurrences.astype(int).tolist()

# One row per employee, labelled by that employee's (cleaned) job title.
counts_df = pd.DataFrame(counts_dict)
counts_df = counts_df.set_index(jobtitles)

counts_df.head()

counts_df.shape

(12831, 332)

dfc = counts_df.loc['deputy fire chief'].iloc[0]
dfc

police          0
officer         0
assistant       0
               ..
gardener        0
governmental    0
dna             0
Name: deputy fire chief, Length: 332, dtype: int64

fbc = counts_df.loc['fire battalion chief'].iloc[0]
fbc

police          0
officer         0
assistant       0
               ..
gardener        0
governmental    0
dna             0
Name: fire battalion chief, Length: 332, dtype: int64

# Put the two word-count vectors side by side (one column per title), bring
# the words that actually occur to the top, and keep the first 10 of them.
both_titles = pd.concat([dfc, fbc], axis=1)
ranked_words = both_titles.sort_values(
    by=['deputy fire chief', 'fire battalion chief'],
    ascending=False,
)
# Transpose so that each job title is a row and each word is a column.
pair_counts = ranked_words.head(10).T

pair_counts

np.dot(pair_counts.iloc[0], pair_counts.iloc[1])

np.int64(2)

sentences = pd.Series([
    'I really really want global peace',
    'I must enjoy global warming',
    'We must solve climate change'
])

sentences

0    I really really want global peace
1          I must enjoy global warming
2         We must solve climate change
dtype: object

unique_words = sentences.str.split().explode().value_counts()
unique_words

I          2
really     2
global     2
          ..
solve      1
climate    1
change     1
Name: count, Length: 12, dtype: int64

# Bag-of-words counts matrix for the example sentences:
# one column per unique word, one row per sentence.
counts_dict = {}
for word in unique_words.index:
    re_pat = fr'\b{word}\b'  # whole-word matches only
    per_sentence = sentences.str.count(re_pat)
    counts_dict[word] = per_sentence.astype(int).tolist()

counts_df = pd.DataFrame(counts_dict)
counts_df = counts_df.set_index(sentences)

counts_df

counts_df

def sim_pair(s1, s2):
    """Return the cosine similarity of two vectors.

    Computed as the dot product of ``s1`` and ``s2`` divided by the product
    of their Euclidean norms.
    """
    overlap = np.dot(s1, s2)
    length_product = np.linalg.norm(s1) * np.linalg.norm(s2)
    return overlap / length_product

# Look at the documentation of the .corr method to see how this works!
counts_df.T.corr(sim_pair)

sentences

0    I really really want global peace
1          I must enjoy global warming
2         We must solve climate change
dtype: object

tf = sentences.iloc[1].count('global') / len(sentences.iloc[1].split())
tf

0.2

idf = np.log(len(sentences) / sentences.str.contains('global').sum())
idf

np.float64(0.4054651081081644)

tf * idf

np.float64(0.08109302162163289)

sentences

0    I really really want global peace
1          I must enjoy global warming
2         We must solve climate change
dtype: object

unique_words = np.unique(sentences.str.split().explode())
unique_words

array(['I', 'We', 'change', 'climate', 'enjoy', 'global', 'must', 'peace',
       'really', 'solve', 'want', 'warming'], dtype=object)

# TF-IDF of every unique word in every sentence.
tfidf_dict = {}

# These two values do not depend on the word, so name them once up front.
words_per_sentence = sentences.str.split().str.len()
n_sentences = len(sentences)

for word in unique_words:
    re_pat = fr'\b{word}\b'  # whole-word matches only
    # Term frequency: occurrences of the word / number of words in the sentence.
    tf = sentences.str.count(re_pat) / words_per_sentence
    # Inverse document frequency: log(# sentences / # sentences containing the word).
    n_containing = sentences.str.contains(re_pat).sum()
    idf = np.log(n_sentences / n_containing)
    tfidf_dict[word] = tf * idf

tfidf = pd.DataFrame(tfidf_dict)
tfidf = tfidf.set_index(sentences)

tfidf

display_df(tfidf, cols=12)

display_df(tfidf, cols=12)

tfidf.idxmax(axis=1)

I really really want global peace    really
I must enjoy global warming           enjoy
We must solve climate change             We
dtype: object

from IPython.display import YouTubeVideo
YouTubeVideo('gzcBTUvVp7M')

from pathlib import Path
sotu_txt = Path('data') / 'stateoftheunion1790-2023.txt'
sotu = sotu_txt.read_text()

len(sotu)

10577941

print(sotu[:1600])

The Project Gutenberg EBook of Complete State of the Union Addresses,
from 1790 to the Present. Speeches beginning in 2002 are from UCSB The American Presidency Project.
Speeches from 2018-2023 were manually downloaded from whitehouse.gov.

Character set encoding: UTF8

The addresses are separated by three asterisks


CONTENTS

  George Washington, State of the Union Address, January 8, 1790
  George Washington, State of the Union Address, December 8, 1790
  George Washington, State of the Union Address, October 25, 1791
  George Washington, State of the Union Address, November 6, 1792
  George Washington, State of the Union Address, December 3, 1793
  George Washington, State of the Union Address, November 19, 1794
  George Washington, State of the Union Address, December 8, 1795
  George Washington, State of the Union Address, December 7, 1796
  John Adams, State of the Union Address, November 22, 1797
  John Adams, State of the Union Address, December 8, 1798
  John Adams, State of the Union Address, December 3, 1799
  John Adams, State of the Union Address, November 11, 1800
  Thomas Jefferson, State of the Union Address, December 8, 1801
  Thomas Jefferson, State of the Union Address, December 15, 1802
  Thomas Jefferson, State of the Union Address, October 17, 1803
  Thomas Jefferson, State of the Union Address, November 8, 1804
  Thomas Jefferson, State of the Union Address, December 3, 1805
  Thomas Jefferson, State of the Union Address, December 2, 1806
  Thomas Jefferson, State of the Union Address, October 27, 1807
  Thomas Jefferson, State of the Union Address,

speeches = sotu.split('\n***\n')[1:]

len(speeches)

233

print(speeches[-1][:1000])

State of the Union Address
Joseph R. Biden Jr.  
February 7, 2023

  Mr. Speaker. Madam Vice President. Our First Lady and Second
Gentleman. Members of Congress and the Cabinet. Leaders of our
military. Mr. Chief Justice, Associate Justices, and retired Justices
of the Supreme Court. And you, my fellow Americans.
  I start tonight by congratulating the members of the 118th Congress
and the new Speaker of the House, Kevin McCarthy. Mr. Speaker, I look
forward to working together.
  I also want to congratulate the new leader of the House Democrats and
the first Black House Minority Leader in history, Hakeem Jeffries.
  Congratulations to the longest serving Senate leader in history,
Mitch McConnell.
  And congratulations to Chuck Schumer for another term as Senate
Majority Leader, this time with an even bigger majority.
  And I want to give special recognition to someone who I think will be
considered the greatest Speaker in the history of this country, Nancy
Pelosi.
  The story of Amer

import re
def extract_struct(speech):
    """Split one raw speech into its header fields and cleaned body.

    After stripping surrounding whitespace, the first three lines are taken
    as 'speech', 'president' and 'date'; everything after them is the
    'contents'. In the contents, every character other than a letter, an
    apostrophe or a space is replaced by a space, and the result is lowercased.

    Returns a dict with keys 'speech', 'president', 'date', 'contents'.
    Raises IndexError if the speech has fewer than four lines.
    """
    parts = speech.strip().split('\n', maxsplit=3)
    body = re.sub(r"[^A-Za-z' ]", ' ', parts[3]).lower()
    return {
        'speech': parts[0],
        'president': parts[1],
        'date': parts[2],
        'contents': body,
    }

speeches_df = pd.DataFrame(list(map(extract_struct, speeches)))
speeches_df

speeches_df

unique_words = speeches_df['contents'].str.split().explode().value_counts()
# Take the top 500 most common words for speed
unique_words = unique_words.iloc[:500].index
unique_words

Index(['the', 'of', 'to', 'and', 'in', 'a', 'that', 'for', 'be', 'our',
       ...
       'desire', 'call', 'submitted', 'increasing', 'months', 'point', 'trust',
       'throughout', 'set', 'object'],
      dtype='object', name='contents', length=500)

from tqdm.notebook import tqdm

# TF-IDF of each of the selected words in each speech.
tfidf_dict = {}
contents = speeches_df['contents']
tf_denom = contents.str.split().str.len()  # number of words in each speech
n_speeches = len(speeches_df)

# Wrapping the sequence with `tqdm()` displays a progress bar.
for word in tqdm(unique_words):
    # Imperfect pattern for speed: only matches the word with a space on both sides.
    re_pat = fr' {word} '
    tf = contents.str.count(re_pat) / tf_denom
    n_containing = contents.str.contains(re_pat).sum()
    idf = np.log(n_speeches / n_containing)
    tfidf_dict[word] = tf * idf

  0%|          | 0/500 [00:00<?, ?it/s]

tfidf = pd.DataFrame(tfidf_dict)
tfidf.head()

summaries = tfidf.idxmax(axis=1)
summaries

0          object
1      convention
2       provision
          ...    
230          it's
231       tonight
232          it's
Length: 233, dtype: object

def five_largest(row):
    """Return the labels of the five largest values in ``row``.

    The labels are joined with ', ' and ordered from the fifth-largest
    value to the largest.
    """
    ascending_positions = row.argsort()
    labels_by_value = row.index[ascending_positions]
    return ', '.join(labels_by_value[-5:])

# For every speech (row), pick the five words with the highest TF-IDF.
keywords = tfidf.apply(five_largest, axis=1)

# Show the keywords next to the president and date of each speech.
speech_info = speeches_df[['president', 'date']]
keywords_df = pd.concat([speech_info, keywords], axis=1)

keywords_df

display_df(keywords_df, rows=233)

# Same computation as TF-IDF, but with the log left out of the IDF term.
tfidf_nl_dict = {}
contents = speeches_df['contents']
tf_denom = contents.str.split().str.len()  # number of words in each speech
n_speeches = len(speeches_df)

for word in tqdm(unique_words):
    # Imperfect pattern for speed: only matches the word with a space on both sides.
    re_pat = fr' {word} '
    tf = contents.str.count(re_pat) / tf_denom
    n_containing = contents.str.contains(re_pat).sum()
    idf_nl = n_speeches / n_containing  # no np.log here
    tfidf_nl_dict[word] = tf * idf_nl

tfidf_nl = pd.DataFrame(tfidf_nl_dict)
tfidf_nl.head()

# Five highest-scoring words per speech under the no-log weighting.
keywords_nl = tfidf_nl.apply(five_largest, axis=1)

# Show the keywords next to the president and date of each speech.
speech_info = speeches_df[['president', 'date']]
keywords_nl_df = pd.concat([speech_info, keywords_nl], axis=1)
keywords_nl_df

(1000 / 999)

np.log(1000 / 999)

(50 / 2)

(500 / 2)

np.log(50 / 2)

np.log(500 / 2)

	Employee Name	Job Title	Base Pay	Overtime Pay	...	Year	Notes	Agency	Status
0	Mara Xxxx	City Attorney	227441.53	0.00	...	2022	NaN	San Diego	FT
1	Todd Xxxx	Mayor	227441.53	0.00	...	2022	NaN	San Diego	FT
2	Terence Xxxx	Assistant Police Chief	227224.32	0.00	...	2022	NaN	San Diego	FT
3	Esmeralda Xxxx	Police Sergeant	124604.40	162506.54	...	2022	NaN	San Diego	FT
4	Marcelle Xxxx	Assistant Retirement Administrator	279868.04	0.00	...	2022	NaN	San Diego	FT

	senior	lecturer	teaching	professor	assistant	associate
senior lecturer	1	1	0	0	0	0
assistant teaching professor	0	0	1	1	1	0
associate professor	0	0	0	1	0	1
senior assistant to the assistant professor	1	0	0	1	2	0

	big	data	science
big big big big data	4	1	0
big data science	1	1	1
science big data	1	1	1

Pair	Dot Product	Cosine Similarity
big data science and big big big big data	5	0.7001
big data science and science big data	3	1

	I	We	change	climate	...	really	solve	want	warming
I really really want global peace	0.07	0.00	0.00	0.00	...	0.37	0.00	0.18	0.00
I must enjoy global warming	0.08	0.00	0.00	0.00	...	0.00	0.00	0.00	0.22
We must solve climate change	0.00	0.22	0.22	0.22	...	0.00	0.22	0.00	0.00

	I really really want global peace	I must enjoy global warming	We must solve climate change
I really really want global peace	1.00	0.32	0.0
I must enjoy global warming	0.32	1.00	0.2
We must solve climate change	0.00	0.20	1.0

	speech	president	date	contents
0	State of the Union Address	George Washington	January 8, 1790	fellow citizens of the senate and house of re...
1	State of the Union Address	George Washington	December 8, 1790	fellow citizens of the senate and house of re...
2	State of the Union Address	George Washington	October 25, 1791	fellow citizens of the senate and house of re...
...	...	...	...	...
230	State of the Union Address	Joseph R. Biden Jr.	April 28, 2021	thank you thank you thank you good to be b...
231	State of the Union Address	Joseph R. Biden Jr.	March 1, 2022	madam speaker madam vice president and our ...
232	State of the Union Address	Joseph R. Biden Jr.	February 7, 2023	mr speaker madam vice president our firs...

	...	trust	throughout	set	object
0	...	4.29e-04	0.00e+00	0.00e+00	2.04e-03
1	...	0.00e+00	0.00e+00	0.00e+00	1.06e-03
2	...	4.06e-04	0.00e+00	3.48e-04	6.44e-04
3	...	6.70e-04	2.17e-04	0.00e+00	7.09e-04
4	...	2.38e-04	4.62e-04	0.00e+00	3.77e-04

	president	date	0
0	George Washington	January 8, 1790	your, proper, regard, ought, object
1	George Washington	December 8, 1790	case, established, object, commerce, convention
2	George Washington	October 25, 1791	community, upon, lands, proper, provision
...	...	...	...
230	Joseph R. Biden Jr.	April 28, 2021	get, americans, percent, jobs, it's
231	Joseph R. Biden Jr.	March 1, 2022	let, jobs, americans, get, tonight
232	Joseph R. Biden Jr.	February 7, 2023	down, percent, jobs, tonight, it's

	president	date	0
0	George Washington	January 8, 1790	your, proper, regard, ought, object
1	George Washington	December 8, 1790	case, established, object, commerce, convention
2	George Washington	October 25, 1791	community, upon, lands, proper, provision
3	George Washington	November 6, 1792	subject, upon, information, proper, provision
4	George Washington	December 3, 1793	having, vessels, executive, shall, ought
5	George Washington	November 19, 1794	too, army, let, ought, constitution
6	George Washington	December 8, 1795	army, prevent, object, provision, treaty
7	George Washington	December 7, 1796	republic, treaty, britain, ought, object
8	John Adams	November 22, 1797	spain, british, claims, treaty, vessels
9	John Adams	December 8, 1798	st, minister, treaty, spain, commerce
10	John Adams	December 3, 1799	civil, period, british, minister, treaty
11	John Adams	November 11, 1800	experience, protection, navy, commerce, ought
12	Thomas Jefferson	December 8, 1801	consideration, shall, object, vessels, subject
13	Thomas Jefferson	December 15, 1802	shall, debt, naval, duties, vessels
14	Thomas Jefferson	October 17, 1803	debt, vessels, sum, millions, friendly
15	Thomas Jefferson	November 8, 1804	received, convention, having, due, friendly
16	Thomas Jefferson	December 3, 1805	families, convention, sum, millions, vessels
17	Thomas Jefferson	December 2, 1806	due, consideration, millions, shall, spain
18	Thomas Jefferson	October 27, 1807	whether, army, british, vessels, shall
19	Thomas Jefferson	November 8, 1808	thus, british, millions, commerce, her
20	James Madison	November 29, 1809	cases, having, due, british, minister
21	James Madison	December 5, 1810	provisions, view, minister, commerce, british
22	James Madison	November 5, 1811	britain, provisions, commerce, minister, british
23	James Madison	November 4, 1812	nor, subject, provisions, britain, british
24	James Madison	December 7, 1813	number, having, naval, britain, british
25	James Madison	September 20, 1814	naval, vessels, britain, his, british
26	James Madison	December 5, 1815	debt, treasury, millions, establishment, sum
27	James Madison	December 3, 1816	constitution, annual, sum, treasury, british
28	James Monroe	December 12, 1817	improvement, territory, indian, millions, lands
29	James Monroe	November 16, 1818	minister, object, territory, her, spain
30	James Monroe	December 7, 1819	parties, friendly, minister, treaty, spain
31	James Monroe	November 14, 1820	amount, minister, extent, vessels, spain
32	James Monroe	December 3, 1821	powers, duties, revenue, spain, vessels
33	James Monroe	December 3, 1822	object, proper, vessels, spain, convention
34	James Monroe	December 2, 1823	th, department, object, minister, spain
35	James Monroe	December 7, 1824	spain, governments, convention, parties, object
36	John Quincy Adams	December 6, 1825	officers, commerce, condition, upon, improvement
37	John Quincy Adams	December 5, 1826	commercial, upon, vessels, british, duties
38	John Quincy Adams	December 4, 1827	lands, british, receipts, upon, th
39	John Quincy Adams	December 2, 1828	duties, revenue, upon, commercial, britain

Lecture 12 – Text Features¶

DSC 80, Spring 2025¶

Agenda 📆¶

Question 🤔 (Answer at dsc80.com/q)

Text features¶

Review: Regression and features¶

Moving forward¶

Text features¶

Example: San Diego employee salaries¶

Aside on privacy and ethics¶

Goal: Quantifying similarity¶

Exploring job titles¶

Canonicalization¶

Punctuation¶

"Glue" words¶

Roman numerals (e.g. "Ii")¶

Fixing punctuation and removing "glue" words and roman numerals¶

Bag of words 💰¶

Text similarity¶

A counts matrix¶

Creating a counts matrix¶

Bag of words¶

Cosine similarity¶

Question: What job titles are most similar to 'deputy fire chief'?¶

Counting shared words¶

Recall: The dot product¶

Cosine similarity and bag of words¶

Normalizing¶

A recipe for computing similarities¶

Example: Global warming 🌎¶

Pitfalls of the bag of words model¶

Question 🤔 (Answer at dsc80.com/q)

TF-IDF¶

The importance of words¶

Term frequency¶

Inverse document frequency¶

Intuition¶

Term frequency-inverse document frequency¶

Computing TF-IDF¶

TF-IDF of all words in all documents¶

Interpreting TF-IDFs¶

Example: State of the Union addresses 🎤¶

State of the Union addresses¶

The data¶

Finding the most important words in each speech¶

💡 Pro-Tip: Using tqdm¶

Summarizing speeches¶

Aside: What if we remove the $\log$ from $\text{idf}(t)$?¶

The role of $\log$ in $\text{idf}(t)$¶

Question 🤔 (Answer at dsc80.com/q)

Summary, next time¶

Summary¶

Next time¶

Question: What job titles are most similar to `'deputy fire chief'`?¶

💡 Pro-Tip: Using `tqdm`¶