from dsc80_utils import *

# Sample message used by the phone-number examples. It contains two phone
# numbers of the form "(ddd) ddd-dddd" plus "(100)", which looks like an area
# code but is not followed by a 7-digit number.
contact = '''
Thank you for buying our expensive product!

If you have a complaint, please send it to complaints@compuserve.com or call (800) 867-5309.

If you are happy with your purchase, please call us at (800) 123-4567; we'd love to hear from you!

Due to high demand, please allow one-hundred (100) business days for a response.
'''

print(contact)

Thank you for buying our expensive product!

If you have a complaint, please send it to complaints@compuserve.com or call (800) 867-5309.

If you are happy with your purchase, please call us at (800) 123-4567; we'd love to hear from you!

Due to high demand, please allow one-hundred (100) business days for a response.

def is_possibly_area_code(s):
    '''Does `s` look like (678)?

    Returns True only when `s` is exactly five characters long: an opening
    parenthesis, three decimal digits, and a closing parenthesis.
    Returns False for every other string, including the empty string.
    '''
    # The length check rejects strings such as '(1234)' or '(123abc)', whose
    # first three inner characters are digits but which are too long.
    # `isdecimal` (unlike `isnumeric`) rejects characters such as a superscript
    # two or a vulgar fraction, which are numeric but not decimal digits.
    return (
        len(s) == 5
        and s.startswith('(')
        and s.endswith(')')
        and s[1:4].isdecimal()
    )

is_possibly_area_code('(123)')

True

is_possibly_area_code('(99)')

False

def is_last_7_phone_number(s):
    '''Check whether `s` has the shape 999-8212.

    That is: three numeric characters, one hyphen, then four numeric
    characters.

    Raises ValueError if `s` does not contain exactly one hyphen, because
    the split result is unpacked into exactly two names.
    '''
    exchange, subscriber = s.split('-')
    # Cheap length checks first; the outcome is the same in any order.
    if len(exchange) != 3 or len(subscriber) != 4:
        return False
    return exchange.isnumeric() and subscriber.isnumeric()

is_last_7_phone_number('999-8212')

True

is_last_7_phone_number('534 1100')

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[12], line 1
----> 1 is_last_7_phone_number('534 1100')

Cell In[8], line 3, in is_last_7_phone_number(s)
      1 def is_last_7_phone_number(s):
      2     '''Does `s` look like 999-8212?'''
----> 3     s1, s2 = s.split('-')
      4     return s1.isnumeric() and s2.isnumeric() and len(s1)==3 and len(s2)==4

ValueError: not enough values to unpack (expected 2, got 1)

# Removes punctuation from the end of each string.
pieces = [s.rstrip('.,?;"\'') for s in contact.split()]

# Walk over adjacent pairs of pieces: a phone number is an area code
# immediately followed by a 7-digit number.
# NOTE: is_last_7_phone_number raises ValueError when the piece after an
# area-code lookalike has no hyphen (here, '(100)' followed by 'business').
for first, second in zip(pieces, pieces[1:]):
    if is_possibly_area_code(first) and is_last_7_phone_number(second):
        print(first, second)

(800) 867-5309
(800) 123-4567

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[13], line 6
      4 for i in range(len(pieces) - 1):
      5     if is_possibly_area_code(pieces[i]):
----> 6         if is_last_7_phone_number(pieces[i+1]):
      7             print(pieces[i], pieces[i+1])

Cell In[8], line 3, in is_last_7_phone_number(s)
      1 def is_last_7_phone_number(s):
      2     '''Does `s` look like 999-8212?'''
----> 3     s1, s2 = s.split('-')
      4     return s1.isnumeric() and s2.isnumeric() and len(s1)==3 and len(s2)==4

ValueError: not enough values to unpack (expected 2, got 1)

print(contact)

Thank you for buying our expensive product!

If you have a complaint, please send it to complaints@compuserve.com or call (800) 867-5309.

If you are happy with your purchase, please call us at (800) 123-4567; we'd love to hear from you!

Due to high demand, please allow one-hundred (100) business days for a response.

import re
re.findall(r'\(\d{3}\) \d{3}-\d{4}', contact)

['(800) 867-5309', '(800) 123-4567']

import re

re.search('AB*A', 
          'here is a string for you: ABBBA. here is another: ABBBBBBBA')

<re.Match object; span=(26, 31), match='ABBBA'>

re.findall('AB*A', 
           'here is a string for you: ABBBA. here is another: ABBBBBBBA')

['ABBBA', 'ABBBBBBBA']

re.sub('AB*A', 
       'billy', 
       'here is a string for you: ABBBA. here is another: ABBBBBBBA')

'here is a string for you: billy. here is another: billy'

re.findall('\bcat\b', 'my cat is hungry')

[]

re.findall(r'\bcat\b', 'my cat is hungry')

['cat']

# Huh? In a normal (non-raw) string literal, '\b' is the backspace character,
# not a backslash followed by the letter b -- so the pattern above never
# contained a word-boundary token. Printing the string shows the effect.
print('\bcat\b')

ca

re.findall(r'\w+@(\w+)\.edu', 
           'my old email was billy@notucsd.edu, my new email is notbilly@ucsd.edu')

['notucsd', 'ucsd']

re.findall(r'\w+@\w+\.edu', 
           'my old email was billy@notucsd.edu, my new email is notbilly@ucsd.edu')

['billy@notucsd.edu', 'notbilly@ucsd.edu']

# A regex that matches strings with two of the same vowel followed by 3 digits
# We only want to capture the digits, but...
re.findall(r'(aa|ee|ii|oo|uu)(\d{3})', 'eeoo124')

[('oo', '124')]

# Sample log line to parse; the bracketed part holds the timestamp.
s = '''132.249.20.188 - - [24/Feb/2023:12:26:15 -0800] "GET /my/home/ HTTP/1.1" 200 2585'''

# Six capture groups for the bracketed timestamp, split on '/' and ':'
# (day, month, year, hour, minute, second for the line above).
# Every group is `.+`, so any non-empty text between the delimiters matches.
exp = r'\[(.+)\/(.+)\/(.+):(.+):(.+):(.+) .+\]'
re.findall(exp, s)

[('24', 'Feb', '2023', '12', '26', '15')]

other_s = '[adr/jduy/wffsdffs:r4s4:4wsgdfd:asdf 7]'
re.findall(exp, other_s)

[('adr', 'jduy', 'wffsdffs', 'r4s4', '4wsgdfd', 'asdf')]

s

# Stricter version of the timestamp pattern: each group is pinned to a
# specific shape (two digits, one uppercase + two lowercase letters, four
# digits, then three two-digit fields), and the text before the closing
# bracket must be a hyphen followed by four digits.
# NOTE(review): an offset written with '+' (e.g. +0100) would not match --
# confirm whether that is intended.
new_exp = r'\[(\d{2})\/([A-Z]{1}[a-z]{2})\/(\d{4}):(\d{2}):(\d{2}):(\d{2}) -\d{4}\]'
re.findall(new_exp, s)

other_s

re.findall(new_exp, other_s)

operation	order of op.	example	matches ✅	does not match ❌
concatenation	3	`AABAAB`	`'AABAAB'`	every other string
or	4	`AA|BAAB`	`'AA'`, `'BAAB'`	every other string
closure (zero or more)	2	`AB*A`	`'AA'`, `'ABBBBBBA'`	`'AB'`, `'ABABA'`
parentheses	1	`A(A|B)AAB` `(AB)*A`	`'AAAAB'`, `'ABAAB'` `'A'`, `'ABABABABA'`	every other string `'AA'`, `'ABBA'`

operation	example	matches ✅	does not match ❌
wildcard	`.U.U.U.`	`'CUMULUS'` `'JUGULUM'`	`'SUCCUBUS'` `'TUMULTUOUS'`
character class	`[A-Za-z][a-z]*`	`'word'` `'Capitalized'`	`'camelCase'` `'4illegal'`
at least one	`bi(ll)+y`	`'billy'` `'billlllly'`	`'biy'` `'bily'`
between $i$ and $j$ occurrences	`m[aeiou]{1,2}m`	`'mem'` `'maam'` `'miem'`	`'mm'` `'mooom'` `'meme'`

operation	example	matches ✅	does not match ❌
escape character	`ucsd\.edu`	`'ucsd.edu'`	`'ucsd!edu'`
beginning of line	`^ark`	`'ark two'` `'ark o ark'`	`'dark'`
end of line	`ark$`	`'dark'` `'ark o ark'`	`'ark two'`
zero or one	`cat?`	`'ca'` `'cat'`	`'cart'` (matches `'ca'` only)
built-in character classes*	`\w+` `\d+`	`'billy'` `'231231'`	`'this person'` `'858 people'`
character class negation	`[^a-z]+`	`'KINGTRITON551'` `'1721$$'`	`'porch'` `'billy.edu'`

Lecture 11 – Regular Expressions¶

DSC 80, Spring 2025¶

Agenda 📆¶

Motivation¶

Who called? 📞¶

Is there a better way?¶

🤯

Basic regular expressions¶

Regular expressions¶

Writing regular expressions¶

Literals¶

Regex building blocks 🧱¶

Question 🤔 (Answer at dsc80.com/q)

Question 🤔 (Answer at dsc80.com/q)

Intermediate regex¶

More regex syntax¶

Question 🤔 (Answer at dsc80.com/q)

Question 🤔 (Answer at dsc80.com/q)

Even more regex syntax¶

Question 🤔 (Answer at dsc80.com/q)

Regex in Python¶

`re` in Python¶

Raw strings¶

Capture groups¶

Example: Log parsing¶

The more specific, the better!¶

Question 🤔 (Answer at dsc80.com/q)

Limitations of regular expressions¶

Summary, next time¶

Summary¶

Next time¶

Lecture 11 – Regular Expressions¶

DSC 80, Spring 2025¶

Agenda 📆¶

Motivation¶

Who called? 📞¶

Is there a better way?¶

🤯

Basic regular expressions¶

Regular expressions¶

Writing regular expressions¶

Literals¶

Regex building blocks 🧱¶

Question 🤔 (Answer at dsc80.com/q)

Question 🤔 (Answer at dsc80.com/q)

Intermediate regex¶

More regex syntax¶

Question 🤔 (Answer at dsc80.com/q)

Question 🤔 (Answer at dsc80.com/q)

Even more regex syntax¶

Question 🤔 (Answer at dsc80.com/q)

Regex in Python¶

re in Python¶

Raw strings¶

Capture groups¶

Example: Log parsing¶

The more specific, the better!¶

Question 🤔 (Answer at dsc80.com/q)

Limitations of regular expressions¶

Summary, next time¶

Summary¶

Next time¶

`re` in Python¶