?print-pdf
' Created for
import re
# regex that matches sequences of 8 or more word characters (letters, numbers, underscores)
rg = re.compile(r'^\w{8,}$')
# the string to be matched by regex:
password = 'abc123456'
m = rg.search(password)
if m:
print(f'{password} is valid!')
else:
print(f'{password} is not valid!')
# abc123456 is valid
. ^ $ * + ? { } [ ] \ | ( )
'\'
import re
# Match a literal '+' by escaping the metacharacter with a backslash.
text = "try to match: 2+3"
# Raw string: in a normal string literal '\+' is an invalid escape sequence,
# which Python reports with a warning (and will reject in a future version).
rx = re.compile(r'2\+3')
res = rx.search(text)
if res:
    print(res.group())
[abc] (will match 'a' or 'b' or 'c').
The hyphen (-), when it is between 2 symbols, has special meaning inside the character class: it defines a range, like [0-9]. If it is at the end, it is treated as a literal hyphen.
Character set | Description |
---|---|
[abc] | Match any one of the symbols listed ('a' or 'b' or 'c') |
[a-z] | Match any symbol, from 'a' till 'z' (i.e. any lower Latin letter) |
[0-9] | Match any digit |
[0-9-] | Match any digit or hyphen |
[^abc] | Match any symbol except 'a', 'b' or 'c' (i.e. the ^ negates the characters in the set) |
import re
# Match any vowel character
matched = re.findall(r'[aeiouy]','astroid' );
print(matched)
#OUTPUT: ['a', 'o', 'i']
# Match any non-vowel character
matched = re.findall(r'[^aeiouy]','astroid' );
print(matched)
#OUTPUT: ['s', 't', 'r', 'd']
# match any digit or hyphen:
matched = re.findall('[0-9-]', 'a2-b8');
print(matched)
#OUTPUT: ['2', '-', '8']
\d: Matches any digit character (equivalent to [0-9]).
\w: Matches any word character (equivalent to [a-zA-Z0-9_]).
\s: Matches any whitespace character (spaces, tabs, newlines, etc.).
\D: Matches any non-digit character (equivalent to [^0-9]).
\W: Matches any non-word character (equivalent to [^a-zA-Z0-9_]).
\S: Matches any non-whitespace character.
import re
# Match any digit character
text = "The price is $25.99."
pattern = re.compile(r'\d')
result = pattern.findall(text)
print(result)
# Output: ['2', '5', '9', '9']
# Match any word character
text = "The quick brown fox jumps over the lazy dog."
pattern = re.compile(r'\w+')
result = pattern.findall(text)
print(result)
# Output: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
# Match 'line' followed by digit followed by whitespace:
text = """line1
line2
line3 line4"""
pattern = re.compile(r'line\d\s')
result = pattern.findall(text)
print(result)
# Output: ['line1\n', 'line2\n', 'line3 ']
Quantifier | Description |
---|---|
r * | r match 0 or more times |
r + | r match 1 or more times |
r ? | r match 0 or 1 time |
r {n} | r match exactly n times |
r {n,m} | r match between n and m times (n, m are positive) |
r can be a character, a group, or a character class/set.
pattern = re.compile(r'ab*c')
result = pattern.search('a') # Doesn't match
result = pattern.search('ac') # Matches
result = pattern.search('abc') # Matches
result = pattern.search('abbc') # Matches
pattern = re.compile(r'ab+c')
result = pattern.search('a') # Doesn't match
result = pattern.search('ac') # Doesn't match
result = pattern.search('abc') # Matches
result = pattern.search('abbc') # Matches
pattern = re.compile(r'ab?c')
result = pattern.search('a') # Doesn't match
result = pattern.search('ac') # Matches
result = pattern.search('abc') # Matches
result = pattern.search('abbc') # Doesn't match
{n}: matches exactly n occurrences of the preceding character or group.
pattern = re.compile(r'ab{2}c')
pattern.search("abc") # Doesn't match
pattern.search("abbc") # Matches
pattern.search("abbbc") # Doesn't match
{n,}: matches n or more occurrences of the preceding character or group.
pattern = re.compile(r'ab{2,}c')
pattern.search("abc") # Doesn't match
pattern.search("abbc") # Matches
pattern.search("abbbc") # Matches
{n,m}: matches at least n and at most m occurrences of the preceding character or group.
pattern = re.compile(r'ab{2,3}c')
result = pattern.search("abc") # Doesn't match
result = pattern.search("abbc") # Matches
result = pattern.search("abbbc") # Matches
result = pattern.search("abbbbc") # Doesn't match
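Appending ? to a quantifier (*?, +?, {n,m}?) makes it non-greedy: it matches as few characters as possible.
import re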
# Greedy *
matched = re.findall(r"a.*a", "ala bala")
print(matched)
# Output: ['ala bala']
# Non-greedy *
matched = re.findall(r"a.*?a", "ala bala")
print(matched)
# Output: ['ala', 'ala']
# Greedy {n,m}
matched = re.findall(r"\d{2,4}", "123456789")
print(matched)
# Output: ['1234', '5678']
# Non-greedy {n,m}
matched = re.findall(r"\d{2,4}?", "123456789")
print(matched)
# Output: ['12', '34', '56', '78']
Anchor | Description |
---|---|
^ | Matches the beginning of the string (or the line, if m flag is used) |
$ | Matches the end of the string (or the line, if m flag is used) |
\b | Matches on word boundaries, i.e. between word(\w) and non-word(\W) characters. Note that the start and end of string are considered as non-word characters. |
\Z | Matches only at the end of the string. |
^ and $ example
import re
strings = [
'ana',
'ana bel',
]
rx = re.compile(r'^a.+a$');
for string in strings:
res = rx.findall(string)
print("{} matches in {}".format(len(res), string))
#Output:
#1 matches in ana
#0 matches in ana bel
\b example
import re
strings = [
'',
'a',
'@',
'@a',
'aa',
'a!',
'a,a',
]
rx = re.compile(r'\b');
for string in strings:
res = rx.findall(string)
print(f"{len(res)} word bounders counted in {string}")
# 0 word bounders counted in
# 2 word bounders counted in a
# 0 word bounders counted in @
# 2 word bounders counted in @a
# 2 word bounders counted in aa
# 2 word bounders counted in a!
# 4 word bounders counted in a,a
Flags are available under a long name such as re.IGNORECASE or with a short, one-letter form such as re.I.
Multiple flags can be combined with bitwise OR: re.I|re.M sets both the I and M flags.
regex_with_flags_arg = re.compile(r"[aeiouy]+",re.I)
# Sample subject string. Passing the built-in type `str` itself to re.search()
# raises TypeError, so a real string is used here.
text = "Regular Expressions"
# Flag passed as an argument to a module-level function.
m = re.search(r"[aeiouy]+", text, re.I)
# Same flag expressed inline, as a (?i) prefix inside the pattern.
regex_with_flags_prefix = re.compile(r"(?i)[aeiouy]+")
In regex | As param | Description |
---|---|---|
(?i) | re.I | case-insensitive matching |
(?m) | re.M | multiline matching |
(?s) | re.S | Makes '.' match any character at all, including a newline |
(?x) | re.X | Allows writing readable regexes by using spaces and comments ('#') in the regex. More on: re.X |
Reference: Compilation flags
import re
text = "The quick brown fox jumps over the lazy dog."
pattern = re.compile(r"the", flags=re.IGNORECASE)
result = pattern.findall(text)
# Output:
# ['The', 'the']
import re
# Example text containing multiple lines
text = """Line 1
# Line 2
Line 3"""
# Matching lines starting with "Line"
pattern = re.compile(r'^Line\s*\d', re.MULTILINE)
result = pattern.findall(text)
print(result)
# Output: ['Line 1', 'Line 3']
Alternation is written with the | operator.
import re
text = "I love cats. He love dogs."
pattern = re.compile(r"cat|dog")
result = pattern.findall(text)
print(result)
# Output: ['cat', 'dog']
import re
text = "I love cats. He love dogs."
pattern = re.compile(r"(?:cat|dog)s")
result = pattern.findall(text)
print(result)
import re
text = "Ivan Ivanov: 30 years old, Petar Petrov: 25 years old"
# Using capturing group to get names and ages
pattern = re.compile(r"(\w+ \w+): (\d+) years old")
matches = pattern.findall(text)
for match in matches:
print("Name:", match[0])
print("Age:", match[1])
/(?:r1|r2)r3/
=> match r1r3
OR r2r3
, but not r1r2r3
import re
# Example text
text = "I love strawberries and raspberries, but not blueberries."
# Non-capturing group with alternation: matches "straw" or "rasp" followed by "berries"
pattern = re.compile(r"(?:straw|rasp)berries")
result = pattern.findall(text)
print(result)
# Output: ['strawberries', 'raspberries']
re module
re module
The re module in Python provides regular expression matching operations similar to those found in Perl.
Write patterns as raw strings (r'') so that backslashes are not processed as string escapes:
print('\test') # est
print(r'\test') #\test
print('\\test') #\test
re.compile() method
import re
text = "ABRACADABRA"
regex = re.compile(r'aca', re.I)
if regex.search(text):
print('Match')
import re
text = "123abc456"
rx = re.compile('abc')
res = rx.search(text) # will match
res = rx.search(text,3) # will match, 'a' is on index 3 in text
res = rx.search(text,4) # would NOT match
text = "123abc456"
rx = re.compile('abc')
res = rx.match(text)
res = rx.match(text) # will NOT match, 'abc' is not in the beginning
res = rx.match(text,3) # will match, as matching starts from index 3
text = "123abc456abcabc"
# findall() returns every non-overlapping match as a list of strings.
rx = re.compile('abc')
res = rx.findall(text)  # ['abc', 'abc', 'abc']
# Require a digit directly before 'abc'; the last 'abc' follows 'c', so it is skipped.
rx = re.compile(r'\dabc')
res = rx.findall(text)  # ['3abc', '6abc']
A match object is returned by the search functions of the re module, like:
match = re.search(pattern, string)
if match:
process(match)
Method/Attribute | Purpose |
---|---|
group() | Return the string matched by the RE |
groups() | Return a tuple containing all the subgroups of the match |
start() | Return the starting position of the match |
end() | Return the ending position of the match |
span() | Return a tuple containing the (start, end) positions of the match |
More methods: Match Object
import re
text = "Name: Maria, Age: 30"
pattern = r"Name: (\w+), Age: (\d+)" # Capture name and age
match = re.search(pattern, text)
if match:
print("Full match:", match.group(0))
print("Name (Group 1):", match.group(1))
print("Age (Group 2):", match.group(2))
print("All groups:", match.groups())
# OUTPUT
# Full match: Name: Maria, Age: 30
# Name (Group 1): Maria
# Age (Group 2): 30
# All groups: ('Maria', '30')
import re
# Compiled once at import time; the pattern never changes.
# Literal spaces and [0-9] keep the check strict: \s would also accept tabs or
# newlines as separators, and \d would also accept non-ASCII Unicode digits.
_BG_MOBILE_RE = re.compile(r'\+359 [7-9]{2} [7-9][0-9]{3} [0-9]{3}')


def is_valid_bg_mobile_number(number):
    """Validate a Bulgarian mobile number.

    Note:
        The format of a valid Bulgarian mobile number is: +359 XX XDDD DDD,
        where X is in [7, 8, 9], and D is in [0-9]. The groups are separated
        by single spaces.

    Args:
        number (str): The mobile number to be validated.

    Returns:
        bool: True if the number is a valid Bulgarian mobile number, False otherwise.
    """
    # fullmatch() requires the whole string to match. Anchoring with '$'
    # would also accept a trailing newline, because '$' matches just before
    # a final line break.
    return _BG_MOBILE_RE.fullmatch(number) is not None
if __name__ == "__main__":
    # Demo run: the trailing comment on each sample is the expected verdict.
    phone_numbers = [
        '+359 88 7123 456',  # yes
        '+359 88 7123456',   # no
        '+359 88 1123 456',  # no
        '+359 87 9123 456',  # yes
    ]
    for number in phone_numbers:
        is_ok = is_valid_bg_mobile_number(number)
        # Number is left-aligned in an 18-character column, then the verdict.
        print(f'{number:18} #yes' if is_ok else f'{number:18} #no')
import re
def is_valid_user_name(number):
    """Validate a user name.

    Note:
        A user name must follow these rules:
          1. It consists of 3 to 10 characters inclusive.
          2. It contains only alphanumeric characters, dashes (-) and
             underscores (_).
          3. Its first character is an alphabetic character.

    Args:
        number (str): The user name to be validated. The parameter name is
            kept as-is so existing keyword callers keep working.

    Returns:
        bool: True if the name is valid, False otherwise.
    """
    # NOTE: for str patterns \w is Unicode-aware, so non-ASCII letters and
    # digits are accepted after the first character.
    rg = re.compile(r'''
        [a-zA-Z]      # rule 3: first character is a Latin letter
        [\w-]{2,9}    # rules 1 and 2: 2 to 9 more allowed characters
    ''', re.VERBOSE)
    # fullmatch() anchors the pattern at both ends of the string. Anchoring
    # with '$' would also accept a trailing newline, because '$' matches just
    # before a final line break.
    return rg.fullmatch(number) is not None
if __name__ == "__main__":
    # Demo run: the trailing comment on each sample is the expected verdict
    # and, for rejections, the rule that is broken.
    user_names = [
        "ada",          # yes
        "a__",          # yes
        "a12345",       # yes
        "a1234567890",  # no (rule 1)
        "1aaaaaaa",     # no (rule 3)
        "aaa#",         # no (rule 2)
        "a",            # no (rule 1)
    ]
    for user_name in user_names:
        is_ok = is_valid_user_name(user_name)
        # Name is left-aligned in an 18-character column, then the verdict.
        print(f'{user_name:18} #yes' if is_ok else f'{user_name:18} #no')
These slides are based on
customised version of
framework