Created for
import re

# The string to search with the regex:
user_email = "prefix@domain.com"
# Compile a pattern that finds a literal '@' symbol:
regex = re.compile(r'@')
# search() scans the whole string and returns None when nothing matches:
if regex.search(user_email):
    print("Match!")
else:
    print("No match!")
. ^ $ * + ? { } [ ] \ | ( )
import re

phone_numbers = ['+359 88 7123 456', '+359 88 7123456']
# Match numbers with format: +359 YY YXXX XXX
regex = r'\+359\s\d{2}\s\d{4}\s\d{3}'
for number in phone_numbers:
    # fullmatch() requires the whole string to fit the format; match() would
    # only anchor the beginning and accept trailing characters.
    if re.fullmatch(regex, number):
        print("{} is a valid number format".format(number))
    else:
        print("{} is NOT IN A VALID FORMAT".format(number))
+359 88 7123 456 is a valid number format
+359 88 7123456 is NOT IN A VALID FORMAT
re module
The re module in Python provides regular expression matching operations similar to those found in Perl.
# A raw string keeps the backslash and the 'n' as two separate characters.
print(len(r'\n'))  # 2
# A raw string literal cannot end with a single backslash, so the next line is
# left commented out: uncommenting it raises a SyntaxError.
# print(len(r'\'))  # SyntaxError
import re

# The text is the 5 characters: \stop
text = '\\stop'
# Regex \\stop written as a normal string: every backslash is doubled twice.
re1 = '\\\\stop'
# Regex \stop: the regex engine reads \s as "whitespace", not a backslash.
re2 = '\\stop'
# Regex \\stop written as a raw string: only the regex-level escaping is needed.
re3 = r'\\stop'
if re.match(re1, text):
    print("re1 matched!")
if re.match(re2, text):
    # would not match, as '\' is a special character in regex and should be escaped, as well
    print("re2 matched!")
if re.match(re3, text):
    print("re3 matched!")
re.compile()
method
import re

text = "ABRACADABRA"
# re.I makes the compiled pattern case-insensitive, so 'aca' finds 'ACA'.
regex = re.compile(r'aca', re.I)
if regex.search(text):
    print('Match')
import re

# search() accepts an optional start position after the string to scan.
text = "123abc456"
rx = re.compile(r'abc')

# No start position: the whole string is scanned, so 'abc' is found.
res = rx.search(text)
# Start at index 3, exactly where the 'a' sits: still found.
res = rx.search(text, pos=3)
# Start at index 4, past the 'a': nothing is found and res is None.
res = rx.search(text, pos=4)
# match() only succeeds when the pattern matches right at the start position.
text = "123abc456"
rx = re.compile(r'abc')

# Default start is index 0, where '1' is found instead of 'abc': res is None.
res = rx.match(text)
# Starting at index 3 puts 'abc' right at the start position: res is a match.
res = rx.match(text, pos=3)
text = "123abc456abcabc"
rx = re.compile('abc')
# findall() returns every non-overlapping match as a list of strings.
res = rx.findall(text)  # ['abc', 'abc', 'abc']
# Requiring a digit in front of 'abc' skips the last 'abc', which follows a 'c'.
res = re.findall(r'\dabc', text)  # ['3abc', '6abc']
Method/Attribute | Purpose |
---|---|
group() | Return the string matched by the RE |
groups() | Return a tuple containing all the subgroups of the match |
start() | Return the starting position of the match |
end() | Return the ending position of the match |
span() | Return a tuple containing the (start, end) positions of the match |
More methods: Match Object
text = "123abc456abc"
# Raw string so that \d reaches the regex engine as written.
rx = re.compile(r'(\d+)(abc)')
res = rx.match(text)
if res:
    # group() is the whole matched text; groups() holds one item per (...) group.
    print("res.group():", res.group())  #123abc
    print("res.groups():", res.groups())  #('123', 'abc')
else:
    print("No match!")
We will discuss capturing groups on next slides
Only the following characters have special meaning in regex:
^ $ \ . * + ? ( ) [ ] { } |
They can be combined with ordinary characters to change their meaning too
If we want to match literally a special character we have to escape it with backslash '\'
import re

text = "try to match: 2+3"
# '+' is a quantifier in regex, so it is escaped to match a literal plus sign.
rx = re.compile(r'2\+3')
res = rx.search(text)
if res:
    print(res.group())
Quantifier | Description |
---|---|
r * | r match 0 or more times |
r + | r match 1 or more times |
r ? | r match 0 or 1 time |
r {n} | r match exactly n times |
r {n,m} | r match between n and m times (n, m are positive) |
r
can be any regex!
# Greedy: '.*' takes as much as it can, so the match runs to the last 'a'.
matched = re.compile(r'a.*a').search('ala bala')
print(matched)
# match='ala bala', but not 'ala'
?
'
# Non-greedy: '.*?' takes as little as it can, so the match stops at the next 'a'.
matched = re.compile(r'a.*?a').search('ala bala')
print(matched)
# match='ala'
import re
string = 'ala bala'
matched = re.findall(r'a.*a',string ) # greedy
print(matched)
#OUTPUT: ['ala bala']
matched = re.findall(r'a.*?a',string ) # non-greedy
print(matched)
#OUTPUT: ['ala', 'ala']
matched = re.findall(r'.*?',string ) # non-gready
print(matched)
#OUTPUT: ['', '', '', '', '', '', '', '', ''
import re

# Greedy {2,4}: each match takes up to 4 digits; the lone trailing '9' is too short.
matched = re.compile(r'\d{2,4}').findall('123456789')
print(matched)
# OUTPUT: ['1234', '5678']

# Non-greedy {2,4}?: each match stops at the minimum of 2 digits.
matched = re.compile(r'\d{2,4}?').findall('123456789')
print(matched)
# OUTPUT: ['12', '34', '56', '78']
[abc]
(will match 'a' or 'b' or 'c').
The hyphen (-), when it is between 2 symbols, has special meaning inside the character class - it defines a range, like [0-9]. If it is at the end, it is treated as a literal hyphen.

Character set | Description |
---|---|
[abc] | Match any one of the symbols listed ('a' or 'b' or 'c') |
[a-z] | Match any symbol, from 'a' till 'z' (i.e. any lower Latin letter) |
[^abc] | Match any symbol, except 'a' or 'b' or 'c' (i.e. the ^ negates the characters in the set) |
import re

# match any one of the vowels
matched = re.findall(r'[aeiouy]', 'astroid')
print(matched)
#OUTPUT: ['a', 'o', 'i']

# match any consecutive vowels - one or more times
matched = re.findall(r'[aeiouy]+', 'astroid')
print(matched)
#OUTPUT: ['a', 'oi']

# match bg mobile phone numbers (raw string so that \+ reaches the regex engine as written)
matched = re.findall(r'\+3598[7-9][0-9]{7}', '+359888123456')
print(matched)
#OUTPUT: ['+359888123456']

# match digit or hyphen: the trailing '-' in the set is a literal hyphen
matched = re.findall(r'[1-5-]', '12-34')
print(matched)
#OUTPUT: ['1', '2', '-', '3', '4']
import re

# A leading '^' negates the set: match every character that is not a vowel.
matched = re.compile(r'[^aeiouy]').findall('astroid')
print(matched)
# OUTPUT: ['s', 't', 'r', 'd']
Char class | Description |
---|---|
. | Match any character, except newline/line terminator. You can use the re.DOTALL/(?s) to match the new line, as well |
\w | Matches Unicode word characters; this includes most characters that can be part of a word in any language, as well as numbers and the underscore. If the ASCII flag is used, only [a-zA-Z0-9_] is matched |
\d | Matches any Unicode decimal digit, which includes [0-9], and also many other digit characters. If the ASCII flag is used, only [0-9] is matched |
\s | Matches any Unicode whitespace characters (which includes [ \t\n\r\f\v], and also many other characters). If the ASCII flag is used, only [ \t\n\r\f\v] is matched |
import re

# match bg mobile phone numbers; \d stands in for [0-9]
# (raw string so that \+ and \d reach the regex engine as written)
matched = re.findall(r'\+3598[7-9]\d{7}', '+359888123456')
print(matched)
#OUTPUT: ['+359888123456']
import re

strings = ['petrov42', '42petrov', 'ivan_pterov']
# A lowercase Latin letter followed by one or more word characters
# (raw string so that \w reaches the regex engine as written).
rx = re.compile(r'[a-z]\w+')
for string in strings:
    matched = rx.search(string)
    print("{} matched in {}".format(matched.group(), string))
#OUTPUT:
#petrov42 matched in petrov42
#petrov matched in 42petrov
#ivan_pterov matched in ivan_pterov
string = """line1
line2
line3 line4"""
# \d matches the digit after 'line'; \s matches the newline or space after it.
# 'line4' is not matched because nothing follows it.
matched = re.findall(r'line\d\s', string)
print(matched)
#OUTPUT: ['line1\n', 'line2\n', 'line3 ']
re.I|re.M
sets both the I and M flags.

In regex | As param | Description |
---|---|---|
(?i) | re.I | case-insensitive matching |
(?m) | re.M | multiline matching |
(?s) | re.S | Make the '.' to match any character at all, including a newline |
(?x) | re.X | Allows to write readable regexes by using spaces and comments('#') in the regex. More on: re.X |
import re

text = """123
ABC
456"""
# (?is) turns on two flags inside the pattern itself:
# i - 'abc' also matches 'ABC'; s - '.' also matches the newline after '123'.
rx = re.compile('(?is)123.abc')
res = rx.search(text)
if res:
    print(res.group(0))
else:
    print("No match!")
Anchor | Description |
---|---|
^ | Matches the beginning of the string (or the line, if m flag is used) |
$ | Matches the end of the string (or the line, if m flag is used) |
\b | Matches on word boundaries, i.e. between word(\w) and non-word(\W) characters. Note that the start and end of string are considered as non-word characters. |
\Z | Matches only at the end of the string. |
import re

strings = [
    '',
    'a',
    '@',
    '@a',
    'aa',
    'a!',
    'a,a',
]
# \b matches the empty position between a word and a non-word character,
# so findall() returns one empty string per boundary found.
rx = re.compile(r'\b')
for string in strings:
    res = rx.findall(string)
    print("{} word bounders counted in {}".format(len(res), string))
#OUTPUT
#0 word bounders counted in
#2 word bounders counted in a
#0 word bounders counted in @
#2 word bounders counted in @a
#2 word bounders counted in aa
#2 word bounders counted in a!
#4 word bounders counted in a,a
strings = [
    'ana',
    'ana bel',
]
# ^ and $ pin the pattern to the start and end of the string, so the whole
# string must be a single word that begins and ends with 'a'.
rx = re.compile(r'^a\w+a$')
for string in strings:
    res = rx.findall(string)
    print("{} matches in {}".format(len(res), string))
#OUTPUT:
#1 matches in ana
#0 matches in ana bel
Alternation | Description |
---|---|
r1|r2 | Matches if r1 OR r2 is matched |
/(r1|r2)r3/
=> match r1r3
OR r2r3
, but not r1r2r3
/(r1)r2/
=> match r1r2
and capture the part of the string that matched r1
(?:r1|r2)
=> match r1
or r2
but do not capture the match
import re

user = 'Ivan Ivanov: +359 887123456'
# (?x) is verbose mode: whitespace and '#' comments inside the pattern are ignored.
# Raw string so that \w, \s, \d and \+ reach the regex engine as written.
rx = re.compile(r"""(?x)
([A-Z]\w+)\s+   # capture first name
([A-Z]\w+):\s+  # capture surname
\+(\d{3})\s     # capture country code
(\d{6,9})       # capture number (up to 9 digits, so the sample is not cut short)
""")
res = rx.search(user)
if res:
    # groups() returns the captured substrings in the order the groups open.
    for i, t in enumerate(res.groups()):
        print("Capture {}: {}".format(i, t))
#OUTPUT:
#Capture 0: Ivan
#Capture 1: Ivanov
#Capture 2: 359
#Capture 3: 887123456
import re

strings = [
    'Icecream with strawberries?',
    'Icecream with blueberries?',
    'Icecream with raspberries?',
    'Icecream with strawraspberries?',
    'Icecream with berries?',
]
# (?:...) groups the alternatives without capturing them; the '?' after the
# group makes the 'straw'/'rasp' prefix optional. \b requires the word to
# start right there, which rules out 'blueberries' and 'strawraspberries'.
rx = re.compile(r'\b(?:straw|rasp)?berries')
for string in strings:
    res = rx.search(string)
    if res:
        print('{} YES!'.format(string))
    else:
        print('{} NO!'.format(string))
#OUTPUT:
#Icecream with strawberries? YES!
#Icecream with blueberries? NO!
#Icecream with raspberries? YES!
#Icecream with strawraspberries? NO!
#Icecream with berries? YES!
These slides are based on
customised version of
framework