Regular expression review

@[TOC] table of contents
These notes are based on the Runoob ("Rookie") tutorial, with some personal understanding and tidying added; they can be used to review the key points. For details, please refer to the original tutorial.

1. re.match function: re.match(pattern, string, flags=0)

re.match only matches the beginning of the string. If the beginning of the string does not match the regular expression, the matching fails and the function returns None;

re.search, by contrast, scans the entire string until a match is found.

import re
# re.match only succeeds when the pattern matches at position 0 of the string.
host = 'www.baidu.com'
print(re.match('www', host).span())  # (0, 3)
print(re.match('www', host))  # <re.Match object; span=(0, 3), match='www'>
print(re.match('com', host))  # None: 'com' is not at the start of the string

The match object's group(num=0) method returns the string matched by the entire expression;

group() can enter more than one group number at a time, in which case it will return a tuple containing the values corresponding to those groups.

groups() returns a tuple containing all group strings, from 1 to the group number contained.

line = "Cats are smarter than dogs"

# Group 1 is greedy and takes everything before " are "; group 2 is lazy and
# stops at the first space that follows it.
matchObject = re.match(
    r'(.*) are (.*?) .*',
    line,
    re.M | re.I,
)

The r prefix marks the pattern as a raw string, in which Python does not treat backslashes as escape characters.

But there is no backslash in this string, so this r is optional.

# re.match returns None when nothing matches, so deal with that case first.
if matchObject is None:
    print('no match!!')
else:
    # group() with no argument is the whole match; group(n) is the n-th group.
    print('matchObject.group():', matchObject.group())  # matchObject.group(): Cats are smarter than dogs
    print('matchObject.group(1):', matchObject.group(1))  # matchObject.group(1): Cats
    print('matchObject.group(2):', matchObject.group(2))  # matchObject.group(2): smarter
    # groups() collects every group into one tuple.
    print('matchObject.groups():', matchObject.groups())  # matchObject.groups(): ('Cats', 'smarter')

2. re.search(pattern,string,flags=0)

re.search scans the entire string and returns the first successful match.

# Unlike re.match, re.search finds the pattern anywhere in the string.
print(re.search('www', 'www.baidu.com').span())  # (0, 3)
print(re.search('www', 'www.baidu.com'))  # <re.Match object; span=(0, 3), match='www'>
print(re.search('com', 'www.baidu.com').span())  # (10, 13)
print(re.search('com', 'www.baidu.com'))  # <re.Match object; span=(10, 13), match='com'>

line = "Cats are smarter than dogs"

searchObject = re.search(r'(.*) are (.*?) .*', line, re.M | re.I)

# Guard on searchObject itself: re.search returns None when nothing matches.
if searchObject:
    print('searchObject.group():', searchObject.group())  # searchObject.group(): Cats are smarter than dogs
    print('searchObject.group(1):', searchObject.group(1))  # searchObject.group(1): Cats
    print('searchObject.group(2):', searchObject.group(2))  # searchObject.group(2): smarter
    print('searchObject.groups():', searchObject.groups())  # searchObject.groups(): ('Cats', 'smarter')
else:
    print('nothing found!!')

3. re.sub(pattern, repl, string, count=0, flags=0)

Replace matches in string

"""
pattern : Pattern string in regular.
repl : The replaced string can also be a function.
string : The original string to be found and replaced.
count : The maximum number of times to replace after pattern matching. By default, 0 means to replace all matches.
"""

import re
# Sample input: a phone number followed by a Python-style trailing comment.
phone = "2004-959-559 # This is a foreign telephone number"

Delete every non-digit character (the hyphens and the trailing comment)

# \D matches any single non-digit character; replacing each one with the
# empty string leaves only the digits of the phone number.
num = re.sub(pattern=r'\D', repl="", string=phone)
print('The phone number is:', num)  # The phone number is: 2004959559

Remove the Python-style comment from the string (note that the first approach leaves a trailing space after the phone number)

# Strip the trailing comment twice. The first pattern starts at '#', so the
# space in front of it survives; the second pattern removes that space too.
# Both passes print "The phone number is: 2004-959-559" (the first one with
# a trailing space).
for comment_pattern in (r'#.*$', r' #.*$'):
    num = re.sub(comment_pattern, "", phone)
    print('The phone number is:', num)

The repl parameter is a function

Multiply the matching number by 2

import re
def double(matched):
    """Return the number captured by the group ``name`` doubled, as a string.

    Args:
        matched: Match object handed over by ``re.sub``; it must contain a
            named group ``name`` holding decimal digits.

    Returns:
        The captured integer multiplied by 2, converted back to ``str``.
    """
    value = int(matched.group('name'))
    return str(value * 2)


s = 'A23G4HFD567'
# Raw string, so that \d reaches the regex engine instead of being read as an
# (invalid) string escape sequence.
print(re.sub(r'(?P<name>\d+)', double, s))  # A46G8HFD1134
# (?P<name>...) defines a group called "name"; the rule it must match is \d+.

4.re.compile # (my understanding: another form of matching. First compile the regular expression, and then match it through match() or search())

The compile function is used to compile regular expressions and generate a regular expression (Pattern) object,

Key point: the compiled object is then used through its match() and search() methods.

re.compile(pattern[, flags])

"""
pattern : A regular expression in string form

flags : Optional, indicating the matching mode, such as ignoring case, multiline mode, etc. the specific parameters are:

re.I ignore case
re.L Represents a special character set \w, \W, \b, \B, \s, \S Dependent on current environment
re.M Multiline mode
re.S Makes . match any character, including a newline (by default . does not match line breaks)
re.U Represents a special character set \w, \W, \b, \B, \d, \D, \s, \S Depend on Unicode Character attribute database
re.X Verbose mode: to increase readability, whitespace in the pattern is ignored and # starts a comment
"""

import re
pattern = re.compile(r'\d+')  # one or more digits
text = 'one12twothree34four'

# Pattern.match only succeeds if the match begins exactly at the start
# position (index 0 by default, otherwise the given pos).
attempts = (
    (),       # from index 0 ('o'): None, the start is not a digit
    (2, 10),  # from index 2 ('e'): None, still not a digit
    (3, 10),  # from index 3 ('1'): <re.Match object; span=(3, 5), match='12'>
)
for bounds in attempts:
    m = pattern.match(text, *bounds)
    print(m)

# Pattern.search scans forward until it finds the first match.
m = pattern.search(text)
print(m)  # <re.Match object; span=(3, 5), match='12'>
# Group 0 is the whole match, so the explicit 0 arguments could be omitted.
print(m.group(0), m.start(0), m.end(0), m.span(0))  # 12 3 5 (3, 5)

#Example 2

import re
pattern = re.compile(r'([a-z]+) ([a-z]+)', re.IGNORECASE)  # re.IGNORECASE is the long name of re.I
sentence = 'Hello World Wide Web'
m = pattern.match(sentence)
m2 = pattern.search(sentence)
# The pattern already matches at index 0, so match and search agree here.
for found in (m, m2):
    print(found)  # <re.Match object; span=(0, 11), match='Hello World'>

# Text and span of the whole match (group 0), followed by groups 1 and 2.
report = []
for index in range(3):
    report.extend((m.group(index), m.span(index)))
print(*report)  # Hello World (0, 11) Hello (0, 5) World (6, 11)
print(m.groups())  # ('Hello', 'World')
# m.group(3) would raise IndexError: the pattern defines only two groups.

5.findall

Find all substrings matched by the regular expression in the string and return a list,

If the pattern contains more than one group, a list of tuples is returned. If no match is found, an empty list is returned.

Note: match and search return only a single match, whereas findall returns all matches.

findall(string[, pos[, endpos]])

"""
string : String to match.
pos : Optional parameter that specifies the starting position of the string. The default value is 0.
endpos : Optional parameter to specify the end position of the string. The default is the length of the string.
"""

example

import re
pattern = re.compile(r'\d+')  # one or more consecutive digits

# Without pos/endpos the whole string is searched.
result1 = pattern.findall('runoob 123 google 456')
print(result1)  # ['123', '456']

# With pos=0 and endpos=10 only the first ten characters, 'run88oob12',
# are searched, so the trailing '3' of '123' is cut off.
result2 = pattern.findall('run88oob123google456', 0, 10)
print(result2)  # ['88', '12']

Multiple matching patterns, return tuple list:

import re
# Two capturing groups in the pattern: every match is reported as a
# (group 1, group 2) tuple.
key_value = re.compile(r'(\w+)=(\d+)')
result = key_value.findall('set width=20 and height=10')
print(result)  # [('width', '20'), ('height', '10')]

6.re.finditer

Similar to findall, all substrings matched by the regular expression are found in the string and returned as an iterator.

"""
re.finditer(pattern, string, flags=0)
pattern	Matching regular expressions
string	String to match.
flags	Flag bit, which is used to control the matching method of regular expressions, such as case sensitivity, multi line matching, etc. See: regular expression modifier - Optional flag
"""

import re

it = re.finditer(r"\d+", "12a32bc43jf3")
# Every item produced by the iterator is a match object; group() is its text.
# Printed one per line:
# 12
# 32
# 43
# 3
print(*(found.group() for found in it), sep="\n")

7.re.split

The split method splits the string according to the substring that can be matched and returns the list. Its use form is as follows:

"""
re.split(pattern, string[, maxsplit=0, flags=0])
parameter	describe
pattern	Matching regular expressions
string	String to match.
maxsplit	Number of separations, maxsplit=1 Separate once. The default value is 0. There is no limit on the number of times.
flags	Flag bit, which is used to control the matching method of regular expressions, such as case sensitivity, multi line matching, etc. See: regular expression modifier - Optional flag
"""

#Examples

import re
# Raw strings keep \W intact for the regex engine; in a plain string literal
# '\W' is an invalid escape sequence.
s = re.split(r'\W+', 'runoob,runoob,runoob.')
print(s)  # ['runoob', 'runoob', 'runoob', '']

# A capturing group in the pattern makes split return the separators as well.
s = re.split(r'(\W+)', ' runoob, runoob, runoob.')
print(s)  # ['', ' ', 'runoob', ', ', 'runoob', ', ', 'runoob', '.', '']

# maxsplit=1 splits at most once; it is passed by keyword because passing it
# positionally is deprecated in recent Python versions.
s = re.split(r'\W+', ' runoob, runoob, runoob.', maxsplit=1)
print(s)  # ['', 'runoob, runoob, runoob.']

# 'a*' can match the empty string, so (since Python 3.7) the text is split at
# every position even though it contains no 'a'. Older versions returned the
# string unsplit.
s = re.split('a*', 'hello world')
print(s)  # ['', 'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '']

Keywords: Front-end Back-end regex

Added by hoogeebear on Sat, 12 Feb 2022 00:01:17 +0200

Programming VIP