@[TOC] table of contents
Based on the Runoob ("Rookie") tutorial, with some personal notes and tidying added; it can be used to review the key points. For details, please refer to the original link.
1. The re.match function: re.match(pattern, string, flags=0)
re.match only matches the beginning of the string. If the beginning of the string does not match the regular expression, the matching fails and the function returns None;
whereas re.search scans the entire string until a match is found.
import re

# re.match only succeeds when the pattern matches at the start of the string.
print(re.match('www', 'www.baidu.com').span())  # (0, 3)
print(re.match('www', 'www.baidu.com'))  # <re.Match object; span=(0, 3), match='www'>
print(re.match('com', 'www.baidu.com'))  # None: 'com' is not at the start of the string
The match object method group(num=0) returns the string matched by the entire expression.
group() can take more than one group number at a time, in which case it returns a tuple containing the values of those groups.
groups() returns a tuple containing the strings of all groups, from group 1 up to the last group in the pattern.
line = "Cats are smarter than dogs"
# The leading r marks a raw string literal: backslashes are kept literally
# instead of starting escape sequences. This pattern contains no backslash,
# so the r prefix is optional here.
matchObject = re.match(r'(.*) are (.*?) .*', line, re.M | re.I)

if matchObject:
    print('matchObject.group():', matchObject.group())    # matchObject.group(): Cats are smarter than dogs
    print('matchObject.group(1):', matchObject.group(1))  # matchObject.group(1): Cats
    print('matchObject.group(2):', matchObject.group(2))  # matchObject.group(2): smarter
    print('matchObject.groups():', matchObject.groups())  # matchObject.groups(): ('Cats', 'smarter')
else:
    print('no match!!')
2. re.search(pattern,string,flags=0)
re.search scans the entire string and returns the first successful match.
# re.search scans the whole string, so the match may start at any position.
print(re.search('www', 'www.baidu.com').span())  # (0, 3)
print(re.search('www', 'www.baidu.com'))  # <re.Match object; span=(0, 3), match='www'>
print(re.search('com', 'www.baidu.com').span())  # (10, 13)
print(re.search('com', 'www.baidu.com'))  # <re.Match object; span=(10, 13), match='com'>

line = "Cats are smarter than dogs"
searchObject = re.search(r'(.*) are (.*?) .*', line, re.M | re.I)

# Test the result of this search (searchObject), not a match object left
# over from an earlier example.
if searchObject:
    print('searchObject.group():', searchObject.group())    # searchObject.group(): Cats are smarter than dogs
    print('searchObject.group(1):', searchObject.group(1))  # searchObject.group(1): Cats
    print('searchObject.group(2):', searchObject.group(2))  # searchObject.group(2): smarter
    print('searchObject.groups():', searchObject.groups())  # searchObject.groups(): ('Cats', 'smarter')
else:
    print('nothing found!!')
3. re.sub(pattern, repl, string, count=0, flags=0)
Replace matches in string
"""
pattern : The regular expression pattern string.
repl    : The replacement string; it can also be a function.
string  : The original string to search and replace in.
count   : The maximum number of replacements to make after pattern matching. The default, 0, replaces all matches.
"""
import re

phone = "2004-959-559 # This is a foreign telephone number"

# Delete every non-digit character (this also removes the '-' separators).
num = re.sub(r'\D', "", phone)
print('The phone number is:', num)  # The phone number is: 2004959559

# Remove the Python-style comment from the string.
# Note: the first pattern leaves a trailing space after the phone number;
# the second pattern also consumes the space before '#'.
num = re.sub(r'#.*$', "", phone)
print('The phone number is:', num)  # The phone number is: 2004-959-559 (followed by a space)
num = re.sub(r' #.*$', "", phone)
print('The phone number is:', num)  # The phone number is: 2004-959-559
The repl parameter is a function
Multiply the matching number by 2
import re


def double(matched):
    """Return twice the number captured by the group named 'name'.

    Args:
        matched: A match object whose pattern defines a group called
            'name' that captured a string of digits.

    Returns:
        The doubled value as a string, as re.sub requires of a
        replacement function.
    """
    value = int(matched.group('name'))
    return str(value * 2)


s = 'A23G4HFD567'
# (?P<name>...) defines a named group called 'name'; here it captures \d+
# (one or more digits). re.sub calls double() once per match.
print(re.sub(r'(?P<name>\d+)', double, s))  # A46G8HFD1134
4.re.compile # (my understanding: another form of matching. First compile the regular expression, and then match it through match() or search())
The compile function compiles a regular expression and produces a regular expression (Pattern) object,
which is then used through its match() and search() methods.
re.compile(pattern[, flags])
"""
pattern : A regular expression in string form.
flags   : Optional; selects the matching mode, such as ignoring case or multiline mode. The available flags are:
    re.I  Ignore case.
    re.L  Makes the special character classes \w, \W, \b, \B, \s, \S depend on the current locale.
    re.M  Multiline mode.
    re.S  Makes . match any character, including a newline (by default . does not match a newline).
    re.U  Makes the special character classes \w, \W, \b, \B, \d, \D, \s, \S depend on the Unicode character property database.
    re.X  Verbose mode: to improve readability, whitespace in the pattern and comments after # are ignored.
"""
import re

pattern = re.compile(r'\d+')  # matches one or more digits

m = pattern.match('one12twothree34four')
print(m)  # None: the string does not start with a digit

m = pattern.match('one12twothree34four', 2, 10)
print(m)  # None: matching starts at 'e' (index 2), which is not a digit

m = pattern.match('one12twothree34four', 3, 10)
print(m)  # <re.Match object; span=(3, 5), match='12'>: matching starts at '1' (index 3)

m = pattern.search('one12twothree34four')
print(m)  # <re.Match object; span=(3, 5), match='12'>
# Group 0 (the whole match) is the default, so the 0 arguments may be omitted.
print(m.group(0), m.start(0), m.end(0), m.span(0))  # 12 3 5 (3, 5)
#Example 2
import re

pattern = re.compile(r'([a-z]+) ([a-z]+)', re.I)  # re.I: ignore case

m = pattern.match('Hello World Wide Web')
m2 = pattern.search('Hello World Wide Web')
print(m)   # <re.Match object; span=(0, 11), match='Hello World'>
print(m2)  # <re.Match object; span=(0, 11), match='Hello World'>

# Whole match first, then each group with its span:
print(m.group(), m.span(), m.group(1), m.span(1), m.group(2), m.span(2))
# Hello World (0, 11) Hello (0, 5) World (6, 11)
print(m.groups())  # ('Hello', 'World')

# m.group(3) would raise IndexError: the pattern has no third group.
5.findall
Find all substrings matched by the regular expression in the string and return a list,
If the pattern contains multiple groups, a list of tuples is returned. If no match is found, an empty list is returned.
Note: match and search match only once, whereas findall matches all occurrences.
findall(string[, pos[, endpos]])
""" string : String to match. pos : Optional parameter that specifies the starting position of the string. The default value is 0. endpos : Optional parameter to specify the end position of the string. The default is the length of the string. """
example
import re

pattern = re.compile(r'\d+')  # find runs of digits

result1 = pattern.findall('runoob 123 google 456')
# pos=0 and endpos=10 restrict the search to the first 10 characters.
result2 = pattern.findall('run88oob123google456', 0, 10)

print(result1)  # ['123', '456']
print(result2)  # ['88', '12']
When the pattern contains multiple groups, a list of tuples is returned:
import re

# With two groups in the pattern, each match yields a (group 1, group 2) tuple.
result = re.findall(r'(\w+)=(\d+)', 'set width=20 and height=10')
print(result)  # [('width', '20'), ('height', '10')]
6.re.finditer
Similar to findall, all substrings matched by the regular expression are found in the string and returned as an iterator.
""" re.finditer(pattern, string, flags=0) pattern Matching regular expressions string String to match. flags Flag bit, which is used to control the matching method of regular expressions, such as case sensitivity, multi line matching, etc. See: regular expression modifier - Optional flag """
import re

# finditer yields one match object per match instead of building a list.
it = re.finditer(r"\d+", "12a32bc43jf3")
for match in it:
    print(match.group())
# Output:
# 12
# 32
# 43
# 3
7.re.split
The split method splits the string according to the substring that can be matched and returns the list. Its use form is as follows:
""" re.split(pattern, string[, maxsplit=0, flags=0]) parameter describe pattern Matching regular expressions string String to match. maxsplit Number of separations, maxsplit=1 Separate once. The default value is 0. There is no limit on the number of times. flags Flag bit, which is used to control the matching method of regular expressions, such as case sensitivity, multi line matching, etc. See: regular expression modifier - Optional flag """
#Examples
import re

s = re.split(r'\W+', 'runoob,runoob,runoob.')
print(s)  # ['runoob', 'runoob', 'runoob', '']

# A capturing group in the pattern keeps the separators in the result.
s = re.split(r'(\W+)', ' runoob, runoob, runoob.')
print(s)  # ['', ' ', 'runoob', ', ', 'runoob', ', ', 'runoob', '.', '']

# maxsplit=1 splits at most once; the rest of the string is returned as is.
s = re.split(r'\W+', ' runoob, runoob, runoob.', maxsplit=1)
print(s)  # ['', 'runoob, runoob, runoob.']

# 'a*' can match the empty string. Since Python 3.7, split also splits on
# empty matches, so the string is split between every character, even
# though it contains no 'a'. (Older versions left the string unsplit.)
s = re.split('a*', 'hello world')
print(s)  # ['', 'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '']