一种文本模式,描述在搜索文本时要匹配的一个或多个字符串
正则表达式(regular expression)描述了一种字符串匹配的模式(pattern),可以用来检查一个串是否含有某种子串、将匹配的子串替换或者从某个串中取出符合某个条件的子串等。
文本分割
^
$
.
|
?
*
+
()
[]
{}
非字母数字字符 [^a-zA-Z0-9_] \W
至多m次{,m}
+?
注:可能会由于上下文不同而有所差异
import re
re.compile(r'模式')
import re
text = "Tom is 8 years old. Mike is 34 years old."
pattern = re.compile('\d+') #数字,等同于[0-9] \d,1次或多次 +
pattern.findall(text) #实际开发中常用的写法:先编译模式,再调用模式对象的方法
re.findall('\d+', text) #这种实现也可以,实际开发常用第一种,后面的参数是指明在哪寻找
['8', '34']
['8', '34']
import re
s = '\\author:Tom'
pattern = re.compile('\\\\author') #第一种查找有转义符的方式
pattern.findall(s)
pattern = re.compile(r'\\author')#第二种查找转义符的方式
pattern.findall(s)
['\\author']
['\\author']
import re
text = 'Tom is 8 years old. Mike is 34 years old.Peter is 87 years old.'
p_name = re.compile(r'[A-Z]\w+') #字母数字字符 [a-zA-Z0-9_] \w,1次或多次 +
p_name.findall(text)
['Tom', 'Mike', 'Peter']
import re
pattern = re.compile(r'<html>')
text = '<html><head></head><body></body></html>'
pattern.match(text) #返回MatchObject对象
text2 = ' <html><head></head><body></body></html>'
pattern.match(text2, 1) #match只从起始位置开始匹配,字符串前面有空格时默认匹配不到,返回None,设置了开始匹配的位置(这里是1)后,就能顺利返回匹配对象
<re.Match object; span=(0, 6), match='<html>'>
<re.Match object; span=(1, 7), match='<html>'>
text2 = ' <html><head></head><body></body></html>'
pattern.search(text2) #不管什么位置都能查找
<re.Match object; span=(1, 7), match='<html>'>
import re
text = 'Tom is 8 years old. Mike is 34 years old.Peter is 87 years old.'
p1 = re.compile(r'\d+')
p2 = re.compile(r'[A-Z]\w+')
it = p1.finditer(text)
for m in it:
print(m) #返回每个查找到对象的起始与终止,还有对象
<re.Match object; span=(7, 8), match='8'>
<re.Match object; span=(28, 30), match='34'>
<re.Match object; span=(50, 52), match='87'>
import re
text = 'Tom is 8 years old. Jerry is 34 years old.'
pattern = re.compile(r'(\d+).*?(\d+)')
m = pattern.search(text)
m.group()
m.group(1) #方法里面的参数,0表示整体,1表示第一个对象,2表示第二个对象
'8 years old. Jerry is 34'
'8'
import re
pattern = re.compile(r'(\w+) (\w+)') #注意中间有空格
text = 'Beautiful is better than ugly.'
it = pattern.finditer(text)
for m in it:
print(m.group())
Beautiful is
better than
m.groups()
('8', '34')
m.start(1) #返回找到的第一个分组在原字符串中的起始索引下标
7
m.end(1) #返回查到的第一个对象终止的下标,在这里也就是数字8后面的空格的索引
8
span([group]):返回特定分组的起止索引元组
groupdict():以字典表形式返回分组名及结果
import re
re.search(r'ab+c', 'ababc')
re.search(r'(ab)+c', 'ababc')
<re.Match object; span=(2, 5), match='abc'>
<re.Match object; span=(0, 5), match='ababc'>
re.search(r'Center|re', 'Center')
re.search(r'Center|re', 'Centre')
re.search(r'Cent(er|re)', 'Centre')
re.search(r'Cent(er|re)', 'Center')
<re.Match object; span=(0, 6), match='Center'>
<re.Match object; span=(4, 6), match='re'>
<re.Match object; span=(0, 6), match='Centre'>
<re.Match object; span=(0, 6), match='Center'>
re.search(r'(\w+) \1', 'hello hello worlds')
import re
text = "Tom:98"
pattern = re.compile(r'(\w+):(\d+)')
m = pattern.search(text)
m.group()
m.groups()
m.group(1)
'Tom:98'
('Tom', '98')
'Tom'
(?P<name>模式)
pattern = re.compile(r'(?P<name>\w+):(?P<score>\d+)')
m = pattern.search(text)
m.group()
m.group('name')
m.group('score')
'Tom:98'
'Tom'
'98'
import re
text = 'Beautiful is better ugly.\nExplicit is better than implicit.\nSimple is better than complex.'
p = re.compile(r'\n')
p.split(text)
re.split(r'\n', text, 1)
['Beautiful is better ugly.',
'Explicit is better than implicit.',
'Simple is better than complex.']
['Beautiful is better ugly.',
'Explicit is better than implicit.\nSimple is better than complex.']
import re
ords = 'ORD000\nORD001\nORD003'
re.sub(r"\d+",'-', ords)
text = 'Beautiful is *better* ugly.'
re.sub(r'\*(.*?)\*','<strong>\g<1></strong>', text)
re.sub(r'\*(?P<html>.*?)\*','<strong>\g<html></strong>', text) #也可以使用这种方法
re.sub(r'([A-Z]+)(\d+)','\g<2>-\g<1>', ords)
'ORD-\nORD-\nORD-'
'Beautiful is <strong>better</strong> ugly.'
'Beautiful is <strong>better</strong> ugly.'
'000-ORD\n001-ORD\n003-ORD'
re.subn(r'([A-Z]+)(\d+)','\g<2>-\g<1>', ords)
('000-ORD\n001-ORD\n003-ORD', 3)
import re
text = 'Python python PYTHON'
re.search(r'python', text)
re.findall(r'python', text, re.I) #忽略大小写
['Python', 'python', 'PYTHON']
import re
re.findall(r'^<html>', '\n<html>', re.M)
['<html>']
re.findall(r'\d(.)','1\ne',re.S)
['\n']
原文:https://www.cnblogs.com/linyk/p/11486193.html