正则表达式四种预查
正则表达式有[正反]向[肯否]定预查四种预查形式。预查是零宽断言,语法上可以出现在正则的任意位置;就本文"匹配`123`并限制其前后文"的场景而言,结论是正向预查应放在`123`之后(末尾),反向预查应放在`123`之前(开头)。
参考手册:https://tool.oschina.net/uploads/apidocs/jquery/regexp.html
只看以上手册,比较难搞清楚使用范围和使用姿势,以下对这四种预查分别做单元测试。
1. 正向肯定
测试用例
def setUp(self) -> None:
    """Populate ``self.cases`` with the sample strings every test searches.

    The samples contain the literal ``123`` on its own, after various
    prefixes, before various suffixes, and one sample (``'abc'``) that does
    not contain ``123`` at all.
    """
    self.cases = [
        '123',
        'a123',
        'abc123',
        'ad123',
        'ac123',
        'ae123',
        'abcac123',
        'abcae123',
        '123abc',
        '123ad',
        '123ac',
        'abc',
    ]
def test_forward_positive(self):
    """Print how positive lookahead ``(?=...)`` behaves at different positions.

    For every sample in ``self.cases`` this prints the sample, then each
    pattern followed by its ``re.search`` result, then a blank line.
    Nothing is asserted; the output is meant to be inspected by hand.
    """
    # A lookahead is zero-width: it inspects the text starting at the current
    # position without consuming any of it.
    # p1 needs the same position to start with "abc"/"ad" AND with "123",
    # which is contradictory, so it can never match.
    p1 = r'(?=abc|ad)123'
    # p2 matches "123" only when "abc" or "ad" follows; the suffix is checked
    # but is not part of the match.
    p2 = r'123(?=abc|ad)'
    # p3 consumes the suffix; its group(1) is the same text that p2 matches.
    p3 = r'(123)(abc|ad)'
    # p4 needs "a" to be followed by "bc"/"d" AND by "123": never matches.
    p4 = r'a(?=bc|d)123'
    for c in self.cases:
        print(c)
        for pattern in (p1, p2, p3, p4):
            print(pattern, re.search(pattern, c))
        print()
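输出如下,可以看到只有放于最后的才有效:放于最前面的`p1`捕获不了`abc123`,放置于中间的`p4`捕获不了`abc123`和`ad123`。原因在于预查是零宽断言,只检查当前位置之后的文本而不消耗字符:`p1`要求同一位置既以`abc`/`ad`开头又以`123`开头,`p4`要求`a`之后既是`bc`/`d`又是`123`,二者自相矛盾,所以永远不会匹配(并非预查在语法上不能放在开头或中间)。而且正确放置在最后的`p2`,其匹配结果等价于对`p3`的匹配取group1,同样无法匹配后面没有内容的`123`,不实用,不如直接用`p3`。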
123
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None
a123
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None
abc123
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None
ad123
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None
ac123
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None
ae123
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None
abcac123
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None
abcae123
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None
123abc
(?=abc|ad)123 None
123(?=abc|ad) <re.Match object; span=(0, 3), match='123'>
(123)(abc|ad) <re.Match object; span=(0, 6), match='123abc'>
a(?=bc|d)123 None
123ad
(?=abc|ad)123 None
123(?=abc|ad) <re.Match object; span=(0, 3), match='123'>
(123)(abc|ad) <re.Match object; span=(0, 5), match='123ad'>
a(?=bc|d)123 None
123ac
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None
abc
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None
2. 正向否定
def test_forward_negative(self):
    """Print how negative lookahead ``(?!...)`` behaves at different positions.

    For every sample in ``self.cases`` this prints the sample, then each
    pattern followed by its ``re.search`` result, then a blank line.
    Nothing is asserted; the output is meant to be inspected by hand.
    """
    # A negative lookahead succeeds when the text at the current position
    # does NOT start with the given alternatives.
    # p1: the text at that position must be "123", so the assertion is always
    # true and the pattern behaves like plain "123".
    p1 = r'(?!abc|ad)123'
    # p2: "123" not followed by "abc" or "ad"; end of string also qualifies.
    p2 = r'123(?!abc|ad)'
    # p3 / p4: "a" must be followed by "123", so the assertion is always true
    # and both patterns behave like plain "a123".
    p3 = r'a(?!bc|d)123'
    p4 = r'a(?!b|d)123'
    for c in self.cases:
        print(c)
        for pattern in (p1, p2, p3, p4):
            print(pattern, re.search(pattern, c))
        print()
放在最前面的`p1`捕获了`abc123`,不符合预期;放在中间的`p3`和`p4`写了等于没写。原因是正向预查检查的是当前位置之后的文本,而这些位置之后紧跟的必然是`123`,否定条件恒成立,所以`p1`等价于`123`,`p3`、`p4`等价于`a123`。只有放在最后的`p2`正确匹配了,并且后面可以为空(匹配了`123`)。
123
(?!abc|ad)123 <re.Match object; span=(0, 3), match='123'>
123(?!abc|ad) <re.Match object; span=(0, 3), match='123'>
a(?!bc|d)123 None
a(?!b|d)123 None
a123
(?!abc|ad)123 <re.Match object; span=(1, 4), match='123'>
123(?!abc|ad) <re.Match object; span=(1, 4), match='123'>
a(?!bc|d)123 <re.Match object; span=(0, 4), match='a123'>
a(?!b|d)123 <re.Match object; span=(0, 4), match='a123'>
abc123
(?!abc|ad)123 <re.Match object; span=(3, 6), match='123'>
123(?!abc|ad) <re.Match object; span=(3, 6), match='123'>
a(?!bc|d)123 None
a(?!b|d)123 None
ad123
(?!abc|ad)123 <re.Match object; span=(2, 5), match='123'>
123(?!abc|ad) <re.Match object; span=(2, 5), match='123'>
a(?!bc|d)123 None
a(?!b|d)123 None
ac123
(?!abc|ad)123 <re.Match object; span=(2, 5), match='123'>
123(?!abc|ad) <re.Match object; span=(2, 5), match='123'>
a(?!bc|d)123 None
a(?!b|d)123 None
ae123
(?!abc|ad)123 <re.Match object; span=(2, 5), match='123'>
123(?!abc|ad) <re.Match object; span=(2, 5), match='123'>
a(?!bc|d)123 None
a(?!b|d)123 None
abcac123
(?!abc|ad)123 <re.Match object; span=(5, 8), match='123'>
123(?!abc|ad) <re.Match object; span=(5, 8), match='123'>
a(?!bc|d)123 None
a(?!b|d)123 None
abcae123
(?!abc|ad)123 <re.Match object; span=(5, 8), match='123'>
123(?!abc|ad) <re.Match object; span=(5, 8), match='123'>
a(?!bc|d)123 None
a(?!b|d)123 None
123abc
(?!abc|ad)123 <re.Match object; span=(0, 3), match='123'>
123(?!abc|ad) None
a(?!bc|d)123 None
a(?!b|d)123 None
123ad
(?!abc|ad)123 <re.Match object; span=(0, 3), match='123'>
123(?!abc|ad) None
a(?!bc|d)123 None
a(?!b|d)123 None
123ac
(?!abc|ad)123 <re.Match object; span=(0, 3), match='123'>
123(?!abc|ad) <re.Match object; span=(0, 3), match='123'>
a(?!bc|d)123 None
a(?!b|d)123 None
abc
(?!abc|ad)123 None
123(?!abc|ad) None
a(?!bc|d)123 None
a(?!b|d)123 None
3. 反向肯定
def test_backward_positive(self):
    """Print how positive lookbehind ``(?<=...)`` behaves at different positions.

    For every sample in ``self.cases`` this prints the sample, then each
    pattern followed by its ``re.search`` result, then a blank line.
    Nothing is asserted; the output is meant to be inspected by hand.
    """
    # NOTE: '(?<=abc|ad)123' raises
    # "re.error: look-behind requires fixed-width pattern", because the
    # alternatives inside one lookbehind must all have the same length.
    # Alternatives of different lengths go into separate lookbehinds.
    # p1: "123" preceded by "abc", "ad" or "ac"; the prefix is checked but is
    # not part of the match.
    p1 = r'((?<=abc)|(?<=ad|ac))123'
    # p2 / p4: the lookbehind sits after "123", so the text before it always
    # ends with "123" and can never end with "abc"/"ad": never matches.
    p2 = r'123((?<=abc)|(?<=ad))'
    # p3: the text before the lookbehind ends with "a", never "bc"/"d":
    # never matches.
    p3 = r'a((?<=bc)|(?<=d))123'
    p4 = r'123(?<=abc)'
    for c in self.cases:
        print(c)
        for pattern in (p1, p2, p3, p4):
            print(pattern, re.search(pattern, c))
        print()
反向的时候需要注意,如果预查多个,不能像之前那样写`(?<=abc|ad)`:Python的`re`模块要求同一个反向预查里各分支的长度必须相等。若想处理多个不同长度的前缀,得写多个预查`((?<=abc)|(?<=ad|ac))`。
此外,反向只能放置在最前面,放置在中间完全不符合预期,放在后面也未生效。原因是反向预查检查的是当前位置之前的文本:放在`123`或`a`之后时,前面的文本必然以`123`或`a`结尾,不可能满足条件。但正确的写法`p1`在筛选效果上也等价于`(abc|ad|ac)123`(区别只是后者的匹配结果包含前缀),用预查写起来还麻烦,不如直接group筛选。
123
((?<=abc)|(?<=ad|ac))123 None
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None
a123
((?<=abc)|(?<=ad|ac))123 None
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None
abc123
((?<=abc)|(?<=ad|ac))123 <re.Match object; span=(3, 6), match='123'>
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None
ad123
((?<=abc)|(?<=ad|ac))123 <re.Match object; span=(2, 5), match='123'>
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None
ac123
((?<=abc)|(?<=ad|ac))123 <re.Match object; span=(2, 5), match='123'>
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None
ae123
((?<=abc)|(?<=ad|ac))123 None
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None
abcac123
((?<=abc)|(?<=ad|ac))123 <re.Match object; span=(5, 8), match='123'>
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None
abcae123
((?<=abc)|(?<=ad|ac))123 None
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None
123abc
((?<=abc)|(?<=ad|ac))123 None
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None
123ad
((?<=abc)|(?<=ad|ac))123 None
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None
123ac
((?<=abc)|(?<=ad|ac))123 None
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None
abc
((?<=abc)|(?<=ad|ac))123 None
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None
4. 反向否定
def test_backward_negative(self):
    """Print how negative lookbehind ``(?<!...)`` behaves at different positions.

    For every sample in ``self.cases`` this prints the sample, then each
    pattern followed by its ``re.search`` result, then a blank line.
    Nothing is asserted; the output is meant to be inspected by hand.
    """
    # p1: the two negations are OR-ed, so one of them holding is enough. No
    # prefix ends with both "abc" and "ad"/"ac", so this always succeeds and
    # the pattern behaves like plain "123".
    p1 = r'((?<!abc)|(?<!ad|ac))123'
    # p2 / p4: the lookbehind sits after "123", so the preceding text never
    # ends with "abc"/"ad"; the assertion is always true.
    p2 = r'123((?<!abc)|(?<!ad))'
    # p3: the preceding text ends with "a", never "bc"/"d"; always true, so
    # the pattern behaves like plain "a123".
    p3 = r'a((?<!bc)|(?<!d))123'
    p4 = r'123(?<!abc)'
    # p5: "123" not preceded by "abc".
    p5 = r'(?<!abc)123'
    # p6: consecutive lookbehinds are AND-ed, so "123" must be preceded by
    # none of "abc", "ad", "ac". This is the form that excludes all of them.
    p6 = r'(?<!abc)(?<!ad|ac)123'
    for c in self.cases:
        print(c)
        for pattern in (p1, p2, p3, p4, p5, p6):
            print(pattern, re.search(pattern, c))
        print()
同上,只有放置在最前面的是正确的,这里需要注意的是`p1`和`p6`。`p1`的两个否定条件是"或"的关系,只要满足其一即可匹配,比如`abc123`的前缀不是`ad|ac`,于是仍被匹配,这明显不符合预期。这个时候需要同时满足所有否定条件(即被排除的前缀取并集),只需要拆开连续写多个预查就好,顺序无关,见`p6`。
123
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(0, 3), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(0, 3), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(0, 3), match='123'>
(?<!abc)123 <re.Match object; span=(0, 3), match='123'>
(?<!abc)(?<!ad|ac)123 <re.Match object; span=(0, 3), match='123'>
a123
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(1, 4), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(1, 4), match='123'>
a((?<!bc)|(?<!d))123 <re.Match object; span=(0, 4), match='a123'>
123(?<!abc) <re.Match object; span=(1, 4), match='123'>
(?<!abc)123 <re.Match object; span=(1, 4), match='123'>
(?<!abc)(?<!ad|ac)123 <re.Match object; span=(1, 4), match='123'>
abc123
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(3, 6), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(3, 6), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(3, 6), match='123'>
(?<!abc)123 None
(?<!abc)(?<!ad|ac)123 None
ad123
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(2, 5), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(2, 5), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(2, 5), match='123'>
(?<!abc)123 <re.Match object; span=(2, 5), match='123'>
(?<!abc)(?<!ad|ac)123 None
ac123
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(2, 5), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(2, 5), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(2, 5), match='123'>
(?<!abc)123 <re.Match object; span=(2, 5), match='123'>
(?<!abc)(?<!ad|ac)123 None
ae123
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(2, 5), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(2, 5), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(2, 5), match='123'>
(?<!abc)123 <re.Match object; span=(2, 5), match='123'>
(?<!abc)(?<!ad|ac)123 <re.Match object; span=(2, 5), match='123'>
abcac123
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(5, 8), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(5, 8), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(5, 8), match='123'>
(?<!abc)123 <re.Match object; span=(5, 8), match='123'>
(?<!abc)(?<!ad|ac)123 None
abcae123
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(5, 8), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(5, 8), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(5, 8), match='123'>
(?<!abc)123 <re.Match object; span=(5, 8), match='123'>
(?<!abc)(?<!ad|ac)123 <re.Match object; span=(5, 8), match='123'>
123abc
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(0, 3), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(0, 3), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(0, 3), match='123'>
(?<!abc)123 <re.Match object; span=(0, 3), match='123'>
(?<!abc)(?<!ad|ac)123 <re.Match object; span=(0, 3), match='123'>
123ad
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(0, 3), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(0, 3), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(0, 3), match='123'>
(?<!abc)123 <re.Match object; span=(0, 3), match='123'>
(?<!abc)(?<!ad|ac)123 <re.Match object; span=(0, 3), match='123'>
123ac
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(0, 3), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(0, 3), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(0, 3), match='123'>
(?<!abc)123 <re.Match object; span=(0, 3), match='123'>
(?<!abc)(?<!ad|ac)123 <re.Match object; span=(0, 3), match='123'>
abc
((?<!abc)|(?<!ad|ac))123 None
123((?<!abc)|(?<!ad)) None
a((?<!bc)|(?<!d))123 None
123(?<!abc) None
(?<!abc)123 None
(?<!abc)(?<!ad|ac)123 None
本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!