正则表达式四种预查

正则表达式有[正反]向[肯否]定预查四种预查形式。结论是:以匹配123为例,正向预查要放在123之后(末尾)才有意义,反向预查要放在123之前(开头)才有意义。需要说明的是,预查本身是零宽断言,语法上可以出现在正则的任意位置;只是放错位置时,条件要么恒成立,要么永远无法满足。

参考手册:https://tool.oschina.net/uploads/apidocs/jquery/regexp.html

只看以上手册,比较难搞清楚使用范围和使用姿势,以下对这四种预查分别做单元测试。

1. 正向肯定

测试用例

def setUp(self) -> None:
    """Populate ``self.cases`` with the sample strings used by the tests.

    The samples are a bare '123', '123' preceded by several letter
    prefixes, '123' followed by several letter suffixes, and one string
    that contains no digits.
    """
    bare = '123'
    with_prefix = (
        'a123',
        'abc123',
        'ad123',
        'ac123',
        'ae123',
        'abcac123',
        'abcae123',
    )
    with_suffix = ('123abc', '123ad', '123ac')
    letters_only = 'abc'

    # Order is kept as: bare, prefixed, suffixed, letters only.
    self.cases = [bare, *with_prefix, *with_suffix, letters_only]

def test_forward_positive(self):
    """Print how four positive-lookahead placements behave on each sample.

    For every string in ``self.cases`` the string itself is printed, then
    one line per pattern showing the pattern and its ``re.search`` result,
    then a blank line.
    """
    patterns = (
        # Lookahead first: the text at one position would have to start with
        # 'abc'/'ad' and with '123' at the same time, so this never matches.
        '(?=abc|ad)123',
        # Lookahead last: matches '123' only when 'abc' or 'ad' follows; the
        # following text is checked but is not part of the match.
        '123(?=abc|ad)',
        # Consuming counterpart of the previous pattern: group(1) holds the
        # same '123' that the lookahead version returns as the whole match.
        '(123)(abc|ad)',
        # Lookahead in the middle: right after 'a' it demands 'bc' or 'd'
        # while the pattern itself demands '123', so this never matches.
        'a(?=bc|d)123',
    )

    for sample in self.cases:
        print(sample)
        for pattern in patterns:
            print(pattern, re.search(pattern, sample))
        print()

输出如下。可以看到只有放于最后的才有效:放于最前面的p1捕获不了abc123,放置于中间的p4捕获不了abc123、ae123。而且正确放置在最后的p2,完全等价于p3(对p3的匹配取group1),无法捕获单独的123,不实用,不如直接用p3。

123
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None

a123
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None

abc123
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None

ad123
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None

ac123
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None

ae123
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None

abcac123
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None

abcae123
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None

123abc
(?=abc|ad)123 None
123(?=abc|ad) <re.Match object; span=(0, 3), match='123'>
(123)(abc|ad) <re.Match object; span=(0, 6), match='123abc'>
a(?=bc|d)123 None

123ad
(?=abc|ad)123 None
123(?=abc|ad) <re.Match object; span=(0, 3), match='123'>
(123)(abc|ad) <re.Match object; span=(0, 5), match='123ad'>
a(?=bc|d)123 None

123ac
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None

abc
(?=abc|ad)123 None
123(?=abc|ad) None
(123)(abc|ad) None
a(?=bc|d)123 None

2. 正向否定

def test_forward_negative(self):
    """Print how four negative-lookahead placements behave on each sample.

    For every string in ``self.cases`` the string itself is printed, then
    one line per pattern showing the pattern and its ``re.search`` result,
    then a blank line.
    """
    patterns = (
        # Lookahead first: it is evaluated where '123' starts, and text that
        # starts with '123' never starts with 'abc'/'ad', so the assertion
        # always holds and the pattern behaves like a plain '123'.
        '(?!abc|ad)123',
        # Lookahead last: matches '123' unless 'abc' or 'ad' follows; the end
        # of the string also satisfies the assertion.
        '123(?!abc|ad)',
        # Lookahead in the middle: the text after 'a' must be '123', which is
        # never 'bc' or 'd', so this behaves like a plain 'a123'.
        'a(?!bc|d)123',
        # Same situation with a shorter alternative; also behaves like 'a123'.
        'a(?!b|d)123',
    )

    for sample in self.cases:
        print(sample)
        for pattern in patterns:
            print(pattern, re.search(pattern, sample))
        print()

放在最前面的p1捕获了abc123,不符合预期;放在中间的p3、p4写了等于没写;只有放在最后的p2正确匹配了,并且后面可以为空(匹配了123)。

123
(?!abc|ad)123 <re.Match object; span=(0, 3), match='123'>
123(?!abc|ad) <re.Match object; span=(0, 3), match='123'>
a(?!bc|d)123 None
a(?!b|d)123 None

a123
(?!abc|ad)123 <re.Match object; span=(1, 4), match='123'>
123(?!abc|ad) <re.Match object; span=(1, 4), match='123'>
a(?!bc|d)123 <re.Match object; span=(0, 4), match='a123'>
a(?!b|d)123 <re.Match object; span=(0, 4), match='a123'>

abc123
(?!abc|ad)123 <re.Match object; span=(3, 6), match='123'>
123(?!abc|ad) <re.Match object; span=(3, 6), match='123'>
a(?!bc|d)123 None
a(?!b|d)123 None

ad123
(?!abc|ad)123 <re.Match object; span=(2, 5), match='123'>
123(?!abc|ad) <re.Match object; span=(2, 5), match='123'>
a(?!bc|d)123 None
a(?!b|d)123 None

ac123
(?!abc|ad)123 <re.Match object; span=(2, 5), match='123'>
123(?!abc|ad) <re.Match object; span=(2, 5), match='123'>
a(?!bc|d)123 None
a(?!b|d)123 None

ae123
(?!abc|ad)123 <re.Match object; span=(2, 5), match='123'>
123(?!abc|ad) <re.Match object; span=(2, 5), match='123'>
a(?!bc|d)123 None
a(?!b|d)123 None

abcac123
(?!abc|ad)123 <re.Match object; span=(5, 8), match='123'>
123(?!abc|ad) <re.Match object; span=(5, 8), match='123'>
a(?!bc|d)123 None
a(?!b|d)123 None

abcae123
(?!abc|ad)123 <re.Match object; span=(5, 8), match='123'>
123(?!abc|ad) <re.Match object; span=(5, 8), match='123'>
a(?!bc|d)123 None
a(?!b|d)123 None

123abc
(?!abc|ad)123 <re.Match object; span=(0, 3), match='123'>
123(?!abc|ad) None
a(?!bc|d)123 None
a(?!b|d)123 None

123ad
(?!abc|ad)123 <re.Match object; span=(0, 3), match='123'>
123(?!abc|ad) None
a(?!bc|d)123 None
a(?!b|d)123 None

123ac
(?!abc|ad)123 <re.Match object; span=(0, 3), match='123'>
123(?!abc|ad) <re.Match object; span=(0, 3), match='123'>
a(?!bc|d)123 None
a(?!b|d)123 None

abc
(?!abc|ad)123 None
123(?!abc|ad) None
a(?!bc|d)123 None
a(?!b|d)123 None

3. 反向肯定

def test_backward_positive(self):
    """Print how four positive-lookbehind placements behave on each sample.

    For every string in ``self.cases`` the string itself is printed, then
    one line per pattern showing the pattern and its ``re.search`` result,
    then a blank line.
    """
    # NOTE: the single-lookbehind form '(?<=abc|ad)123' is not used here; the
    # original author recorded that it fails with
    # "re.error: look-behind requires fixed-width pattern".
    patterns = (
        # Lookbehind first: matches '123' only when it is preceded by 'abc',
        # 'ad' or 'ac'. Each lookbehind holds alternatives of one width.
        '((?<=abc)|(?<=ad|ac))123',
        # Lookbehind last: after '123' is consumed the preceding text ends in
        # '123', never in 'abc' or 'ad', so this never matches.
        '123((?<=abc)|(?<=ad))',
        # Lookbehind in the middle: after 'a' is consumed the preceding text
        # ends in 'a', never in 'bc' or 'd', so this never matches.
        'a((?<=bc)|(?<=d))123',
        # Single lookbehind last: same contradiction as the second pattern.
        '123(?<=abc)',
    )

    for sample in self.cases:
        print(sample)
        for pattern in patterns:
            print(pattern, re.search(pattern, sample))
        print()

反向的时候需要注意,如果预查多个,不能像之前那样写(?<=abc|ad),同一个预查里各分支的长度必须是相等的。若想处理多个,得写多个预查:((?<=abc)|(?<=ad|ac))。

此外,反向只能放置在最前面,放置在中间完全不符合预期,放在后面也未生效。但正确的写法p1也等价于(abc|ad|ac)123,用预查写起来还麻烦,不如直接group筛选。

123
((?<=abc)|(?<=ad|ac))123 None
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None

a123
((?<=abc)|(?<=ad|ac))123 None
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None

abc123
((?<=abc)|(?<=ad|ac))123 <re.Match object; span=(3, 6), match='123'>
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None

ad123
((?<=abc)|(?<=ad|ac))123 <re.Match object; span=(2, 5), match='123'>
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None

ac123
((?<=abc)|(?<=ad|ac))123 <re.Match object; span=(2, 5), match='123'>
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None

ae123
((?<=abc)|(?<=ad|ac))123 None
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None

abcac123
((?<=abc)|(?<=ad|ac))123 <re.Match object; span=(5, 8), match='123'>
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None

abcae123
((?<=abc)|(?<=ad|ac))123 None
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None

123abc
((?<=abc)|(?<=ad|ac))123 None
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None

123ad
((?<=abc)|(?<=ad|ac))123 None
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None

123ac
((?<=abc)|(?<=ad|ac))123 None
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None

abc
((?<=abc)|(?<=ad|ac))123 None
123((?<=abc)|(?<=ad)) None
a((?<=bc)|(?<=d))123 None
123(?<=abc) None

4. 反向否定

def test_backward_negative(self):
    """Print how six negative-lookbehind placements behave on each sample.

    For every string in ``self.cases`` the string itself is printed, then
    one line per pattern showing the pattern and its ``re.search`` result,
    then a blank line.
    """
    patterns = (
        # Alternation of two negative lookbehinds: one passing branch is
        # enough, and the preceding text cannot end in 'abc' and in 'ad'/'ac'
        # at once, so this accepts every '123'.
        '((?<!abc)|(?<!ad|ac))123',
        # Lookbehinds last: after '123' is consumed the preceding text ends
        # in '123', so both assertions always hold.
        '123((?<!abc)|(?<!ad))',
        # Lookbehinds in the middle: after 'a' is consumed the preceding text
        # ends in 'a', so the assertions always hold; behaves like 'a123'.
        'a((?<!bc)|(?<!d))123',
        # Single lookbehind last: always holds, for the same reason as the
        # second pattern.
        '123(?<!abc)',
        # Single lookbehind first: '123' that is not preceded by 'abc'.
        '(?<!abc)123',
        # Consecutive lookbehinds must all hold: '123' that is preceded by
        # none of 'abc', 'ad' and 'ac'.
        '(?<!abc)(?<!ad|ac)123',
    )

    for sample in self.cases:
        print(sample)
        for pattern in patterns:
            print(pattern, re.search(pattern, sample))
        print()

同上,只有放置在最前面的是正确的。这里需要注意的是p1和p6:p1只要不等于其一都可匹配,比如abc123的前缀不是ad|ac,于是仍被匹配,这明显不符合预期。这个时候需要取并集(排除所有列出的前缀),只需要拆开写多个预查就好,顺序无关,如p6。

123
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(0, 3), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(0, 3), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(0, 3), match='123'>
(?<!abc)123 <re.Match object; span=(0, 3), match='123'>
(?<!abc)(?<!ad|ac)123 <re.Match object; span=(0, 3), match='123'>

a123
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(1, 4), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(1, 4), match='123'>
a((?<!bc)|(?<!d))123 <re.Match object; span=(0, 4), match='a123'>
123(?<!abc) <re.Match object; span=(1, 4), match='123'>
(?<!abc)123 <re.Match object; span=(1, 4), match='123'>
(?<!abc)(?<!ad|ac)123 <re.Match object; span=(1, 4), match='123'>

abc123
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(3, 6), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(3, 6), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(3, 6), match='123'>
(?<!abc)123 None
(?<!abc)(?<!ad|ac)123 None

ad123
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(2, 5), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(2, 5), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(2, 5), match='123'>
(?<!abc)123 <re.Match object; span=(2, 5), match='123'>
(?<!abc)(?<!ad|ac)123 None

ac123
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(2, 5), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(2, 5), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(2, 5), match='123'>
(?<!abc)123 <re.Match object; span=(2, 5), match='123'>
(?<!abc)(?<!ad|ac)123 None

ae123
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(2, 5), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(2, 5), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(2, 5), match='123'>
(?<!abc)123 <re.Match object; span=(2, 5), match='123'>
(?<!abc)(?<!ad|ac)123 <re.Match object; span=(2, 5), match='123'>

abcac123
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(5, 8), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(5, 8), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(5, 8), match='123'>
(?<!abc)123 <re.Match object; span=(5, 8), match='123'>
(?<!abc)(?<!ad|ac)123 None

abcae123
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(5, 8), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(5, 8), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(5, 8), match='123'>
(?<!abc)123 <re.Match object; span=(5, 8), match='123'>
(?<!abc)(?<!ad|ac)123 <re.Match object; span=(5, 8), match='123'>

123abc
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(0, 3), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(0, 3), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(0, 3), match='123'>
(?<!abc)123 <re.Match object; span=(0, 3), match='123'>
(?<!abc)(?<!ad|ac)123 <re.Match object; span=(0, 3), match='123'>

123ad
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(0, 3), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(0, 3), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(0, 3), match='123'>
(?<!abc)123 <re.Match object; span=(0, 3), match='123'>
(?<!abc)(?<!ad|ac)123 <re.Match object; span=(0, 3), match='123'>

123ac
((?<!abc)|(?<!ad|ac))123 <re.Match object; span=(0, 3), match='123'>
123((?<!abc)|(?<!ad)) <re.Match object; span=(0, 3), match='123'>
a((?<!bc)|(?<!d))123 None
123(?<!abc) <re.Match object; span=(0, 3), match='123'>
(?<!abc)123 <re.Match object; span=(0, 3), match='123'>
(?<!abc)(?<!ad|ac)123 <re.Match object; span=(0, 3), match='123'>

abc
((?<!abc)|(?<!ad|ac))123 None
123((?<!abc)|(?<!ad)) None
a((?<!bc)|(?<!d))123 None
123(?<!abc) None
(?<!abc)123 None
(?<!abc)(?<!ad|ac)123 None

本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!