多个通配符表达式的最短字符串匹配

3

我有多个类似的通配符表达式:

*a*b*
*c*d*
*e*?*

where

  • *表示0或多个字母(它们可以是任何字母,不一定相同)
  • ?表示任何单个字母的出现

我需要找到与通配符模式匹配的最短字符串。例如,在上面的示例中,其中一个字符串将是:

abced

另一个例子:

?a*b
a*b*
*a??a*

结果将会是:
aa?ab -> meaning "aaaab" OR "aabab" OR ...

我想我需要在这里使用动态规划。尝试了一些方法,但只得到了部分结果。有什么想法吗?


2
*表示0或多个任意字母” - 这有点含糊不清 - 它必须是0或多个相同的字母吗? - kaya3
6
第一件需要考虑的事情是将通配符表达式转换为DFA,然后取DFAs的笛卡尔积,并使用BFS从起始节点到接受节点找到最短路径。DFA的构建比一般的正则表达式简单,因为没有“联合”,所以不需要通过NFA来进行。由于它是隐式图,因此可以在不显式构造笛卡尔积的情况下进行BFS。 - kaya3
4
@kaya3所说的没错,但A比BFS更好用,而且由于存在像a?a这样的模式,你必须从NFAs开始,或者至少做一些比贪婪匹配更多的工作。 - Matt Timmermans
@MattTimmermans 是的,那是一个好的反例。直觉上,禁止联合应该比正则表达式更容易解决问题,但我猜想,我的猜测是可以通过某种直接构造DFA的方式来实现。我还没有清晰的想法如何做到这一点。 - kaya3
1
是的,这个问题通过归约例如最大独立集合,是NP难问题,所以无论如何在最坏情况下都是指数级别的。 - Matt Timmermans
显示剩余13条评论
5个回答

4

对于较小的示例可以运行,但是对于较大的示例需要太长时间。可能是因为我使用数组计算转换。

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Queue;
import java.util.Set;

public class Safe {
  public static void main(String[] args) {
    String[] patterns = new String[3];
    patterns[0] = "?a*b";
    patterns[1] = "a*b*";
    patterns[2] = "*a??a*";
    String sol = new Safe().solution(patterns);

    System.out.println(sol);
    if(!sol.equals("aabab")){
      throw new RuntimeException("Not Equal!");
    }

    patterns = new String[3];
    patterns[0] = "*a*b*";
    patterns[1] = "*c*d*";
    patterns[2] = "*e*f*";
    System.out.println(new Safe().solution(patterns));

    patterns = new String[3];
    patterns[0] = "*p?qp?*bd*pd?q*";
    patterns[1] = "*qp*d?b*?p?d*";
    patterns[2] = "?*d?b???q*q?p*";
    sol = new Safe().solution(patterns);

    System.out.println(sol);
    if(!sol.equals("p?qpd?bdpdqqppd")){
      throw new RuntimeException("Not Equal!");
    }

    patterns = new String[2];
    patterns[0] = "*z*y*y*z*x*x*x*z*x*z*y*z*y*x*x*y*y*y*z*x*y*z*y*x*x*x*z*x*z*z*z*y*y*z*x*y*z*";
    patterns[1] = "*y*z*z*x*x*y*z*y*z*y*x*z*y*z*y*x*z*y*x*y*x*y*y*z*x*y*z*x*x*z*y*z*y*y*x*z*y*";

    sol = new Safe().solution(patterns);

    System.out.println(sol);
    if(!sol.equals("yzyyzxxyzyxzyxzyzyxzyxyxyyzxyzyxxxzyxzzzyyxzxyz")){
      throw new RuntimeException("Not Equal!");
    }
  }

  private boolean isItLetter(char c) {
    return c != '*' && c != '?';
  }

  private char match(String[] patterns, int[] indices) {
    boolean firstThrown = true;
    try {
      char matched = '*';
      for (int i = 0; i < patterns.length; i++) {
        char c = patterns[i].charAt(indices[i]);
        firstThrown = false;
        if (isItLetter(c)) {
          if (isItLetter(matched)) {
            if (matched != c) {
              //two different letters
              //so we cannot continue
              return Character.MIN_VALUE;
            }
          }
          matched = c;
        } else {
          //* or ?
          if (!isItLetter(matched)) {
            //so we do not overwrite ? with *
            if (c == '?') {
              matched = c;
            }
          }
        }
      }
      return matched;
    } catch (StringIndexOutOfBoundsException e) {
      //means we tried matching end of string
      //check if all are at the end
      if (firstThrown) {
        //check only if first thrown;
        //otherwise, one of patterns is not at the end
        for (int i = 1; i < patterns.length; i++) {
          if (indices[i] < patterns[i].length()) {
            return Character.MIN_VALUE;
          }
        }
      } else {
        return Character.MIN_VALUE;
      }
      //max when we reached the end
      return Character.MAX_VALUE;
    }
  }

  class Node{
    final String recognized;
    final int[] indices;

    public Node(String recognized, int[] indices) {
      this.recognized = recognized;
      this.indices = indices;
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) return true;
      if (o == null || getClass() != o.getClass()) return false;
      Node node = (Node) o;
      return recognized.equals(node.recognized) &&
          Arrays.equals(indices, node.indices);
    }

    @Override
    public int hashCode() {
      int result = Objects.hash(recognized);
      result = 31 * result + Arrays.hashCode(indices);
      return result;
    }
  }

  Queue<Node> queue = new ArrayDeque<>();
  Set<Node> knownStates = new HashSet<>();

  public String solution(String[] patterns){
    List<int[]> startingIndices = generateStartingIndices(patterns, new int[patterns.length]);

    for(int[] si : startingIndices){
      Node nextNode = new Node("", si);
      queue.offer(nextNode);
    }

    String result = null;
    while (result == null) {
      result = solution(patterns, queue.poll());
    }
    return result;
  }

  private List<int[]> generateStartingIndices(String[] patterns, int[] indices){
    List<int[]> generated = Collections.singletonList(new int[patterns.length]);
    for(int i=0; i<patterns.length; i++){
      char currentChar = patterns[i].charAt(indices[i]);
      List<int[]> replaceGenerated = new ArrayList<>();
      if(currentChar == '*'){
        for(int[] gi : generated){
          gi[i] = indices[i]+1;
          replaceGenerated.add(gi);
        }
        for(int[] gi : generated){
          int[] gin = gi.clone();
          gin[i] = indices[i];
          replaceGenerated.add(gin);
        }
      }
      else{
        for(int[] gi : generated){
          gi[i] = indices[i];
          replaceGenerated.add(gi);
        }
      }
      generated = replaceGenerated;
    }
    return generated;
  }

  private List<int[]> generateNextIndices(String[] patterns, int[] indices){
    List<int[]> generated = Collections.singletonList(new int[patterns.length]);
    for(int i=0; i<patterns.length; i++){
      char currentChar = patterns[i].charAt(indices[i]);
      char nextChar = Character.MIN_VALUE;
      if(indices[i]+1 < patterns[i].length()){
        nextChar = patterns[i].charAt(indices[i]+1);
      }
      List<int[]> replaceGenerated = new ArrayList<>();
      if(currentChar == '*'){
        //or next index
        for(int[] gi : generated){
          gi[i] = indices[i]+1; //short it first so we first test that case
          replaceGenerated.add(gi);
        }
        //same index or
        for(int[] gi : generated){
          int[] gin = gi.clone();
          gin[i] = indices[i];
          replaceGenerated.add(gin);
        }
      }
      else{
        //some character
        if(nextChar=='*'){
          //or if * we can skip
          for(int[] gi : generated){
            gi[i] = indices[i]+2; //skip first so we check that shorter case
            replaceGenerated.add(gi);
          }
        }
        //we can go next or
        for(int[] gi : generated){
          int[] gin = gi.clone();
          gin[i] = indices[i]+1;
          replaceGenerated.add(gin);
        }
      }
      generated = replaceGenerated;
    }
    return generated;
  }

  public String solution(String[] patterns, Node node) {
    char matched = match(patterns, node.indices);

    if (matched == Character.MAX_VALUE) {
      //we reached the end
      return node.recognized;
    }
    if (matched == Character.MIN_VALUE) {
      //impossible to match
      return null;
    }
    if(matched == '*'){
      //all stars
      return null;
    }

    List<int[]> nextIndices = generateNextIndices(patterns, node.indices);

    for(int[] ni : nextIndices){
        Node nextNode = new Node(node.recognized + matched, ni);
        if(notKnownState(nextNode)) {
          queue.offer(nextNode);
        }
    }

    return null;
  }

  private boolean notKnownState(Node node) {
    if(knownStates.contains(node)){
      return false;
    }
    knownStates.add(node);
    return true;
  }
}

3
这里有一种不使用DFA的方法。
我考虑了仅测试给定长度k的字符串是否满足所有模式,并返回其中一个的问题。这是一个稍微简单的问题,但我们可以通过逐步增加k来查找最短的满足每个模式的字符串,直到找到解决方案(或者直到k超过最短解决方案的长度上限)。
我的解决方案无法在10-20秒内解决更大的示例,但也许这个想法仍然有用。我在Python中使用了Z3定理证明器,但也有许多其他语言的绑定。
基本上,这个想法是定义表示字符串中字母的k个正式变量,然后构建一个巨大的逻辑表达式来编码模式,并使用Z3求解器搜索解决方案(或验证不存在解决方案)。
有一些额外的技巧可以提高效率。我们可以通过简化模式来开始;最好在*之前有?以减少分支因子,并且两个或更多*的任何序列都是冗余的。我们还可以根据模式中非*符号计算出解决方案的最小可能长度。
原则上,相同的想法可以用于测试长度<=k的字符串,并优化最短的字符串,方法是引入$符号到字母表中,添加约束条件,即$不能出现在除$以外的任何符号之前,然后解决最大化$符号数量的优化问题。我没有尝试过这个;还有其他一些简单的改进可以做,但我试图保持它作为概念证明的简单性。
我的实现如下。影响性能的主要问题是,逻辑表达式的大小与给定模式中*的数量呈指数关系。
from z3 import *

def simplify_pattern(p):
    q = ''
    while p != q:
        q = p
        p = p.replace('*?', '?*').replace('**', '*')
    return p

def min_length(p):
    return len(p.replace('*', ''))

def solve(patterns, max_length):
    patterns = [simplify_pattern(p) for p in patterns]
    m = max(map(min_length, patterns))
    for k in range(m, max_length+1):
        s = solve_for_length(patterns, k)
        if s is not None:
            return s
    return None

def solve_for_length(patterns, k):
    alphabet = sorted(set(''.join(patterns)) - set('?*'))
    vs = [Int('v' + str(i)) for i in range(k)]
    constraints = [And(0 <= v, v < len(alphabet)) for v in vs]
    for p in patterns:
        cs = constraints_for_pattern(p, vs, alphabet)
        constraints.extend(cs)

    s = Solver()
    s.add(constraints)
    if s.check() == sat:
        m = s.model()
        idx = [m.eval(v).as_long() for v in vs]
        return ''.join(alphabet[i] for i in idx)
    else:
        return None

def constraints_for_pattern(pattern, vs, alphabet):
    while pattern and pattern[0] != '*':
        if not vs:
            # output string shorter than pattern
            yield False
            return

        if pattern[0] != '?':
            yield vs[0] == alphabet.index(pattern[0])
        pattern = pattern[1:]
        vs = vs[1:]

    if pattern == '*':
        pass
    elif pattern:
        # pattern starts with '*' but != '*'
        p = pattern[1:]
        yield Or(*[
            And(*constraints_for_pattern(p, vs[i:], alphabet))
            for i in range(len(vs) - min_length(p) + 1)
        ])
    elif vs:
        # output string longer than pattern
        yield False

示例:

>>> solve(['a?', '?b'], 2)
'ab'
>>> solve(['a*', '*b'], 2)
'ab'
>>> solve(['a*', '?b*', '*c'], 3)
'abc'
>>> solve(['a*a*a*', '*b*b*b'], 5) is None
True
>>> solve(['a*a*a*', '*b*b*b'], 6)
'ababab'
>>> solve(['*a*b*', '*c*d*', '*e*?*'], 10)
'eabcd'
>>> solve(['?a*b', 'a*b*', '*a??a*'], 10)
'aaaab'
>>> mini_5 = [
...     "*i*o*?*?*u?*??*e?o*e*a*a*i*ee*",
...     "*e*?ue*o*i*?*e*u*i*?*oa?*??*",
...     "*?oi*i??uu*a*iu*",
...     "*o*e*ea?*eu*?e*",
...     "*u*?oe*u*e?*e?*",
... ]
>>> solve(mini_5, 33) # ~20 seconds on my machine
'ioiiueuueaoeiueuiaoaiee'

一个问题 - 如何推断出 ?a*b, a*b*, *a??a* 中的最小长度是 5,因为如果我只计算非星号字符,它们分别是 324,而交集问题的最小长度是 5? 我猜如果我知道最小长度,就可以在我的例子中使用DFS和回溯了? - Bojan Vukasovic
1
min_length 将会给出 4;这里的 max_length 参数是 5。求解器将首先尝试 4,找不到解决方案,然后尝试 5。min_length 只是一个较弱的下限。 - kaya3
好的。所以实际上是从参数开始查找算法的起点。 - Bojan Vukasovic

0

有一个正式的算法可以得到最优解。

首先将每个正则表达式转换为确定性有限状态自动机(DFA)。例如,您可以使用 Thompson's construction获得NFA,然后使用 subset construction获得DFA。但是还有其他更直接的方法。

实际上,这些不是带有选择的完整正则表达式,因此构建DFA尤其简单。

对于这组DFA,构建交叉积DFA的“交集”版本。这是一个众所周知的算法。您最终会得到一个DFA,它接受所有原始字符串的精确匹配。

然后从该DFA的起始状态开始进行广度优先搜索,以查找最短的接受字符串(或验证是否没有)。

运行时间将与从原始正则表达式构建的DFA大小的乘积成比例。对于实际上几乎所有时间出现的正则表达式而言,它们在正则表达式大小方面是线性的。但在奇怪的病态情况下,它们可能会呈指数级增长。这就是最优性的代价。

虽然这听起来有些棘手,但这些算法并不复杂。如果正则表达式的数量不太大,并且它们不是奇怪的病态情况,那么这种方法将非常实用。但是,这不是你一个下午就能完成的事情。


谢谢。根据您的回答,我成功地进行了NFA->DFA->逐个状态交集,并进行了BFS。但是对于我列表中的最后一个示例,这种方法似乎仍然太慢(永远不会结束)。 - Bojan Vukasovic
@BojanVukasovic,你能分享一下列表吗? - גלעד ברקן
这是我的上一个回答中提到的字符串数组,其中包含95个模式。 - Bojan Vukasovic

0

好的,我甚至成功创建了NFA,然后将其转换为DFA,并构建所有DFA的交集,同时使用BFS访问它们。对于前三个测试代码样本,它运行得很快,但是最后两个似乎对算法处理过于复杂。如果我使用A*,我可以得到一些解决方案,但问题在于它不是最短的!

import java.util.*;
import java.util.stream.Collectors;

public class Safe4 {
    public static void main(String[] args) {
        String[] patterns = new String[3];
        patterns[0] = "?a*b";
        patterns[1] = "a*b*";
        patterns[2] = "*a??a*";

        //aaaab
        System.out.println(new Safe4().solution(patterns));

        patterns = new String[3];
        patterns[0] = "*p?qp?*bd*pd?q*";
        patterns[1] = "*qp*d?b*?p?d*";
        patterns[2] = "?*d?b???q*q?p*";

        //ppqpdpbdpdqqppd
        System.out.println(new Safe4().solution(patterns));


        patterns = new String[2];
        patterns[0] = "*z*y*y*z*x*x*x*z*x*z*y*z*y*x*x*y*y*y*z*x*y*z*y*x*x*x*z*x*z*z*z*y*y*z*x*y*z*";
        patterns[1] = "*y*z*z*x*x*y*z*y*z*y*x*z*y*z*y*x*z*y*x*y*x*y*y*z*x*y*z*x*x*z*y*z*y*y*x*z*y*";

        //yzyyzxxxyzyzyxzyzyxzyxyxyyzxyzyxxxzxyzzzyyxzxyz
        System.out.println(new Safe4().solution(patterns));

        patterns = new String[5];
        patterns[0] = "*i*o*?*?*u?*??*e?o*e*a*a*i*ee*";
        patterns[1] = "*e*?ue*o*i*?*e*u*i*?*oa?*??*";
        patterns[2] = "*?oi*i??uu*a*iu*?*?a*u*ia*u*";
        patterns[3] = "*o*e*ea?*eu*?e?*ea*u*u?u*iu*";
        patterns[4] = "*u*?oe*u*e?*e??ou?*oa*?*i*?a?*";

        //eoiueoeaiueuueeeaouuiueoauiaaiuee
        System.out.println(new Safe4().solution(patterns));

        patterns = new String[95];
        patterns[0] = "??????????????????????????";
        patterns[1] = "*a*n*";
        patterns[2] = "*i*x*";
        patterns[3] = "*q*v*";
        patterns[4] = "*u*l*";
        patterns[5] = "*p*n*";
        patterns[6] = "*j*c*";
        patterns[7] = "*j*h*";
        patterns[8] = "*q*h*";
        patterns[9] = "*p*w*";
        patterns[10] = "*p*v*";
        patterns[11] = "*r*s*";
        patterns[12] = "*j*x*";
        patterns[13] = "*i*j*";
        patterns[14] = "*t*h*";
        patterns[15] = "*u*x*";
        patterns[16] = "*i*l*";
        patterns[17] = "*k*s*";
        patterns[18] = "*u*j*";
        patterns[19] = "*p*o*";
        patterns[20] = "*p*r*";
        patterns[21] = "*a*z*";
        patterns[22] = "*t*f*";
        patterns[23] = "*b*c*";
        patterns[24] = "*e*g*";
        patterns[25] = "*l*z*";
        patterns[26] = "*d*c*";
        patterns[27] = "*u*g*";
        patterns[28] = "*l*g*";
        patterns[29] = "*r*z*";
        patterns[30] = "*y*b*";
        patterns[31] = "*g*c*";
        patterns[32] = "*t*h*";
        patterns[33] = "*w*f*";
        patterns[34] = "*i*e*";
        patterns[35] = "*d*g*";
        patterns[36] = "*r*m*";
        patterns[37] = "*e*y*";
        patterns[38] = "*q*n*";
        patterns[39] = "*p*n*";
        patterns[40] = "*y*a*";
        patterns[41] = "*q*n*";
        patterns[42] = "*l*j*";
        patterns[43] = "*n*v*";
        patterns[44] = "*p*b*";
        patterns[45] = "*h*m*";
        patterns[46] = "*r*b*";
        patterns[47] = "*p*i*";
        patterns[48] = "*u*b*";
        patterns[49] = "*e*z*";
        patterns[50] = "*d*b*";
        patterns[51] = "*p*a*";
        patterns[52] = "*s*d*";
        patterns[53] = "*d*z*";
        patterns[54] = "*k*x*";
        patterns[55] = "*o*f*";
        patterns[56] = "*s*g*";
        patterns[57] = "*o*l*";
        patterns[58] = "*t*g*";
        patterns[59] = "*p*v*";
        patterns[60] = "*j*z*";
        patterns[61] = "*y*n*";
        patterns[62] = "*o*b*";
        patterns[63] = "*k*g*";
        patterns[64] = "*i*d*";
        patterns[65] = "*c*v*";
        patterns[66] = "*q*m*";
        patterns[67] = "*e*k*";
        patterns[68] = "*w*j*";
        patterns[69] = "*i*f*";
        patterns[70] = "*i*t*";
        patterns[71] = "*i*b*";
        patterns[72] = "*i*k*";
        patterns[73] = "*p*w*";
        patterns[74] = "*a*x*";
        patterns[75] = "*p*z*";
        patterns[76] = "*r*v*";
        patterns[77] = "*y*c*";
        patterns[78] = "*i*b*";
        patterns[79] = "*n*v*";
        patterns[80] = "*g*v*";
        patterns[81] = "*u*k*";
        patterns[82] = "*w*i*";
        patterns[83] = "*e*f*";
        patterns[84] = "*l*s*";
        patterns[85] = "*t*v*";
        patterns[86] = "*y*d*";
        patterns[87] = "*p*e*";
        patterns[88] = "*h*b*";
        patterns[89] = "*s*f*";
        patterns[90] = "*o*x*";
        patterns[91] = "*i*z*";
        patterns[92] = "*q*e*";
        patterns[93] = "*r*c*";
        patterns[94] = "*k*h*";

        //pqowieurytlaksjdhfgmznxbcv
        System.out.println(new Safe4().solution(patterns));
    }

    class NFA {
        Map<Integer, Map<Character, Set<Integer>>> table;
        int initState;
        int lastState;

        public NFA(Map<Integer, Map<Character, Set<Integer>>> table, int initState, int lastState) {
            this.table = table;
            this.initState = initState;
            this.lastState = lastState;
        }
    }

    /**
     * Create NFA from input; e.g. if input is "?a*b" - here we have alphabet of "a", "b", * - any character zero or more times (represented
     * in function as 'X') and ? (means also any character but has to be there once at least - 'X')
     * <p>
     * so for input "?a*b" this gives table
     * _STATE_|__a___b___X______________
     * 0   |  1   1   1
     * 1   |  2
     * 2   |  2  2,3  2
     * 3   |
     * <p>
     * here, 3 is FINAL state and 0 is start state
     */
    private NFA createNFA(Set<Character> chars, String p) {
        int state = 0;
        boolean wasStar = false;
        Map<Integer, Map<Character, Set<Integer>>> nfa = new HashMap<>();
        for (char c : p.toCharArray()) {
            if (c == '*') {
                Map<Character, Set<Integer>> m1 = nfa.computeIfAbsent(state, k -> new HashMap<>());
                for (char c1 : chars) {
                    m1.computeIfAbsent(c1, k -> new TreeSet<>()).add(state);
                }
                m1.computeIfAbsent('X', k -> new TreeSet<>()).add(state);
                wasStar = true;
                state--;
            } else if (c == '?') {
                if (wasStar) {
                    Map<Character, Set<Integer>> m1 = nfa.get(state);
                    for (char c1 : chars) {
                        m1.computeIfAbsent(c1, k -> new TreeSet<>()).add(state);
                    }
                    m1.computeIfAbsent('X', k -> new TreeSet<>()).add(state);
                }
                Map<Character, Set<Integer>> m1 = nfa.computeIfAbsent(state, k -> new HashMap<>());
                for (char c1 : chars) {
                    m1.computeIfAbsent(c1, k -> new TreeSet<>()).add(state + 1);
                }
                m1.computeIfAbsent('X', k -> new TreeSet<>()).add(state + 1);
                wasStar = false;
            } else {
                if (wasStar) {
                    nfa.get(state).get(c).add(state);
                }
                Map<Character, Set<Integer>> m1 = nfa.computeIfAbsent(state, k -> new HashMap<>());
                m1.computeIfAbsent(c, k -> new TreeSet<>()).add(state + 1);
                wasStar = false;
            }
            state++;
        }
        return new NFA(nfa, 0, state);
    }

    class DFA {
        Map<Integer, Map<Character, Integer>> table;
        int initState;
        Set<Integer> lastStates;

        public DFA(Map<Integer, Map<Character, Integer>> table, int initState, Set<Integer> lastStates) {
            this.table = table;
            this.initState = initState;
            this.lastStates = lastStates;
        }
    }

    /**
     * Convert NFA to DFA (nondeterministic finite automata to deterministic)
     * <p>
     * e.g. for NFA
     * <p>
     * _STATE_|__a___b___X______________
     * 0   |  1   1   1
     * 1   |  2
     * 2   |  2  2,3  2
     * 3   |
     * <p>
     * we convert to DFA
     * <p>
     * _STATE_|__a___b___X______________
     * 0   |  1   1   1
     * 1   |  2
     * 2   |  2   4   2
     * 4   |  2   4   2
     * <p>
     * here, 4 is FINAL state and 0 is start state
     */
    private DFA convertToDFA(Set<Character> chars, NFA nfa) {
        Set<Character> charsWithX = new TreeSet<>(chars);
        charsWithX.add('X');
        Map<Integer, Map<Character, Integer>> dfa = new HashMap<>();
        Queue<Integer> sq = new LinkedList<>();
        int newUsableState = nfa.lastState + 1;
        Set<Integer> lastStates = new TreeSet<>();
        Map<Integer, Set<Integer>> newStateToStates = new HashMap<>();
        Map<Set<Integer>, Integer> statesToNewState = new HashMap<>();
        sq.offer(0);
        Integer currS;
        while ((currS = sq.poll()) != null) {
            Map<Character, Integer> m1 = dfa.computeIfAbsent(currS, k -> new HashMap<>());
            if (currS <= nfa.lastState) {
                //state exists in nfa
                if (nfa.table.get(currS) != null) {
                    for (Map.Entry<Character, Set<Integer>> charToStates : nfa.table.get(currS).entrySet()) {
                        Integer newState;
                        if (charToStates.getValue().size() == 1) {
                            newState = charToStates.getValue().stream().findAny().get();
                            if (newState == nfa.lastState) {
                                lastStates.add(newState);
                            }
                        } else {
                            newState = newUsableState++;
                            newStateToStates.putIfAbsent(newState, charToStates.getValue());
                            statesToNewState.putIfAbsent(charToStates.getValue(), newState);
                            if (charToStates.getValue().contains(nfa.lastState)) {
                                lastStates.add(newState);
                            }
                        }
                        m1.putIfAbsent(charToStates.getKey(), newState);
                        if (!dfa.containsKey(newState) && !sq.contains(newState)) {
                            sq.offer(newState);
                        }
                    }
                } else {
                    //this is final state
                }
            } else {
                //new state
                Set<Integer> li = newStateToStates.get(currS);
                for (Character c : charsWithX) {
                    Set<Integer> states = new TreeSet<>();
                    for (Integer i : li) {
                        Map<Character, Set<Integer>> maybeFinal = nfa.table.get(i);
                        if (maybeFinal != null) {
                            Set<Integer> exisStat = maybeFinal.get(c);
                            if (exisStat != null) {
                                states.addAll(exisStat);
                            }
                        }
                    }
                    if (!states.isEmpty()) {
                        Integer newState;
                        if (states.size() == 1) {
                            newState = states.stream().findAny().get();
                            if (newState == nfa.lastState) {
                                lastStates.add(newState);
                            }
                        } else {
                            newState = statesToNewState.get(states);
                            if (newState == null) {
                                newState = newUsableState++;
                                newStateToStates.putIfAbsent(newState, states);
                                statesToNewState.putIfAbsent(states, newState);
                                if (states.contains(nfa.lastState)) {
                                    lastStates.add(newState);
                                }
                            }
                        }
                        m1.putIfAbsent(c, newState);
                        if (!dfa.containsKey(newState) && !sq.contains(newState)) {
                            sq.offer(newState);
                        }
                    }
                }
            }
        }
        return new DFA(dfa, 0, lastStates);
    }

    private Set<Character> chars(String p) {
        Set<Character> chars = new TreeSet<>();
        for (char c : p.toCharArray()) {
            if (c > 96 && c < 123) {
                chars.add(c);
            }
        }
        return chars;
    }

    class RecNodeMulti {
        final List<Integer> newState;
        final char rec;
        final RecNodeMulti parent;

        public RecNodeMulti(List<Integer> newState, char rec, RecNodeMulti parent) {
            this.newState = newState;
            this.rec = rec;
            this.parent = parent;
        }

        public String toRoot() {
            StringBuilder sb = new StringBuilder();
            RecNodeMulti rn = this;
            while (rn.rec != Character.MIN_VALUE) {
                sb.append(rn.rec);
                rn = rn.parent;
            }
            return sb.reverse().toString();
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;
            RecNodeMulti that = (RecNodeMulti) o;
            return rec == that.rec &&
                    Objects.equals(newState, that.newState);
        }

        @Override
        public int hashCode() {
            return Objects.hash(newState, rec);
        }
    }

    RecNodeMulti root;

    /**
     * Intersects dfa1,dfa2,... dfaN and visit intersecting nodes one by one in BFS manner
     * <p>
     * e.g. for 2 DFAs like dfa1 representing "?a*b"
     * <p>
     * _STATE_|__a___b___X______________
     * 0   |  1   1   1
     * 1   |  2
     * 2   |  2   4   2
     * 4   |  2   4   2
     * <p>
     * and dfa2 representing "a*b*"
     * <p>
     * _STATE_|__a___b___X______________
     * 0   |  1
     * 1   |  1   3   1
     * 3   |  3   3   3
     * <p>
     * we FOLLOW BFS states representing representing "aa*b*"
     * <p>
     * _STATE_|__a___b___X______________
     * 5   |  6
     * 6   |  7
     * 7   |      8   7
     * 8   |      8   9
     * 9   |      8   9
     * <p>
     * where final state is 8
     */
    private String multiIntersectDFAWithBFS(Set<Character> charsSvi, List<DFA> dfaS) {
        Set<Character> charsSviWithX = new TreeSet<>(charsSvi);
        //charsSviWithX.add('X');
        Set<RecNodeMulti> visited = new HashSet<>();
        Queue<RecNodeMulti> sq = new LinkedList<>();
        root = new RecNodeMulti(dfaS.stream().map(x -> x.initState).collect(Collectors.toList()), Character.MIN_VALUE, null);
        sq.offer(root);
        RecNodeMulti currS;
        Map<List<Integer>, Integer> statesToNewState = new HashMap<>();
        int newUsableState = dfaS.stream().mapToInt(x -> x.table.keySet().stream().mapToInt(y -> y).max().getAsInt()).max().getAsInt() + 1;
        while ((currS = sq.poll()) != null) {

            List<Map<Character, Integer>> statemap = new ArrayList<>();
            for (int i = 0; i < dfaS.size(); i++) {
                statemap.add(dfaS.get(i).table.get(currS.newState.get(i)));
            }

            for (char c1 : charsSviWithX) {
                List<Integer> sIns = statemap.stream().map(x -> {
                    Integer in = x.get(c1);
                    if (in == null) {
                        return x.get('X');
                    }
                    return in;
                }).collect(Collectors.toList());

                if (sIns.stream().allMatch(Objects::nonNull)) {
                    Integer exisP = statesToNewState.get(sIns);
                    if (exisP == null) {
                        exisP = newUsableState++;
                        statesToNewState.put(sIns, exisP);
                        boolean allLast = true;
                        for (int i = 0; i < dfaS.size(); i++) {
                            if (!dfaS.get(i).lastStates.contains(sIns.get(i))) {
                                allLast = false;
                                break;
                            }
                        }
                        if (allLast) {
                            //newS3 is LAST STATE!!!!!!
                            return currS.toRoot() + c1;
                        }
                    } else {
                        //already known state; we do not want to go back
                        continue;
                    }
                    RecNodeMulti rnm = new RecNodeMulti(sIns, c1, currS);
                    if (!sq.contains(rnm) && !visited.contains(rnm)) {
                        visited.add(rnm);
                        sq.offer(rnm);
                    }
                }
            }
        }

        return null;
    }

    private String solution(String[] patterns) {

        List<DFA> dfaS = new ArrayList<>();
        Set<Character> charsSvi = new TreeSet<>();

        for (String p : patterns) {
            Set<Character> chars = chars(p);
            charsSvi.addAll(chars);
            NFA nfa = createNFA(chars, p);
            DFA dfa = convertToDFA(chars, nfa);
            dfaS.add(dfa);
        }

        return multiIntersectDFAWithBFS(charsSvi, dfaS);
    }
}

那个95模式示例会导致DFA构建失败,因为部分匹配的可能子集数量与模式数量呈指数关系... - btilly

0

我也找到了自动完成这些操作的库。但是即使是这个库在某些输入方面也存在问题(例如片段中的输入)!一定有一些方法可以针对这些类型的输入进行优化吧?

<dependency>
  <groupId>dk.brics</groupId>
  <artifactId>automaton</artifactId>
  <version>1.12-1</version>
</dependency>

??????????????????????????
*a*n*
*i*x*
*q*v*
*u*l*
*p*n*
*j*c*
*j*h*
*q*h*
*p*w*
*p*v*
*r*s*
*j*x*
*i*j*
*t*h*
*u*x*
*i*l*
*k*s*
*u*j*
*p*o*
*p*r*
*a*z*
*t*f*
*b*c*
*e*g*
*l*z*
*d*c*
*u*g*
*l*g*
*r*z*
*y*b*
*g*c*
*t*h*
*w*f*
*i*e*
*d*g*
*r*m*
*e*y*
*q*n*
*p*n*
*y*a*
*q*n*
*l*j*
*n*v*
*p*b*
*h*m*
*r*b*
*p*i*
*u*b*
*e*z*
*d*b*
*p*a*
*s*d*
*d*z*
*k*x*
*o*f*
*s*g*
*o*l*
*t*g*
*p*v*
*j*z*
*y*n*
*o*b*
*k*g*
*i*d*
*c*v*
*q*m*
*e*k*
*w*j*
*i*f*
*i*t*
*i*b*
*i*k*
*p*w*
*a*x*
*p*z*
*r*v*
*y*c*
*i*b*
*n*v*
*g*v*
*u*k*
*w*i*
*e*f*
*l*s*
*t*v*
*y*d*
*p*e*
*h*b*
*s*f*
*o*x*
*i*z*
*q*e*
*r*c*
*k*h*

import dk.brics.automaton.Automaton;
import dk.brics.automaton.RegExp;

public class Safe {
  public static void main(String[] args) {
    String[] patterns = new String[3];
    patterns[0] = "?a*b";
    patterns[1] = "a*b*";
    patterns[2] = "*a??a*";

    //aaaab
    System.out.println(new Safe().solution(patterns));

    patterns = new String[3];
    patterns[0] = "*a*b*";
    patterns[1] = "*c*d*";
    patterns[2] = "*e*f*";

    //abcdef
    System.out.println(new Safe().solution(patterns));

    patterns = new String[3];
    patterns[0] = "*p?qp?*bd*pd?q*";
    patterns[1] = "*qp*d?b*?p?d*";
    patterns[2] = "?*d?b???q*q?p*";

    //p?qpd?bdpdqqppd
    //paqpdabdpdqqppd
    System.out.println(new Safe().solution(patterns));

    patterns = new String[2];
    patterns[0] = "*z*y*y*z*x*x*x*z*x*z*y*z*y*x*x*y*y*y*z*x*y*z*y*x*x*x*z*x*z*z*z*y*y*z*x*y*z*";
    patterns[1] = "*y*z*z*x*x*y*z*y*z*y*x*z*y*z*y*x*z*y*x*y*x*y*y*z*x*y*z*x*x*z*y*z*y*y*x*z*y*";

    //yzyyzxxxyzyzyxzyzyxzyxyxyyzxyzyxxxzxyzzzyyxzxyz
    System.out.println(new Safe().solution(patterns));
  }

  public String solution(String[] patterns){
    Automaton a = null;
    for(String pattern : patterns){
      RegExp r = convertToRegExp(pattern);
      if(a == null) {
        a = r.toAutomaton();
      }
      else{
        a = a.intersection(r.toAutomaton());
      }
    }
    return a.getShortestExample(true);
  }

  private RegExp convertToRegExp(String pattern){
    return new RegExp(replaceQuestionMarks(replaceStars(pattern)).trim());
  }

  private String replaceStars(String pattern){
    return pattern.replaceAll("\\*+", "[a-z]*");
  }

  private String replaceQuestionMarks(String pattern){
    return pattern.replace("?", "[a-z]{1}");
  }
}

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接