能否使用运算符优先级解析正则表达式呢?
https://en.wikipedia.org/wiki/Operator-precedence_parser
如果不能,那么在这里例如使用了什么解析器?
https://en.wikipedia.org/wiki/Thompson's_construction#Application_of_the_algorithm
此致敬礼
能否使用运算符优先级解析正则表达式呢?
https://en.wikipedia.org/wiki/Operator-precedence_parser
如果不能,那么在这里例如使用了什么解析器?
https://en.wikipedia.org/wiki/Thompson's_construction#Application_of_the_algorithm
此致敬礼
<RE> ::= <union> | <simple-RE>
<union> ::= <RE> "|" <simple-RE>
<simple-RE> ::= <concatenation> | <basic-RE>
<concatenation> ::= <simple-RE> <basic-RE>
<basic-RE> ::= <star> | <plus> | <elementary-RE>
<star> ::= <elementary-RE> "*"
<plus> ::= <elementary-RE> "+"
<elementary-RE> ::= <group> | <any> | <char>
<group> ::= "(" <RE> ")"
<any> ::= "."
<char> ::= any non metacharacter | "\" metacharacter
<concatenation>
右侧有两个相邻的非终结符,这意味着这不是一个运算符优先级语法。正则表达式确实可以使用运算符优先级解析器进行解析。提供的C语言代码基本上采用了普拉特(操作符优先级)解析器方法。该程序打印出一系列状态转换表示的NFA(非确定有限自动机)。输入是带有通常的*?()+。[] 运算符的正则表达式。普拉特解析器函数parse()首先为正则表达式创建一个表达式树。它将两个连续的字符(或括号分组)视为它们之间存在一个不可见的二元串联运算符。在此之后,函数print_nfa()从右向左遍历表达式树,并即时打印NFA状态转换。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_PREC 10
#define MAX_CHAR 255 // code for chars from 0 to incl. MAX_CHAR
enum Type { CHAR = MAX_CHAR+1, CONCAT, ANY, SET };
typedef unsigned short IType;
struct Node {
IType type, ch, state;
struct Node *left, *right;
char charset[1];
};
static char *regex;
static IType state;
static struct Node *parse(int prec, struct Node *left);
static int next_token(void) { return *regex ? *regex++ : 0; }
struct Node *allocnode(IType type, IType ch, int extra)
{
struct Node *node = calloc(1, sizeof(struct Node) + extra);
node->type = type;
node->ch = ch;
node->state = state++;
return node;
}
static int getprec(int op) // get the precedence level of op (highest precedence is 0)
{
switch (op) {
case '[': case '(': return 0; // brackets and parenthesis are like literals: highest precedence
case '*': case '+': case '?': return 1; // postfix operators
case CONCAT: default: return 2; // concat
case '|': return 3; // alternate operator is lowest precedence
case ')': return 4; // closing parenthesis is even lower
}
}
static struct Node *charset(void)
{
struct Node *node;
int i;
node = allocnode(SET, 0, 1024);
// scan until end of set
for (i=0; regex[0] != ']' && regex[0] != '\0'; ++regex, ++i)
node->charset[i] = regex[0];
node->charset[i] = '\0';
if (regex[0] == ']') // eat terminating ]
++regex;
return node;
}
static struct Node *paren(struct Node *left)
{
struct Node *node = parse(getprec(')'), left);
if (regex[0] == ')')
++regex;
return node;
}
static struct Node *single(void)
{
if (regex[-1] == '.')
return allocnode(ANY, 0, 0);
else
return allocnode(CHAR, regex[-1], 0);
}
static struct Node *postfix(int tok, struct Node *left)
{
struct Node *node = allocnode((IType) tok, 0, 0);
node->left = left;
return node;
}
static struct Node *binary(int tok, struct Node *left)
{
struct Node *node = allocnode((IType) tok, 0, 0);
node->left = left;
node->right = parse(getprec(tok) + 1, NULL);
return node;
}
static struct Node *parse(int prec, struct Node *left)
{
int tok = next_token();
// handle prefix
switch (tok) {
case '[': left = charset(); break;
case '(': left = paren(NULL); break;
case '*': case '+': case '?': break;
case '|': break;
case '\0': return left;
default: left = single(); break;
}
// handle infix and postfix operators
while (*regex && getprec(regex[0]) < prec) {
tok = next_token();
switch (tok) {
case '*': case '+': case '?': left = postfix(tok, left); break;
case '|': left = binary('|', left); break;
// concat is like an invisible infix operator (that's why we -- here)
default: --regex; left = binary(CONCAT, left); break;
}
}
return left;
}
static IType print_nfa(struct Node *root, IType final)
{
IType temp;
if (root == NULL)
return 0;
switch (root->type) {
case CONCAT: // build two nfa's that link left-->right
return print_nfa(root->left, print_nfa(root->right, final));
case '|':
// build two nfa's that both link to final: left-->final and right-->final
// create two epsilon transitions to the two nfa's
printf("(%d, ε, %d)\n", root->state, print_nfa(root->left, final));
printf("(%d, ε, %d)\n", root->state, print_nfa(root->right, final));
return root->state;
case '*':
case '+':
case '?':
// build an nfa that links to a new state (root): left-->root (or final in case of ?)
temp = print_nfa(root->left, (IType) (root->type=='?'?final:root->state));
// create two epsilon transitions one to the nfa above and one to final
printf("(%d, ε, %d)\n", root->state, temp);
printf("(%d, ε, %d)\n", root->state, final);
if (root->type == '+')
return temp; // return the start state of the nfa as the start state into this
else
return root->state; // return root as the start state into this
case CHAR: // create a transition from this state (root) to final
printf("(%d, %d, %d)\n", root->state, root->ch, final);
return root->state;
case ANY:
printf("(%d, ANY, %d)\n", root->state, final);
return root->state;
case SET:
printf("(%d, [%s], %d)\n", root->state, root->charset, final);
return root->state;
default: break;
}
return 0;
}
int main(int argc, char *argv[])
{
struct Node *root;
IType final, start;
if (argc < 2) {
printf("usage: %s regex\n", argv[0]);
return 0;
}
regex = argv[1];
root = parse(MAX_PREC, NULL);
start = print_nfa(root, final = state++);
printf("start = %d, final = %d\n", start, final);
return 0;
}
这种方法结合了两种算法:Vaughan Pratt的自顶向下运算符优先级解析方案,如Nystrom的书《Crafting Interpreters》第17章所述,以及Thompson NFA算法,如启发性文章“Regular Expression Matching Can Be Simple And Fast”(https://swtch.com/~rsc/regexp/regexp1.html)中所介绍的。由于我没有在一个程序中找到这两种方法的组合,所以我决定分享我想出来的东西,即使原始问题已经有5年了。