我是
Debuggex的创建者,其要求与您的非常相似:优化可显示的信息量。
以下是Debuggex使用的解析器的大幅修改(为了可读性)。它不能直接使用,但旨在展示代码的组织结构。大多数错误处理已被删除。许多直观但冗长的逻辑也被删除。
请注意,使用了
递归下降。这就是您在解析器中所做的,只是您的解析器被压缩成单个函数。我使用了大致这个语法:
Regex -> Alt
Alt -> Cat ('|' Cat)*
Cat -> Empty | (Repeat)+
Repeat -> Base (('*' | '+' | '?' | CustomRepeatAmount) '?'?)
Base -> '(' Alt ')' | Charset | Literal
Charset -> '[' (Char | Range | EscapeSeq)* ']'
Literal -> Char | EscapeSeq
CustomRepeatAmount -> '{' Number (',' Number)? '}'
您会注意到我的很多代码都只是处理javascript正则表达式的特殊性质。您可以在
这个参考链接中找到更多相关信息。对于PHP,
这个链接包含了您需要的所有信息。我认为您的解析器已经走得很远了;现在只需实现其余的运算符并正确处理边缘情况即可。
:) 祝您愉快:
var Parser = function(s) {
this.s = s;
this.k = 0;
this.group = 1;
};
Parser.prototype.more = function() {
return this.k < this.s.length;
};
Parser.prototype.peek = function() {
};
Parser.prototype.next = function() {
};
Parser.prototype.eat = function(c) {
};
Parser.prototype.parseRe = function() {
return new ReTree(this.parseAlt());
if (this.more()) {
throw new Error();
}
};
Parser.prototype.parseAlt = function() {
var alts = [this.parseCat()];
while (this.more() && this.peek() === '|') {
this.next();
alts.push(this.parseCat());
}
return new AltTree(alts);
};
Parser.prototype.parseCat = function() {
var cats = [];
while (this.more() && ')|'.indexOf(this.peek()) === -1) {
cats.push(this.parseRepeat());
}
return (cats.length >= 1) ? new CatTree(cats) : new EmptyTree();
};
Parser.prototype.parseRepeat = function() {
var repeat = this.parseBase();
if (!this.more() || '*?+{'.indexOf(this.peek()) === -1) {
return repeat;
}
var min = 0; var max = Infinity; var greedy = true;
if (this.peek() === '*') {
} else if (this.peek() === '?') {
} else if (this.peek() === '+') {
this.next(); min = 1;
} else if (this.peek() === '{') { }
if (this.more() && this.peek() === '?') {
this.next(); greedy = false;
}
return new RepeatTree(repeat, {min:min, max:max, greedy:greedy});
};
Parser.prototype.parseBase = function() {
var c = this.peek();
if (c === ')' || '*?+'.indexOf(c) !== -1) {
throw new Error();
}
if (c === '(') {
this.next();
var ret = null;
if (this.peek() === '?') {
} else {
var group = this.group++;
ret = new MatchTree(this.parseAlt(), group);
}
this.eat(')');
return ret;
} else if (c === '[') {
this.next();
var opts = this.parseCharset();
this.eat(']');
return new CharsetTree(opts);
} else {
var literal = this.parseLiteral();
return new LiteralTree(literal);
}
};
Parser.prototype.parseCharset = function() {
};
Parser.prototype.parseLiteral = function() {
var c = this.next();
if (c === '.' || c === '^' || c === '$') {
return new CharInfo(c, true);
} else if (c === '\\') {
return this.parseEscaped({inCharset: false});
}
return new CharInfo(c);
};
Parser.prototype.parseEscaped = function(opts) {
opts = opts || {};
inCharset = opts.inCharset || false;
var c = peek();
if (c === 'c') {
} else if (c === '0') {
} else if (isDigit(c)) {
} else if (c === 'x') {
} else if (c === 'u') {
var r = '';
this.next();
for (var i = 0; i < 4; ++i) {
c = peek();
if (!isHexa(c)) {
throw new Error();
}
r += c;
this.next();
}
return new CharInfo(String.fromCharCode(parseInt(r, 16)));
} else {
this.next();
if (inCharset && c === 'b') {
return new CharInfo('\b');
} else if (!inCharset && (c === 'b' || c === 'B')) {
return new CharInfo('\\' + c, true);
} else if (c === 'f') {
} else if (c === 'n') {
} else if (c === 'r') {
} else if (c === 't') {
} else if (c === 'v') {
} else if ('dDsSwW'.indexOf(c) !== -1) {
} else {
return new CharInfo(c);
}
}
};
var CharInfo = function(c, special) {
this.c = c;
this.special = special || false;
};
var parse = function(s) { return (new Parser(s)).parseRe(); };