JavaScript - string.split(regex)保留分隔符

3
我想使用正则表达式拆分字符串,并将分隔符/匹配的信息包含在生成的数组中。
在Java中,我使用了:
theString.split("(?<=[!><=}{])|(?=[!><=}{])|(?<= AND )|(?= AND )|(?<= OR )|(?= OR )")

但是,JavaScript 不支持正则表达式的后顾断言 ?<=
例如,我想要匹配字符串:
"Reason={Existing problem or fault}{Bestaande probleem of vout}{Other}{Ander} and Required!=No and Results >=10 and Results <=25 and Tst>5 and Tst<80 and Info=test this or that and those and Success!=Yes"

分割:
Reason,=,{,Existing problem, or ,fault,},{,Bestaande probleem of vout,},{,Other,},{,Ander,}, and ,Required,!,=,No, and ,Results,>,=,10, and ,Results,<,=,25, and ,Tst,>,5, and ,Tst,<,80, and ,Info,=,test this, or ,that, and ,those, and ,Success,!,=,Yes

我得到的例子是:
// Demo script from the question: three attempts at splitting `thestr` on the
// operator characters and on " AND " / " OR " while keeping the separators.
// Everything is written straight into the page with document.write().
var thestr = "Reason={Existing problem or fault}{Bestaande probleem of vout}{Other}{Ander} and Required!=No and Results >=10 and Results <=25 and Tst>5 and Tst<80 and Info=test this or that and those and Success!=Yes";

document.write("::SPLIT::<br>");
// Attempt 1: split on a pattern made of capture groups.
// NOTE(review): there is a space between the last ")" and "/gi", so the final
// alternative is " OR " followed by one more space, not just " OR ".
var patt1=new RegExp(/([!><=}{])|( AND )|( OR ) /gi);

var x = thestr.split(patt1);
//This splits correctly but, doesn't include the separators / matched characters
// NOTE(review): per the ECMAScript spec, split() splices capture-group results
// into the returned array; the observation above presumably comes from a
// browser that dropped them (e.g. old Internet Explorer) - confirm.
document.write("length="+x.length+"<br>");
// `c` is assigned without `var`, so it becomes an implicit global.
for (c=0;c<x.length;c++) {
    document.write(c+" - "+ x[c]+" |");
}

document.write("<br><br>::MATCH::<br>");

// Attempt 2: collect only the separators with match() (same pattern, global flag).
var y = thestr.match(patt1);

//This shows the matched characters but, how do I combine info from split and match
document.write("length="+y.length+"<br>");
// `d` is assigned without `var`, so it becomes an implicit global.
for (d=0;d<y.length;d++) {
    document.write(d+" - "+ y[d]+" |");
}

document.write("<br><br>::INCLUDE SEPERATORS::<br>");
// Attempt 3: emulate the Java lookbehinds with negative lookaheads.
// (?!...) is zero-width and succeeds at every position that is NOT followed by
// the given text, so the pattern matches the empty string almost everywhere.
var patt2=new RegExp(/(?![!><=}{])|(?=[!><=}{])|(?! AND )|(?= AND )|(?! OR )|(?= OR ) /gi);
//This puts everything in the array, but, each character is a separate array element.
// Not what I wanted to achieve.
var bits = thestr.split(patt2);
document.write("length="+bits.length+"<br>");
// `r` is assigned without `var`, so it becomes an implicit global.
for (r=0;r<bits.length;r++) {
    document.write(r+" - "+ bits[r]+" |");
}

所以你基本上想要在 ` AND `、` OR ` 处,以及任何两个字符之间拆分,但字母数字字符或空格之间除外? - Tim Pietzcker
6个回答

5

如果您将整个模式放在一个组中,您也会得到分隔符:

thestr.split(/([!><=}{]| (?:AND|OR) )/)

这会返回一个类似于以下的数组:
["Reason", "=", "", "{", "Existing problem or fault", "}", "", "{", "Bestaande probleem of vout", "}", "", "{", "Other", "}", "", "{", "Ander", "}", " and Required", "!", "", "=", "No and Results ", ">", "", "=", "10 and Results ", "<", "", "=", "25 and Tst", ">", "5 and Tst", "<", "80 and Info", "=", "test this or that and those and Success", "!", "", "=", "Yes"]

然后,您只需要过滤掉空字符串,就完成了:
thestr.split(/([!><=}{]| (?:AND|OR) )/).filter(Boolean)

编辑:由于 Internet Explorer 和其他一些浏览器可能不会把捕获组匹配到的分隔符放入结果数组,你可以改用下面的方法:

// Cross-browser variant: split WITHOUT a capture group (some browsers drop
// captured separators from split()), collect the separators separately with
// match(), then interleave the two lists into `matches`.
// NOTE(review): this relies on split() keeping the empty strings between
// adjacent separators, so that piece i+1 always follows separator i.
var parts = thestr.split(/(?:[!><=}{]| (?:AND|OR) )/),
    // match() returns null when nothing matches; fall back to an empty list.
    separators = thestr.match(/(?:[!><=}{]| (?:AND|OR) )/g) || [],
    matches = [parts[0]];
for (var i=0; i<separators.length; ++i) {
    // Append separator i followed by the piece after it, instead of
    // overwriting the piece (which is what lost data before).
    matches.push(separators[i], parts[i+1]);
}

这基本上将分隔符与其他部分分开,然后将两者结合起来。

很不幸(对我们所有人来说),Internet Explorer不支持从*split()filter()*方法中保存捕获的组,因此这不是一个开箱即用的跨浏览器解决方案。 - Andy E
1
这里还有Steven Levithan的跨浏览器分割函数,它更加符合规范。 - Andy E

2

不深入探讨查询结构,我建议您使用replace方法,并使用函数作为替换,这将收集术语到一个数组中:

/**
 * Collects the terms of a query into an array by scanning it with a regular
 * expression. String.prototype.replace() is used only as an iterator over the
 * matches; the string it returns is discarded.
 *
 * @param {string} sQuery - The query text to scan.
 * @param {RegExp} [oReTerms] - Global pattern whose capture groups isolate the
 *     terms. It must define at least two capture groups; otherwise `$1`/`$2`
 *     receive the match offset and the input string that replace() passes
 *     after the groups. When omitted, the placeholder pattern below is used,
 *     which is meant to be replaced with a real one.
 * @returns {Array} The captured terms, in order of appearance.
 */
function parse(sQuery, oReTerms) {
    var aParsed = [];
    // Placeholder pattern: substitute a regex that matches every term of the query.
    var oRe = oReTerms || /.../gim;
    sQuery.replace(oRe, function($0, $1, $2 /*, ...further capture groups */) {
        //...
        if ($1) {
            // Arrays have push(), not append().
            aParsed.push($1);
        }
        if ($2) {
            aParsed.push($2);
        }
        //...
        return $0; // return what was matched (or any string)
    });
    return aParsed;
}

我之前做过解析HTML标签和属性的工作。希望这个想法很清晰。你只需要定义一个正则表达式,使其匹配查询中的所有术语。
并且你可以在替换函数中针对特定情况进行另一次替换。

这种技术的更多信息可以在John Resig的博客上找到 - http://ejohn.org/blog/search-and-dont-replace/. - Andy E

1

我不确定JavaScript在正则表达式分割中包含捕获组时的行为。我知道在Python中,如果拆分分隔符被包含在捕获括号中,它将成为匹配的一部分。

尝试一下

result = subject.split(/( or )|( and )|([^\w\s])\b|(?=[^\w\s])/i);

看看会发生什么。


1
/**
 * Splits a string on a regular expression and keeps the matched separators
 * in the result.
 *
 * Empty pieces before a separator are dropped, so two adjacent separators
 * produce two adjacent separator entries. The remainder after the last
 * separator is always appended, even when it is empty. A zero-width match
 * (e.g. a lookahead) acts as a split point but adds no separator entry.
 *
 * @param {string} str - The text to split.
 * @param {RegExp} re - The separator pattern. A global regex is used as-is
 *     (its lastIndex is reset first); a non-global regex is copied with the
 *     global flag added so the scan can advance.
 * @returns {string[]} Pieces and separators in their original order.
 */
function split2(str, re) {
    var matcher = re;
    if (re.global) {
        // Reset to start of string
        re.lastIndex = 0;
    } else {
        // exec() on a non-global regex returns the first match every time,
        // which would make the loop below spin forever. Scan with a global copy.
        var flags = typeof re.flags === "string"
            ? re.flags
            : (re.ignoreCase ? "i" : "") + (re.multiline ? "m" : "");
        matcher = new RegExp(re.source, flags + "g");
    }
    var result = [];
    var lastEnd = 0;
    var match = matcher.exec(str);
    while (match !== null) {
        if (match.index > lastEnd) {
            result.push(str.substring(lastEnd, match.index));
        }
        if (match[0].length > 0) {
            result.push(match[0]);
        } else {
            // A zero-width match leaves lastIndex where it was; step past it
            // so the next exec() cannot return the same position again.
            matcher.lastIndex++;
        }
        lastEnd = match.index + match[0].length;
        match = matcher.exec(str);
    }
    result.push(str.substring(lastEnd));
    return result;
}

var thestr = "Reason={Existing problem or fault}{Bestaande probleem of vout}{Other}{Ander} and Required!=No and Results >=10 and Results <=25 and Tst>5 and Tst<80 and Info=test this or that and those and Success!=Yes";

var patt = /[!><=}{]| AND | OR /gi;

split2(thestr,patt):

输出:

["Reason", "=", "{", "Existing problem", " or ", "fault", "}", "{",
"Bestaande probleem of vout", "}", "{", "Other", "}", "{", "Ander", "}", " and ",
"Required", "!", "=", "No", " and ", "Results ", ">", "=", "10", " and ",
"Results ", "<", "=", "25", " and ", "Tst", ">", "5", " and ", "Tst", "<", "80",
" and ", "Info", "=", "test this", " or ", "that", " and ", "those", " and ",
"Success", "!", "=", "Yes"]

1
Gumbo的分割函数是个好主意,但它不起作用。应该这样写:
/**
 * Splits a string on a regular expression and returns the pieces interleaved
 * with the separators that were matched: [piece, sep, piece, sep, ..., piece].
 * Empty pieces between adjacent separators are kept, so the result always
 * alternates piece / separator.
 *
 * The scan runs on a global copy of `regex`, so it works the same whether or
 * not the caller's regex has the `g` flag or contains capture groups, and the
 * caller's regex is not modified.
 *
 * @param {string} str - The text to split.
 * @param {RegExp} regex - The separator pattern.
 * @returns {string[]} Pieces and separators in their original order; a
 *     one-element array holding `str` when nothing matches.
 */
function split(str, regex) {
    var flags = typeof regex.flags === "string"
        ? regex.flags
        : (regex.ignoreCase ? "i" : "") + (regex.multiline ? "m" : "");
    if (flags.indexOf("g") === -1) {
        flags += "g";
    }
    var scanner = new RegExp(regex.source, flags),
        ret = [],
        pieceStart = 0,
        m;
    while ((m = scanner.exec(str)) !== null) {
        if (m[0].length === 0) {
            // A zero-width match does not move lastIndex; step past it.
            scanner.lastIndex++;
            // Like String.prototype.split, ignore empty matches that sit at
            // the start of the current piece or at the end of the string.
            if (m.index === pieceStart || m.index >= str.length) {
                continue;
            }
        }
        // m[0] is the whole separator, regardless of any capture groups.
        ret.push(str.substring(pieceStart, m.index), m[0]);
        pieceStart = m.index + m[0].length;
    }
    ret.push(str.substring(pieceStart));
    return ret;
}

split('a,b,c', /,/g); // returns ["a", ",", "b", ",", "c"]

0
为了支持大多数常用浏览器,你可以改用 match 来匹配字符串。此模式要么匹配一串连续的非分隔符字符(即 <>!{}= 以外的任意数量字符),要么匹配单个分隔符。
// Tokenizer: either a run of characters that are not operators,
// or exactly one operator character.
var rx = /([^<>!{}=]+|[<>!{}=])/g;

// Sample query, assembled from fragments to keep the source lines short.
var str = [
    'Reason={Existing problem or fault}{Bestaande probleem of vout}',
    '{Other}{Ander} and Required!=No and Results >=10 and Results <=25 ',
    'and Tst>5 and Tst<80 and Info=test this or that and those and Success!=Yes'
].join('');

// One token per line, purely for readability of the output.
str.match(rx).join('\n');

//returned value:
Reason
=
{
Existing problem or fault
}
{
Bestaande probleem of vout
}
{
Other
}
{
Ander
}
 and Required
!
=
No and Results 
>
=
10 and Results 
<
=
25 and Tst
>
5 and Tst
<
80 and Info
=
test this or that and those and Success
!
=
Yes

// 我将字符串连接起来,然后将结果拼接以便阅读


网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接