正则表达式:将XML转换为JSON

3
这是之前一个问题的延续: 我需要在Parse.com的Cloud Code中使用JavaScript将XML转换为Json 请不要因为您认为RegEx不是正确的选择而对此进行负面评价。这是我所必须处理的内容。如果您有其他想法,请告诉我。但它必须在Parse.com的Cloud Code上运行。
原始XML:
<?xml version="1.0" encoding="UTF-8" ?><api><products total-matched="1618" records-returned="1" page-number="1"><product><ad-id>1234</ad-id><supplier-name>Window World</supplier-name><supplier-category>3703703</supplier-category><buy-url>http://website.com</buy-url><currency>USD</currency><description>Window</description><image-url>http://website.com/windowa/80x80.jpg</image-url><in-stock>yes</in-stock><manufacturer-name>Window World</manufacturer-name><name>Half Pain Glass</name><price>31.95</price><retail-price>87.60</retail-price><sale-price>29.95</sale-price><sku>5938</sku><upc></upc></product><product><ad-id>1234</ad-id><supplier-name>Window World</supplier-name><supplier-category>3703703</supplier-category><buy-url>http://website.com</buy-url><currency>USD</currency><description>Window</description><image-url>http://website.com/windowa/80x80.jpg</image-url><in-stock>yes</in-stock><manufacturer-name>Window World</manufacturer-name><name>Half Pain Glass</name><price>31.95</price><retail-price>87.60</retail-price><sale-price>29.95</sale-price><sku>5938</sku><upc></upc></product><product><ad-id>1234</ad-id><supplier-name>Window World</supplier-name><supplier-category>3703703</supplier-category><buy-url>http://website.com</buy-url><currency>USD</currency><description>Window</description><image-url>http://website.com/windowa/80x80.jpg</image-url><in-stock>yes</in-stock><manufacturer-name>Window World</manufacturer-name><name>Half Pain Glass</name><price>31.95</price><retail-price>87.60</retail-price><sale-price>29.95</sale-price><sku>5938</sku><upc></upc></product><product><ad-id>1234</ad-id><supplier-name>Window World</supplier-name><supplier-category>3703703</supplier-category><buy-url>http://website.com</buy-url><currency>USD</currency><description>Window</description><image-url>http://website.com/windowa/80x80.jpg</image-url><in-stock>yes</in-stock><manufacturer-name>Window World</manufacturer-name><name>Half Pain 
Glass</name><price>31.95</price><retail-price>87.60</retail-price><sale-price>29.95</sale-price><sku>5938</sku><upc></upc></product></products></api>

正则表达式代码:

// Regex-only XML -> JSON-like text conversion. Operates on the string variable
// `xml`, which is defined outside this snippet, and rewrites it in place.
// NOTE(review): every tag-name pattern below is `\w[\w-]+`, so tag names must be
// at least two characters long to be matched.
// NOTE(review): judging by the sample output, the result is not wrapped in outer
// braces; the caller presumably has to add `{` ... `}` before JSON.parse — confirm.

// Matches an opening tag whose LAST attribute is name="value" (non-empty value),
// i.e. the attribute that sits directly before the closing `>`.
var regex = /(<\w+[^<]*?)\s+([\w-]+)="([^"]+)">/;
            // Repeatedly peel off that last attribute and emit it as an element
            // `<name>value</name>` placed immediately BEFORE the tag it came from,
            // until no opening tag ends in an attribute any more.
            while(xml.match(regex)) xml = xml.replace(regex, '<$2>$3</$2>$1>'); // For attributes

            // Replaces every whitespace character (newline, tab, ...) with a plain space;
            // runs of whitespace are NOT collapsed, each character is mapped one-to-one.
            xml = xml.replace(/\s/g, ' ').  // Finds all the white space converts to single space
                    replace(/< *\?[^>]*?\? *>/g, ''). //Finds the XML header and removes it
                    // Only removes comments that contain no `>` character.
                    replace(/< *!--[^>]*?-- *>/g, ''). //Finds and removes all comments
                    // Namespaced tag names (opening or closing): `ns:tag` -> `ns_tag`.
                    replace(/< *(\/?) *(\w[\w-]+\b):(\w[\w-]+\b)/g, '<$1$2_$3').
                    // Self-closing tags `<tag ... />` -> `< tag ...>` (slash dropped).
                    replace(/< *(\w[\w-]+\b)([^>]*?)\/ *>/g, '< $1$2>').
                    // Namespaced attribute names: `ns:attr="v"` -> `ns_attr="v"`.
                    replace(/(\w[\w-]+\b):(\w[\w-]+\b) *= *"([^>]*?)"/g, '$1_$2="$3"').
                    // Element that has attributes AND text content: the closing tag is dropped
                    // and the text is folded into an extra `value="..."` attribute.
                    replace(/< *(\w[\w-]+\b)((?: *\w[\w-]+ *= *" *[^"]*?")+ *)>( *[^< ]*?\b.*?)< *\/ *\1 *>/g, '< $1$2 value="$3">').
                    //replace(/ *(\w[\w-]+\b) *= *"([^>]*?)" */g, '< $1>$2').
                    // A tag name followed directly by another `<` gets its missing `>` inserted.
                    replace(/< *(\w[\w-]+\b) *</g, '<$1>< ').
                    // Collapse a doubled `> >` into a single `>`.
                    replace(/> *>/g, '>').
                    //replace(/< *\/ *(\w[\w-]+\b) *> *< *\1 *>/g, '').  // breaks the output?
                    // Escape double quotes so they survive inside the JSON string values.
                    replace(/"/g, '\\"').
                    // Leaf elements `<tag>text</tag>` (no angle brackets inside) -> `"tag":"text",`.
                    replace(/< *(\w[\w-]+\b) *>([^<>]*?)< *\/ *\1 *>/g, '"$1":"$2",').
                    // Same pattern, second pass: elements whose content is now free of angle
                    // brackets (only already-converted pairs) -> `"tag":{...},`.
                    // A single /g pass, so only one further nesting level is wrapped here.
                    replace(/< *(\w[\w-]+\b) *>([^<>]*?)< *\/ *\1 *>/g, '"$1":{$2},').
                    // Array attempt: an opening tag that is followed somewhere later by the
                    // literal text `< /tag},{` becomes `"tag":[{`.
                    replace(/< *(\w[\w-]+\b) *>(?=.*?< \/\1\},\{)/g, '"$1":[{').
                    // Split on `},{`, reverse the pieces and re-join: the chunk order is
                    // flipped so the forward-only lookahead in the next replace can see text
                    // that originally came BEFORE the closing tag (presumably a stand-in for
                    // lookbehind — confirm).
                    split(/\},\{/).
                    reverse().
                    join('},{').
                    // Closing tags that (in the reversed text) precede `"tag":[{` -> `}],`.
                    replace(/< *\/ *(\w[\w-]+\b) *>(?=.*?"\1":\[\{)/g, '}],').
                    // Flip the chunks back into their original order.
                    split(/\},\{/).
                    reverse().
                    join('},{').
                    // `< /tag},{tag>` boundaries between repeated elements -> `},{`.
                    replace(/< \/(\w[\w-]+\b)\},\{\1>/g, '},{').
                    // Any remaining opening tag (attributes discarded) -> `"tag":{`.
                    replace(/< *(\w[\w-]+\b)[^>]*?>/g, '"$1":{').
                    // Any remaining closing tag -> `},`.
                    replace(/< *\/ *\w[\w-]+ *>/g,'},').
                    // Strip commas that directly precede a closing `}` or `]`
                    // (after `}`, after `]`, and after `"` respectively).
                    replace(/\} *,(?= *(\}|\]))/g, '}').
                    replace(/] *,(?= *(\}|\]))/g, ']').
                    replace(/" *,(?= *(\}|\]))/g, '"').
                    // Strip the trailing comma at the very end of the string.
                    replace(/ *, *$/g, '');

输出:

"api": {
    "page-number": "1",
    "records-returned": "1",
    "total-matched": "1618",
    "products": {
        "product": {
            "ad-id": "1234",
            "supplier-name": "Window World",
            "supplier-category": "3703703",
            "buy-url": "http://website.com",
            "currency": "USD",
            "description": "Window",
            "image-url": "http://website.com/windowa/80x80.jpg",
            "in-stock": "yes",
            "manufacturer-name": "Window World",
            "name": "Half Pain Glass",
            "price": "31.95",
            "retail-price": "87.60",
            "sale-price": "29.95",
            "sku": "5938",
            "upc": ""
        },
        "product": {
            "ad-id": "1234",
            "supplier-name": "Window World",
            "supplier-category": "3703703",
            "buy-url": "http://website.com",
            "currency": "USD",
            "description": "Window",
            "image-url": "http://website.com/windowa/80x80.jpg",
            "in-stock": "yes",
            "manufacturer-name": "Window World",
            "name": "Half Pain Glass",
            "price": "31.95",
            "retail-price": "87.60",
            "sale-price": "29.95",
            "sku": "5938",
            "upc": ""
        },
        "product": {
            "ad-id": "1234",
            "supplier-name": "Window World",
            "supplier-category": "3703703",
            "buy-url": "http://website.com",
            "currency": "USD",
            "description": "Window",
            "image-url": "http://website.com/windowa/80x80.jpg",
            "in-stock": "yes",
            "manufacturer-name": "Window World",
            "name": "Half Pain Glass",
            "price": "31.95",
            "retail-price": "87.60",
            "sale-price": "29.95",
            "sku": "5938",
            "upc": ""
        },
        "product": {
            "ad-id": "1234",
            "supplier-name": "Window World",
            "supplier-category": "3703703",
            "buy-url": "http://website.com",
            "currency": "USD",
            "description": "Window",
            "image-url": "http://website.com/windowa/80x80.jpg",
            "in-stock": "yes",
            "manufacturer-name": "Window World",
            "name": "Half Pain Glass",
            "price": "31.95",
            "retail-price": "87.60",
            "sale-price": "29.95",
            "sku": "5938",
            "upc": ""
        }
    }
}

我目前遇到的最后一个问题是它无法将重复的项目转换为JSON数组。有任何解决方法吗?

如果你能从正则表达式的方法转向Javascript,那么可以看一下http://davidwalsh.name/convert-xml-json。 - Parthik Gosar
我一开始尝试了那个,但它在Parse.com的云上不起作用。使用这个正则表达式后,情况有所改善。谢谢。 - Brad
1
等到Parse.com的API发生变化时,你肯定很难再修改这段代码。而且,没有人会愿意调试这段代码。更好的做法是先将XML转换为JavaScript对象,再将该对象转换为JSON格式。这样至少每一步都是可以调试的。 - vinczemarton
你调查过为什么其他将XML转换为JSON的解决方案无法在Parse.com的云上工作吗? - Benjamin Toueg
好的,我想我们必须解决这个问题。恐怕可能需要一些时间。 - Loamhoof
显示剩余9条评论
2个回答

2
使用正则表达式是一种有趣的方法,似乎比遍历节点列表更快。然而,当速度不是决定性因素时(例如OP的应用场景),这并不是将xml转换为js的最佳方法。上面的正则表达式代码压缩后约为1KB。用同样多的字节,您可以构建一个相当健壮且可重用的转换器……它甚至可以在不同浏览器中处理xml命名空间。
我编写了以下代码(压缩),并且它可以很好地处理OP的XML数据。
    /**
     * Recursively converts a DOM node into plain JavaScript data.
     *
     * - Document node: converts its child nodes (or, when `find` is given, the
     *   elements returned by getElementsByTagNameNS / getElementsByTagName),
     *   drops falsy results, and returns the single result or an array of them.
     * - Element node: an element with no attributes and exactly one child that
     *   has a truthy nodeValue becomes that value; otherwise it becomes an
     *   object holding its attributes and its non-text children, with repeated
     *   child names collected into arrays.
     * - CDATA node: "<![CDATA[" + value + "]]>". Text node: its value.
     *   Comment node: "". Any other node type: null.
     *
     * @param {Node} m - DOM node to convert.
     * @param {(string|Object)=} p - Either the name of the element to search for
     *   (document nodes only), or an options object:
     *   `find`  - element name to search for,
     *   `xmlns` - namespace passed to getElementsByTagNameNS (default "*"),
     *   `parse` - function mapping a node name to an object key.
     *   The options object is not modified.
     * @returns {*} The converted value as described above.
     */
    var xml2js = function (m, p) {
        // DOM nodeType values handled below.
        var ELEMENT_NODE = 1;
        var TEXT_NODE = 3;
        var CDATA_SECTION_NODE = 4;
        var COMMENT_NODE = 8;
        var DOCUMENT_NODE = 9;

        // Default key mapper: strip the namespace prefix and a leading "ows_",
        // then remove every character outside the class below.
        function defaultParse(name) {
            return name.split(":").pop().replace(/^ows_/, "").replace(/[^a-z,A-Z,0-9]/g, "");
        }

        var source = typeof p == "string" ? { find: p } : (p || {});
        // Normalised copy, so the caller's object is never mutated.
        var options = {
            find: source.find,
            xmlns: source.xmlns || "*",
            // Fixed: the original compared `p.parse != "function"` (missing
            // `typeof`), which always discarded a caller-supplied parser.
            parse: typeof source.parse == "function" ? source.parse : defaultParse
        };
        // Passed to recursive calls so a custom parser/namespace also applies to
        // descendants; `find` is only meaningful for the document node.
        var childOptions = { xmlns: options.xmlns, parse: options.parse };

        var result;
        var nodes;
        var attrs;
        var kids;
        var kid;
        var key;
        var value;
        var i;

        switch (m.nodeType) {
            case DOCUMENT_NODE:
                if (!options.find) {
                    nodes = m.childNodes;
                } else if (m.getElementsByTagNameNS) {
                    nodes = m.getElementsByTagNameNS(options.xmlns, options.find.split(":").pop());
                } else {
                    nodes = m.getElementsByTagName(options.find);
                }
                result = [];
                for (i = 0; i < nodes.length; i++) {
                    value = xml2js(nodes[i], childOptions);
                    // Falsy results (e.g. "" for comments, null for unknown types) are skipped.
                    if (value) {
                        result.push(value);
                    }
                }
                if (result.length === 1) {
                    result = result[0];
                }
                break;

            case ELEMENT_NODE:
                attrs = m.attributes;
                kids = m.childNodes;
                if (attrs.length === 0 && kids.length === 1 && kids.item(0).nodeValue) {
                    // Leaf element: its value is the child's nodeValue. Stop here; the
                    // original went on to assign properties onto this string, which is a
                    // no-op in sloppy mode and a TypeError in strict mode.
                    result = kids.item(0).nodeValue;
                    break;
                }
                result = {};
                for (i = 0; i < attrs.length; i++) {
                    key = options.parse(attrs.item(i).nodeName);
                    result[key] = attrs.item(i).nodeValue;
                }
                for (i = 0; i < kids.length; i++) {
                    kid = kids.item(i);
                    if (kid.nodeType === TEXT_NODE) {
                        continue;
                    }
                    key = options.parse(kid.nodeName);
                    value = xml2js(kid, childOptions);
                    // Own-property check so names such as "toString" are not mistaken
                    // for an existing entry inherited from Object.prototype.
                    if (!Object.prototype.hasOwnProperty.call(result, key)) {
                        result[key] = value;
                    } else {
                        // Array.isArray instead of probing `.push`, which threw when the
                        // stored value was null.
                        if (!Array.isArray(result[key])) {
                            result[key] = [result[key]];
                        }
                        result[key].push(value);
                    }
                }
                break;

            case CDATA_SECTION_NODE:
                result = "<![CDATA[" + m.nodeValue + "]]>";
                break;

            case TEXT_NODE:
                result = m.nodeValue;
                break;

            case COMMENT_NODE:
                result = "";
                break;

            default:
                result = null;
        }
        return result;
    };

然后加载并转换XML:

    /**
     * Demo driver: synchronously fetches 'CloudCode.xml', converts the parsed
     * response document with xml2js (restricted to the 'products' element) and
     * logs the result as tab-indented JSON.
     */
    function test() {
        // Third argument `false` makes the request synchronous, so responseXML
        // is read right after send() returns.
        var request = new XMLHttpRequest();
        request.open('GET', 'CloudCode.xml', false);
        request.send();

        var targetNode = 'products'; // optional - any node name
        var converted = xml2js(request.responseXML, targetNode);
        var pretty = JSON.stringify(converted, null, '\t');
        console.log(pretty);
    }

输出:

    {
    "pagenumber": "1",
    "recordsreturned": "1",
    "totalmatched": "1618",
     "product": [
      {
        "adid": "1234",
        "suppliername": "Window World",
        "suppliercategory": "3703703",
         "buyurl": "http://website.com",


         etc...

2

好的,需要注意的是这只是一个快速修复方法,但似乎有效。这将只是添加一个数组结构,以便您不会多次使用相同的键(但不会破坏该键)。
将下面这段代码(它原本是为实现数组所做的一次尝试)替换掉:

// Fragment of a longer `.replace(...)` chain (not a standalone statement).
// An opening tag followed somewhere later by the literal `< /tag},{` becomes `"tag":[{`.
replace(/< *(\w[\w-]+\b) *>(?=.*?< \/\1\},\{)/g, '"$1":[{').
// Split on `},{`, reverse the pieces and re-join, flipping the chunk order.
split(/\},\{/).
reverse().
join('},{').
// Closing tags that are followed (in the flipped text) by `"tag":[{` become `}],`.
replace(/< *\/ *(\w[\w-]+\b) *>(?=.*?"\1":\[\{)/g, '}],').
// Flip the chunks back into their original order.
split(/\},\{/).
reverse().
join('},{').

上面这段代码是实现数组的一次尝试。
请将它替换为下面这一行:

// Fragment of a longer `.replace(...)` chain (not a standalone statement).
// Matches an opening tag that is immediately followed by `"key":{...},` and then
// the same `"key` again (i.e. a repeated key), captures everything up to the
// matching closing tag, and rewrites it as `"tag":[content],`.
// There is no /g flag, so only the first such element is rewritten.
replace(/< *(\w[\w-]+\b) *>(?=("\w[\w-]+\b)":\{.*?\},\2)(.*?)< *\/ *\1 *>/, '"$1":[$3],')

注意,我基本上使用了他的匹配方式。至少在你的示例中似乎可以工作。

我必须在你的代码中添加.replace(/],\s*?".*?": *[/g,','),以使其通过JSON格式化程序jsonformatter.curiousconcept.com,然后将整个内容括在{}大括号中。但现在似乎一切都正常了。再次感谢你的帮助!(产品出现多次) - Brad
不过,整个东西还是相当有 bug 的,如果我想到更好的方法,我会告诉你的。 - Loamhoof

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接