以下对 `DOMParser` 的使用为什么会导致输出的 HTML 与输入的不同呢?它会移除 DOCTYPE 和顶级元素之间的空白字符,移除文档元素(`<html>`)和 `<head>` 之间的空白字符,并在 `</body>` 前添加一个换行符。
I have tested this in Google Chrome, Firefox, and Safari; I also ran the analogous code with JSoup and got exactly the same results. So I'm pretty sure it's not a bug. My current theory is that this is caused by some sort of esoteric parsing rule from a specification somewhere. But there could be other things I misunderstand.
// Sample HTML document fed to the parser. The lines are joined with "\n" and
// there is no trailing newline after the closing </html> tag.
const html = [
  "<!DOCTYPE html>",
  "<html>",
  "<head>",
  "<title>1</title>",
  "</head>",
  "<body>",
  "<div>",
  "Hello, World!",
  "</div>",
  "</body>",
  "</html>",
].join("\n");
/**
 * Appends `string` as a new text node to the element with the given id.
 * The value is inserted as text, so any markup in it is shown literally.
 *
 * @param {string} id - id of the element that receives the text.
 * @param {string} string - text to append.
 */
const setText = (id, string) => {
  const target = document.getElementById(id);
  const textNode = document.createTextNode(string);
  target.appendChild(textNode);
};
/**
 * Builds a string from the direct children of `d`, in order, with nothing
 * inserted between them.
 *
 * Element children contribute their `outerHTML`; doctype children are
 * serialized with `XMLSerializer`.
 *
 * @param {Document} d - object whose `childNodes` are serialized.
 * @returns {string} the concatenated markup of all direct children.
 * @throws {TypeError} if a child is neither an element nor a doctype node.
 */
const documentToString = function (d) {
  const pieces = [];
  for (const child of Array.from(d.childNodes)) {
    if (child.nodeType === child.ELEMENT_NODE) {
      pieces.push(child.outerHTML);
    } else if (child.nodeType === child.DOCUMENT_TYPE_NODE) {
      pieces.push(new XMLSerializer().serializeToString(child));
    } else {
      // Any other top-level node kind is deliberately not supported.
      throw new TypeError("" + child);
    }
  }
  return pieces.join("");
};
// Show the untouched input so each serialization below can be compared against it.
setText("raw", html);

// Parse the sample string as an HTML document. Declared with `const` to match
// the other declarations in this script; unlike `var`, it does not create a
// property on the global object when run as a classic script.
const parsed = new DOMParser().parseFromString(html, "text/html");

// Serialization 1: outerHTML of the document element only.
setText("parsed", parsed.documentElement.outerHTML);
// Serialization 2: every direct child of the document, concatenated.
setText("converted", documentToString(parsed));
// Serialization 3: the whole document passed through XMLSerializer.
setText("xmlserializer", new XMLSerializer().serializeToString(parsed));
/* Output panes: keep whitespace and line breaks exactly as serialized. */
#raw,
#parsed,
#converted,
#xmlserializer {
  white-space: pre;
  font-family: monospace;
}

/* Compact headings labelling each output pane. */
h1 {
  font-size: 110%;
  font-weight: bold;
  font-family: sans-serif;
}
<body>
<!-- One heading plus one empty <div> per output; the script appends text to each <div> by its id. -->
<h1>Raw string</h1>
<div id="raw"></div>
<h1>Parsed top-level element</h1>
<div id="parsed"></div>
<h1>Using a document-to-string converter</h1>
<div id="converted"></div>
<h1>From XMLSerializer</h1>
<div id="xmlserializer"></div>
</body>