如果您想要一个干净的文本表示形式,我建议使用
lynx(由Project Gutenberg使用)或
pandoc。这两个工具都可以安装并通过
spawn
从 Node.js 中调用。它们提供的文本表示形式,会比运行 puppeteer 并读取 textContent 或 innerText 得到的结果更清晰。
您还可以尝试遍历DOM并根据节点类型添加换行符。
import "./styles.css";
import cheerio from "cheerio";
/** Values compared against a parsed node's `type` field. */
const NODE_TYPES = { TEXT: "text", ELEMENT: "tag" };
/**
 * Tag names that are treated as inline. Any node whose `name` is not in this
 * list is considered block-level by the helpers that consult it.
 */
const INLINE_ELEMENTS = [
  "a", "abbr", "acronym", "audio",
  "b", "bdi", "bdo", "big", "br", "button",
  "canvas", "cite", "code",
  "data", "datalist", "del", "dfn",
  "em", "embed",
  "i", "iframe", "img", "input", "ins",
  "kbd",
  "label",
  "map", "mark", "meter",
  "noscript",
  "object", "output",
  "picture", "progress",
  "q",
  "ruby",
  "s", "samp", "script", "select", "slot", "small", "span", "strong",
  "sub", "sup", "svg",
  "template", "textarea", "time",
  "u", "tt",
  "var", "video",
  "wbr"
];
// Sample HTML fragment that is handed to cheerio.load() below and converted
// to plain text. It mixes nested block elements, bare text that follows a
// block element, and a list.
const content = `
<div>
By March
<div>
<h2 class="authorh2">John Smith</h2>
<div>line1</div>line2
line3
<ul>
<li>test</li>
<li>test2</li>
<li>test3</li>
</ul>
</div>
</div>
`;
/**
 * Reports whether a node's tag name is listed in INLINE_ELEMENTS.
 * Nodes without a `name` (for example an empty placeholder object) yield false.
 */
const isInline = (element) => {
  const { name: tagName } = element;
  return INLINE_ELEMENTS.includes(tagName);
};
/**
 * Reports whether a node is block-level, defined here as "isInline returned
 * exactly false" — so a node with no tag name also counts as block-level.
 */
const isBlock = (element) => {
  const inlineResult = isInline(element);
  return inlineResult === false;
};
/**
 * Depth-first, pre-order traversal of a node tree.
 *
 * Calls `callback(node, index, level)` for `node` itself and then, recursively,
 * for every descendant. `index` is the node's position among its siblings and
 * `level` is its depth relative to the node the walk started from.
 * A node whose `children` is missing or falsy is treated as a leaf.
 */
const walkTree = (node, callback, index = 0, level = 0) => {
  callback(node, index, level);
  const childLevel = level + 1;
  let position = 0;
  // Re-read node.children on each pass so the loop bound tracks the live list.
  while (position < (node.children || []).length) {
    walkTree(node.children[position], callback, position, childLevel);
    position += 1;
  }
};
// Text fragments collected in document order; joined and printed at the end.
const docFragText = [];
const cheerioFn = cheerio.load(content);
const docFrag = cheerioFn("body")[0];

// A line counts as content when it holds at least one letter, digit or
// underscore in ANY script. The previous ASCII-only /\w/ test silently
// dropped lines made up solely of non-Latin text (CJK, Cyrillic, ...).
const WORD_CHAR_PATTERN = /[\p{L}\p{N}_]/u;

walkTree(docFrag, (element) => {
  // Only text nodes produce output; the <body> root and other nodes are
  // skipped here (walkTree still descends into their children).
  if (element.name === "body" || element.type !== NODE_TYPES.TEXT) {
    return;
  }

  // Keep only the lines that carry content and terminate each with "\n".
  const textContent = element.data
    .split("\n")
    .filter((line) => WORD_CHAR_PATTERN.test(line))
    .map((line) => `${line}\n`)
    .join("");
  if (!textContent) {
    return;
  }

  // A missing parent/previous sibling is replaced by {} which has no tag name
  // and is therefore classified as block-level by isBlock.
  const parentElement = element.parent || {};
  const previousElement = element.prev || {};

  // Prefix a blank line only when the text sits directly inside a block-level
  // parent and follows an inline sibling; otherwise emit the text unchanged.
  const needsLeadingBreak =
    isBlock(parentElement) && isInline(previousElement);
  docFragText.push(needsLeadingBreak ? `\n${textContent}` : textContent);
});

console.log(docFragText.join(""));
h2
来单独获取标题的内容。 - html_programmer