JavaScript 截取 HTML 文本

14

JavaScript有没有一种不需要匹配标签等复杂操作就能截断HTML文本的方法呢?

谢谢。

9个回答

24

3
这是我找到的唯一稳健的解决方案,应该获得更多的赞。真不敢相信我居然是第一个点赞的人! - danii
1
太棒了,@arendjr! - eballeste
1
npm i --save text-clipper 赢了 - boatcoder

11

我也遇到了同样的问题,最后写了以下代码来处理它。它可以将HTML截断至指定长度,清理任何在结尾可能被截断的开/闭标签,然后关闭剩余未关闭的标签:

/**
 * Cuts an HTML string down to its first `length` characters and then tries to
 * repair the markup that the cut may have broken.
 *
 * The limit is applied to the raw source string, so markup characters count
 * toward `length`. The returned string can be longer than `length` because
 * closing tags may be appended during the repair steps.
 *
 * All tag handling is regex-based. The attribute patterns only match
 * attribute names and values made of `\w` characters, so tags carrying other
 * attribute values are not recognised by these expressions.
 *
 * @param {string} text - HTML source to truncate.
 * @param {number} length - Number of leading characters of `text` to keep.
 * @returns {string} The truncated HTML after the repair passes below.
 */
function truncateHTML(text, length) {
    var truncated = text.substring(0, length);
    // Remove line breaks and surrounding whitespace
    truncated = truncated.replace(/(\r\n|\n|\r)/gm,"").trim();
    // If the text ends with an incomplete start tag (a "<name attr..." run that
    // reaches the end of the string without a closing ">"), trim it off
    truncated = truncated.replace(/<(\w*)(?:(?:\s\w+(?:={0,1}(["']{0,1})\w*\2{0,1})))*$/g, '');
    // If the text ends with a truncated end tag ("</" followed by zero or more
    // word characters and no ">"), fix it.
    var truncatedEndTagExpr = /<\/((?:\w*))$/g;
    var truncatedEndTagMatch = truncatedEndTagExpr.exec(truncated);
    if (truncatedEndTagMatch != null) {
        // Group 1 is whatever part of the tag name survived the cut (may be empty)
        var truncatedEndTag = truncatedEndTagMatch[1];
        // Check to see if there's an identifiable tag in the end tag
        if (truncatedEndTag.length > 0) {
            // If so, find the start tag, and close it. The pattern accepts the
            // surviving name fragment plus at most one more word character.
            var startTagExpr = new RegExp(
                "<(" + truncatedEndTag + "\\w?)(?:(?:\\s\\w+(?:=([\"\'])\\w*\\2)))*>");
            var testString = truncated;
            var startTagMatch = startTagExpr.exec(testString);

            // Strip matching start tags one at a time from a working copy; the
            // name captured on the final iteration (the last match) is kept.
            var startTag = null;
            while (startTagMatch != null) {
                startTag = startTagMatch[1];
                testString = testString.replace(startTagExpr, '');
                startTagMatch = startTagExpr.exec(testString);
            }
            // Only rewrite the broken end tag when a candidate start tag was found
            if (startTag != null) {
                truncated = truncated.replace(truncatedEndTagExpr, '</' + startTag + '>');
            }
        } else {
            // Otherwise, cull off the broken end tag
            truncated = truncated.replace(truncatedEndTagExpr, '');
        }
    }
    // Now the tricky part. Reverse the text, and look for opening tags. For each opening tag,
    //  check to see that the closing tag before it is for that tag. If not, append a closing tag.
    //  In the reversed text an opening tag still reads "<...>" but with its name and attributes
    //  spelled backwards, and a closing tag reads "<eman/>".
    var testString = reverseHtml(truncated);
    var reverseTagOpenExpr = /<(?:(["'])\w*\1=\w+ )*(\w*)>/;
    var tagMatch = reverseTagOpenExpr.exec(testString);
    while (tagMatch != null) {
        var tag = tagMatch[0];
        // Group 2 is the tag name, still reversed
        var tagName = tagMatch[2];
        var startPos = tagMatch.index;
        var endPos = startPos + tag.length;
        // Everything from the start of the remaining reversed text through this opening tag
        var fragment = testString.substring(0, endPos);
        // Test to see if an end tag is found in the fragment. If not, append one to the end
        //  of the truncated HTML, thus closing the last unclosed tag
        if (!new RegExp("<" + tagName + "\/>").test(fragment)) {
            // Un-reverse the name before emitting the closing tag
            truncated += '</' + reverseHtml(tagName) + '>';
        }
        // Get rid of the already tested fragment
        testString = testString.replace(fragment, '');
        // Get another tag to test
        tagMatch = reverseTagOpenExpr.exec(testString);
    }
    return truncated;
}

/**
 * Reverses a string and swaps every '<' with '>' (and vice versa), so that a
 * tag such as "<div>" becomes "<vid>" and "</b>" becomes "<b/>".
 *
 * The swap is done per character in a single pass. No placeholder character
 * is used, so every other character (including 'Î', char code 206) is passed
 * through unchanged. The string is split by code point, which keeps
 * surrogate pairs such as emoji intact.
 *
 * @param {string} str - Text to reverse.
 * @returns {string} The reversed text with angle brackets mirrored.
 */
function reverseHtml(str) {
    var mirrored = { '<': '>', '>': '<' };
    return Array.from(str)
        .reverse()
        .map(function (ch) {
            // Only the two angle brackets are remapped; everything else is kept as is
            return Object.prototype.hasOwnProperty.call(mirrored, ch) ? mirrored[ch] : ch;
        })
        .join('');
}

它破坏了提供的HTML。测试使用:<h2>标题1</h2><h3><br>标题2</h3><h3> </h3><h4>标题3<br> </h4><p><strong>Lorem Ipsum</strong>是印刷和排版行业的模板文本。自1500年以来,Lorem Ipsum一直是这些行业使用的标准文本,当时有人混合了一个文本的字符以创建一本书的样本。这段文字不仅存活了5个世纪,而且还跳跃到电子排版,基本上保持不变。</p> - Carlos Oliveira

7

JavaScript本身没有提供相关功能。不过,您可以考虑使用jQuery插件来实现。


我还没有使用过jQuery,它是否容易实现或需要大量的设置和调整? - Francisc
jQuery插件链接已损坏。 - Jason

2

1

这适用于多层嵌套:

/**
 * Parses `content` into a detached <div>, walks it depth-first and stops
 * counting once `maxLength` characters of leaf text have been seen. The leaf
 * that reaches the limit is shortened to fit and gets `append` added; every
 * node visited after that is removed. Returns the resulting inner HTML.
 *
 * @param {string} content - HTML markup to truncate.
 * @param {number} [maxLength=255] - Number of text characters to keep.
 * @param {string} [append='…'] - Text added to the leaf where the limit is reached.
 * @returns {string} Serialized HTML of the truncated tree.
 */
let truncate = (content, maxLength = 255, append = '…') => {
    const wrapper = document.createElement('div');
    wrapper.innerHTML = content;

    let seen = 0;
    let isDone = false;

    const visit = (current) => {
        // Everything encountered after the limit was hit is dropped from the tree.
        if (isDone) {
            current.remove();
            return;
        }

        // Snapshot the children so removals do not disturb the iteration.
        const kids = Array.from(current.childNodes);
        if (kids.length) {
            for (const kid of kids) {
                visit(kid);
            }
            return;
        }

        // Leaf node: account for its text.
        seen += current.textContent.length;
        if (!(seen >= maxLength)) {
            return;
        }

        isDone = true;
        if (seen > maxLength) {
            // Trim off exactly the characters that overshoot the limit.
            current.textContent = current.textContent.slice(0, -(seen - maxLength));
        }
        current.textContent += append;
    };

    visit(wrapper);

    return wrapper.innerHTML;
};

1
如果你想使用纯JS实现一个轻量级的解决方案,这个应该可以胜任,但是它会留下空元素,所以取决于你是否关心这些。还要注意,它会就地修改节点。
/**
 * Truncates the text beneath `node` in place so that at most `limit`
 * characters of text remain, visiting children in document order.
 * Elements whose text is cut away entirely are left in the tree, empty.
 *
 * @param {Node} node - Root of the subtree to truncate (modified in place).
 * @param {number} limit - Number of text characters still allowed.
 * @returns {number} The unused part of `limit` after processing `node`.
 */
function truncateNode(node, limit) {
  if (node.nodeType !== Node.TEXT_NODE) {
    // Container: thread the remaining budget through each child in turn.
    let remaining = limit;
    for (const child of node.childNodes) {
      remaining = truncateNode(child, remaining);
    }
    return remaining;
  }

  // Text node: keep what fits and report how much budget is left.
  const original = node.textContent;
  node.textContent = original.substring(0, limit);
  return limit - node.textContent.length;
}

// Usage example: build a <span> with three formatted children, truncate it in
// place to 5 text characters, then check the resulting markup. The <u> element
// loses all of its text but stays in the tree as an empty element.
// NOTE(review): `expect(...).toEqual` is assumed to come from a test framework
// that is not shown here.
const span = document.createElement('span');
span.innerHTML = '<b>foo</b><i>bar</i><u>baz</u>';
truncateNode(span, 5);
expect(span.outerHTML).toEqual('<span><b>foo</b><i>ba</i><u></u></span>');

0

0

以上的解决方案都不完全符合我的使用情况,所以我自己创建了一个小型的原生JavaScript函数。它会留下空元素,但很容易进行修正。

/**
 * Shortens the visible text of an HTML string to `length` characters (the
 * ellipsis included) while keeping every tag, and wraps the result in a
 * <span> whose `title` carries the full, tag-free text for display on hover.
 *
 * Tags that follow the cut are kept, so empty elements may remain in the output.
 *
 * @param {string} string - HTML (or plain text) to shorten.
 * @param {number} length - Maximum number of visible characters to keep.
 * @returns {string} `string` unchanged when its text already fits, otherwise
 *   the shortened markup wrapped in `<span title="…">`.
 */
const truncateWithHTML = (string, length) => {
    // string = "<span class='className'>My long string that</span> I want shorter<span> but just a little bit</span>"

    // Double quotes would terminate the title="…" attribute early, so encode them.
    const escapeTitle = (text) => text.replace(/"/g, '&quot;');

    const noHTML = string.replace(/<[^>]*>/g, '');

    // if the string does not need to be truncated
    if (noHTML.length <= length) {
        return string;
    }

    // if the string does not contain tags
    if (noHTML.length === string.length) {
        // add <span title=""> to allow complete string to appear on hover
        return `<span title="${escapeTitle(string)}">${string.substring(0, length).trim()}…</span>`;
    }

    const substrings = string.split(/(<[^>]*>)/g).filter(Boolean);
    // substrings = ["<span class='className'>","My long string that","</span>"," I want shorter","<span>"," but just a little bit","</span>"]

    let count = 0;
    // Set once the ellipsis has been emitted; all later text is dropped.
    let isCut = false;
    const truncated = [];
    for (const substr of substrings) {
        // HTML tags are always kept so the markup stays balanced
        if (substr.startsWith('<')) {
            truncated.push(substr);
            continue;
        }
        if (isCut) {
            continue;
        }
        const remaining = length - count;
        if (substr.length > remaining - 1) {
            // Reserve one character of the budget for the ellipsis itself.
            truncated.push(substr.substring(0, remaining - 1) + '…');
            isCut = true;
        } else {
            truncated.push(substr);
            count += substr.length;
        }
    }

    // The ellipsis is already part of the cut fragment, so none is added here.
    return `<span title="${escapeTitle(noHTML)}">${truncated.join('')}</span>`;
};

示例:

// Sample input mixing tagged and untagged text. Declared with `const` so the
// snippet does not create an implicit global (which throws in strict mode).
const string = "<span class='className'>My long string that</span> I want shorter<span> but just a little bit</span>";

// Intended results; the wrapper's title attribute is emitted with double quotes.
truncateWithHTML(string,10); // <span title="My long string that I want shorter but just a little bit"><span class='className'>My long s…</span><span></span></span>
truncateWithHTML(string,22); // <span title="My long string that I want shorter but just a little bit"><span class='className'>My long string that</span> I…<span></span></span>

-4

遗憾的是,它充满了HTML标记,否则这将不是一个问题。 - Francisc

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接