根据其他答案和这个问题的评论,大多数来这里的人似乎真正寻求的是一个通用解决方案,可以提供HTML文档的格式良好的纯文本表示。我知道我也是。
幸运的是,JSoup已经提供了一个非常全面的示例,说明如何实现这一点:HtmlToPlainText.java
示例中的FormattingVisitor
可以轻松地按照您的喜好进行调整,并处理大多数块元素和换行。
为避免链接失效,这里是Jonathan Hedley的完整解决方案:
package org.jsoup.examples;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import java.io.IOException;
public class HtmlToPlainText {
private static final String userAgent = "Mozilla/5.0 (jsoup)";
private static final int timeout = 5 * 1000;
public static void main(String... args) throws IOException {
Validate.isTrue(args.length == 1 || args.length == 2, "usage: java -cp jsoup.jar org.jsoup.examples.HtmlToPlainText url [selector]");
final String url = args[0];
final String selector = args.length == 2 ? args[1] : null;
Document doc = Jsoup.connect(url).userAgent(userAgent).timeout(timeout).get();
HtmlToPlainText formatter = new HtmlToPlainText();
if (selector != null) {
Elements elements = doc.select(selector);
for (Element element : elements) {
String plainText = formatter.getPlainText(element);
System.out.println(plainText);
}
} else {
String plainText = formatter.getPlainText(doc);
System.out.println(plainText);
}
}
public String getPlainText(Element element) {
FormattingVisitor formatter = new FormattingVisitor();
NodeTraversor traversor = new NodeTraversor(formatter);
traversor.traverse(element);
return formatter.toString();
}
private class FormattingVisitor implements NodeVisitor {
private static final int maxWidth = 80;
private int width = 0;
private StringBuilder accum = new StringBuilder();
public void head(Node node, int depth) {
String name = node.nodeName();
if (node instanceof TextNode)
append(((TextNode) node).text());
else if (name.equals("li"))
append("\n * ");
else if (name.equals("dt"))
append(" ");
else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr"))
append("\n");
}
public void tail(Node node, int depth) {
String name = node.nodeName();
if (StringUtil.in(name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5"))
append("\n");
else if (name.equals("a"))
append(String.format(" <%s>", node.absUrl("href")));
}
private void append(String text) {
if (text.startsWith("\n"))
width = 0;
if (text.equals(" ") &&
(accum.length() == 0 || StringUtil.in(accum.substring(accum.length() - 1), " ", "\n")))
return;
if (text.length() + width > maxWidth) {
String words[] = text.split("\\s+");
for (int i = 0; i < words.length; i++) {
String word = words[i];
boolean last = i == words.length - 1;
if (!last)
word = word + " ";
if (word.length() + width > maxWidth) {
accum.append("\n").append(word);
width = word.length();
} else {
accum.append(word);
width += word.length();
}
}
} else {
accum.append(text);
width += text.length();
}
}
@Override
public String toString() {
return accum.toString();
}
}
}