我在我的C# winform应用程序中使用iTextSharp。我想要获取PDF文件中的特定段落。在iTextSharp中是否有可能实现这一点?
PDF本身并没有"段落"或"行"的概念,文本只是被绘制到页面的指定坐标处。同一句话在PDF内部既可能表示为:
draw "the cat in the hat" at 10,10
也可能表示为:draw "t" at 10,10, then draw "h" at 14,10, then draw "e" at 18,10
等等。这在使用Adobe InDesign等强大设计程序生成的PDF中非常常见。
iTextSharp提供了一个名为PdfTextExtractor的类,它有一个名为GetTextFromPage的方法,可以获取页面上的所有原始文本。此方法的最后一个参数是实现ITextExtractionStrategy接口的对象。如果您创建自己的类来实现此接口,就可以处理每个文本串(run)并执行自己的逻辑。该接口中有一个名为RenderText
的方法,它将针对每个文本串进行调用。您将得到一个iTextSharp.text.pdf.parser.TextRenderInfo
对象,从中可以获取文本串的原始文本以及其他内容,例如当前起始坐标、当前字体等。由于一条视觉文本行可以由多个文本串组成,因此可以在此方法中将当前文本串的基线(起始点的y坐标)与上一个文本串的基线进行比较,以确定它们是否属于同一视觉行。
public class TextAsParagraphsExtractionStrategy : iTextSharp.text.pdf.parser.ITextExtractionStrategy {
//Text buffer: accumulates the text of the visual line that is currently being assembled
private StringBuilder result = new StringBuilder();
//Store last used properties: start point of the baseline of the most recently rendered run (null until the first run arrives)
private Vector lastBaseLine;
//Buffer of lines of text and their Y coordinates; entry i of strings pairs with entry i of baselines.
//NOTE, these should be exposed as properties instead of fields but are left as is for simplicity's sake
public List<string> strings = new List<String>();
public List<float> baselines = new List<float>();
//This is called whenever a run of text is encountered
public void RenderText(iTextSharp.text.pdf.parser.TextRenderInfo renderInfo) {
    //Start point of this run's baseline; its Y component identifies the visual line
    Vector runStart = renderInfo.GetBaseline().GetStartPoint();

    //A new line begins when a previous run exists and its baseline Y differs from this run's
    bool startsNewLine = (this.lastBaseLine != null) && (runStart[Vector.I2] != this.lastBaseLine[Vector.I2]);
    if (startsNewLine) {
        //Record the finished line, but only if it holds more than whitespace
        string finishedLine = this.result.ToString();
        if (!String.IsNullOrWhiteSpace(finishedLine)) {
            this.baselines.Add(this.lastBaseLine[Vector.I2]);
            this.strings.Add(finishedLine);
        }
        //Start collecting the next line from scratch
        this.result.Clear();
    }

    //The current run always joins the line being built
    this.result.Append(renderInfo.GetText());
    //Remember where this run sat so the next run can be compared against it
    this.lastBaseLine = runStart;
}
//Flushes the last buffered line into the strings/baselines buffers.
//Always returns null: callers are expected to read the strings and baselines fields instead of the return value.
public string GetResultantText() {
    //The final visual line is never followed by a baseline change, so it has to be flushed here
    string pending = this.result.ToString();
    if (!String.IsNullOrWhiteSpace(pending)) {
        this.baselines.Add(this.lastBaseLine[Vector.I2]);
        this.strings.Add(pending);
    }
    //Empty the buffer so that a second call, or a later RenderText on the same instance, cannot record this line twice
    this.result.Clear();
    //We're not going to use this method to return a string; callers should inspect this class's strings and baselines fields.
    return null;
}
//Not needed by this strategy; present only because they are part of the interface contract.
//All three are intentionally empty.
public void BeginTextBlock() { }
public void EndTextBlock() { }
public void RenderImage(ImageRenderInfo renderInfo) { }
}
//Extract page 1 of workingFile as lines and print each one together with its baseline Y coordinate
PdfReader reader = new PdfReader(workingFile);
TextAsParagraphsExtractionStrategy S = new TextAsParagraphsExtractionStrategy();
try {
    //The return value is ignored on purpose: the strategy collects its results in its strings/baselines fields
    iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, 1, S);
    for (int i = 0; i < S.strings.Count; i++) {
        Console.WriteLine("Line {0,-5}: {1}", S.baselines[i], S.strings[i]);
    }
} finally {
    //Release the reader's hold on the file even if extraction throws
    reader.Close();
}
请注意,我们并没有使用GetTextFromPage返回的值,而是检查策略对象的baselines和strings
数组字段。下一步是比较基线并尝试确定如何将行分组以成为段落。 using (FileStream fs = new FileStream(workingFile, FileMode.Create, FileAccess.Write, FileShare.None)) {
using (Document doc = new Document(PageSize.LETTER)) {
    using (PdfWriter writer = PdfWriter.GetInstance(doc, fs)) {
        //Sample content: one long paragraph followed by four single-word paragraphs
        string[] sampleParagraphs = {
            "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Maecenas porttitor congue massa. Fusce posuere, magna sed pulvinar ultricies, purus lectus malesuada libero, sit amet commodo magna eros quis urna.",
            "This",
            "Is",
            "A",
            "Test"
        };

        doc.Open();
        //Add the paragraphs in the order listed above
        foreach (string paragraphText in sampleParagraphs) {
            doc.Add(new Paragraph(paragraphText));
        }
        doc.Close();
    }
}
}