如何在Java中读取Doc或Docx文件?

23

我想在Java中读取一个Word文件

import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hwpf.*;
import org.apache.poi.hwpf.extractor.*;
import org.apache.poi.hwpf.usermodel.HeaderStories;

import java.io.*;

public class ReadDocFileFromJava {

    public static void main(String[] args) {
        /**This is the document that you want to read using Java.**/
        String fileName = "C:\\Path to file\\Test.doc";

        /**Method call to read the document (demonstrate some useage of POI)**/
        readMyDocument(fileName);

    }
    public static void readMyDocument(String fileName){
        POIFSFileSystem fs = null;
        try {
            fs = new POIFSFileSystem(new FileInputStream(fileName));
            HWPFDocument doc = new HWPFDocument(fs);

            /** Read the content **/
            readParagraphs(doc);

            int pageNumber=1;

            /** We will try reading the header for page 1**/
            readHeader(doc, pageNumber);

            /** Let's try reading the footer for page 1**/
            readFooter(doc, pageNumber);

            /** Read the document summary**/
            readDocumentSummary(doc);

        } catch (Exception e) {
            e.printStackTrace();
        }
    }  

    public static void readParagraphs(HWPFDocument doc) throws Exception{
        WordExtractor we = new WordExtractor(doc);

        /**Get the total number of paragraphs**/
        String[] paragraphs = we.getParagraphText();
        System.out.println("Total Paragraphs: "+paragraphs.length);

        for (int i = 0; i < paragraphs.length; i++) {

            System.out.println("Length of paragraph "+(i +1)+": "+ paragraphs[i].length());
            System.out.println(paragraphs[i].toString());

        }

    }

    public static void readHeader(HWPFDocument doc, int pageNumber){
        HeaderStories headerStore = new HeaderStories( doc);
        String header = headerStore.getHeader(pageNumber);
        System.out.println("Header Is: "+header);

    }

    public static void readFooter(HWPFDocument doc, int pageNumber){
        HeaderStories headerStore = new HeaderStories( doc);
        String footer = headerStore.getFooter(pageNumber);
        System.out.println("Footer Is: "+footer);

    }

    public static void readDocumentSummary(HWPFDocument doc) {
        DocumentSummaryInformation summaryInfo=doc.getDocumentSummaryInformation();
        String category = summaryInfo.getCategory();
        String company = summaryInfo.getCompany();
        int lineCount=summaryInfo.getLineCount();
        int sectionCount=summaryInfo.getSectionCount();
        int slideCount=summaryInfo.getSlideCount();


    enter code here
        System.out.println("---------------------------");
        System.out.println("Category: "+category);
        System.out.println("Company: "+company);
        System.out.println("Line Count: "+lineCount);
        System.out.println("Section Count: "+sectionCount);
        System.out.println("Slide Count: "+slideCount);

    }

}

http://sanjaal.com/java/tag/java-and-docx-format/

我想在Java中读取一个doc或docx文件。


2
你在这里并没有真正提出问题。如果没有更多的细节,这个问题很可能会被关闭。你的目标是什么(查看、处理、编辑、打印)?你已经尝试了什么?有什么不起作用的地方吗?你是否收到错误信息? - OverZealous
1个回答

28

以下是ReadDoc/docx.java的代码:这将读取一个doc/docx文件并将其内容打印到控制台,您可以按照自己的方式进行自定义。

import java.io.*;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;

public class ReadDocFile
{
    public static void main(String[] args)
    {
        File file = null;
        WordExtractor extractor = null;
        try
        {

            file = new File("c:\\New.doc");
            FileInputStream fis = new FileInputStream(file.getAbsolutePath());
            HWPFDocument document = new HWPFDocument(fis);
            extractor = new WordExtractor(document);
            String[] fileData = extractor.getParagraphText();
            for (int i = 0; i < fileData.length; i++)
            {
                if (fileData[i] != null)
                    System.out.println(fileData[i]);
            }
        }
        catch (Exception exep)
        {
            exep.printStackTrace();
        }
    }
}

11
你可能想要处理那个异常。 - Thorbjørn Ravn Andersen
1
在这种情况下,单词提取器只能提供文档文件的文本内容。它甚至没有提到段落的起始或结束位置... - Wolverine
1
除了文本内容(图像、条形码等),其他内容怎么样?您能否进一步编辑代码以读取完整数据? - Dinesh Kumar P
这个方法是否有办法通过偏移量知道页面编号? - Muneem Habib
2
这仅适用于2007年之前的Office版本。 - achAmháin

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接