在Java中读取巨大的Excel文件(500K行)

9

我正在尝试读取一个大型XLSX文件。这个Excel文件大约有50万行。我需要读取第二列。

// Opens the XLSX package and builds an XSSFWorkbook (usermodel API) on top of it,
// then walks one sheet row by row and prints the text of one cell per row.
// NOTE(review): presumably the usermodel workbook is what exhausts the heap on a
// ~500K-row file; see the OutOfMemoryError reported below the snippet.
OPCPackage pkg;
pkg = OPCPackage.open("File path");
XSSFWorkbook myWorkBook = new XSSFWorkbook(pkg);
// Sheet indexes are 0-based, so index 2 selects the third sheet.
Sheet sheet = myWorkBook.getSheetAt(2);
Iterator<Row> rowIterator = sheet.iterator();
// row_num and ROW_ESCAPE are declared outside this snippet; rows are only
// processed once row_num exceeds ROW_ESCAPE.
while (rowIterator.hasNext())
{
Row row = rowIterator.next();
if (row_num > ROW_ESCAPE) 
{
   // NOTE(review): cell indexes are 0-based, so getCell(2) is the third column,
   // while the question text asks for the second column - confirm which is intended.
   // getCell may return null for an undefined cell, which is not checked here.
   Cell cell = row.getCell(2);
  // This is the statement the reported stack trace points at (Main.java:484).
  if (!cell.getStringCellValue().toString().trim().isEmpty()) 
            {
                System.out.println(cell.getStringCellValue().toString());
            }
// Progress output: prints the current row counter for every processed row.
System.out.println("hi"+row_num);
        }
        row_num++;
 }

它打印到第39723行,之后抛出以下异常

Exception in thread "AWT-EventQueue-0" java.lang.OutOfMemoryError: Java heap space
at java.util.regex.Matcher.<init>(Matcher.java:225)
at java.util.regex.Pattern.matcher(Pattern.java:1093)
at org.apache.poi.xssf.usermodel.XSSFRichTextString.utfDecode(XSSFRichTextString.java:482)
at org.apache.poi.xssf.usermodel.XSSFRichTextString.getString(XSSFRichTextString.java:297)
at org.apache.poi.xssf.usermodel.XSSFCell.getStringCellValue(XSSFCell.java:262)
at Main.get_titles(Main.java:484)
at Main.analyze_Importsheet(Main.java:461)
at Main.but_sel_imp_sheetActionPerformed(Main.java:220)
at Main.access$000(Main.java:40)
at Main$1.actionPerformed(Main.java:85)
at javax.swing.AbstractButton.fireActionPerformed(AbstractButton.java:2022)
at javax.swing.AbstractButton$Handler.actionPerformed(AbstractButton.java:2348)
at javax.swing.DefaultButtonModel.fireActionPerformed(DefaultButtonModel.java:402)
at javax.swing.DefaultButtonModel.setPressed(DefaultButtonModel.java:259)
at javax.swing.plaf.basic.BasicButtonListener.mouseReleased(BasicButtonListener.java:252)
at java.awt.Component.processMouseEvent(Component.java:6533)
at javax.swing.JComponent.processMouseEvent(JComponent.java:3324)
at java.awt.Component.processEvent(Component.java:6298)
at java.awt.Container.processEvent(Container.java:2236)
at java.awt.Component.dispatchEventImpl(Component.java:4889)
at java.awt.Container.dispatchEventImpl(Container.java:2294)
at java.awt.Component.dispatchEvent(Component.java:4711)
at java.awt.LightweightDispatcher.retargetMouseEvent(Container.java:4888)
at java.awt.LightweightDispatcher.processMouseEvent(Container.java:4525)
at java.awt.LightweightDispatcher.dispatchEvent(Container.java:4466)
at java.awt.Container.dispatchEventImpl(Container.java:2280)
at java.awt.Window.dispatchEventImpl(Window.java:2746)
at java.awt.Component.dispatchEvent(Component.java:4711)
at java.awt.EventQueue.dispatchEventImpl(EventQueue.java:758)
at java.awt.EventQueue.access$500(EventQueue.java:97)
at java.awt.EventQueue$3.run(EventQueue.java:709)
at java.awt.EventQueue$3.run(EventQueue.java:703)

Main.java:484 对应的是 if (!cell.getStringCellValue().toString().trim().isEmpty()) 这一行。如果去掉这行代码、只打印行号,程序可以正常运行。我需要帮忙获取第二列的字符串值。

5个回答

5

1

您需要查看此链接 https://github.com/monitorjbl/excel-streaming-reader

您可以像这样编写代码

// Opens the workbook through StreamingReader, configured with a row cache and a read buffer.
// NOTE(review): neither `is` nor `workbook` is closed in this snippet - close both when done.
InputStream is = new FileInputStream(new File("/path/to/workbook.xlsx"));
Workbook workbook = StreamingReader.builder()
        .rowCacheSize(100)    // number of rows to keep in memory (defaults to 10)
        .bufferSize(4096)     // buffer size to use when reading InputStream to file (defaults to 1024)
        .open(is);            // InputStream or File for XLSX file (required)

1

我的Excel表格中有一些隐藏的工作表。使用流无法读取这些工作表。 XSSFWorkbook oldWorkbook; OPCPackage pkg; pkg = OPCPackage.open(myImport.get_path()); oldWorkbook = (XSSFWorkbook) WorkbookFactory.create(pkg); 昨天那段代码还能运行,但今天却出现了堆大小错误并停止工作。 - Rajib_Podder

0

增加JVM的堆大小可能会解决您的OutOfMemoryError问题。请参见此stackoverflow帖子以了解如何增加JVM的堆大小。


我忘了提到。我已经使用了java -Xmx1G -jar Importsheet_Breaker.jar。 - Rajib_Podder

0

这个库可以从Maven Central获取,你也可以选择自己安装。

<!-- POI for parsing Excel files-->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi</artifactId>
        <version>4.1.2</version>
    </dependency>

    <!-- POI-ooxml -->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml</artifactId>
        <version>4.1.2</version>
    </dependency>

    <!-- For reading very large Excel file -->
    <dependency>
        <groupId>com.monitorjbl</groupId>
        <artifactId>xlsx-streamer</artifactId>
        <version>2.1.0</version>
    </dependency>

要使用它,请先将上面的依赖添加到您的 POM 中;然后只需把下面的代码粘贴到名为 ReadLargeFile.java 的类文件中,即可看到效果。

import code.axis.properties.ConfigReader;
import com.monitorjbl.xlsx.StreamingReader;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.util.NumberToTextConverter;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.regex.Pattern;

/**
 * Reads a large XLSX workbook through {@code StreamingReader} and prints the
 * textual value of every cell, sheet by sheet and row by row, to standard output.
 */
public class ReadLargeFile {

    /**
     * Unsigned amount: either comma-grouped thousands or plain digits, with at
     * most two decimals, or a bare fraction such as {@code .5}.
     */
    private static final String AMOUNT =
            "([1-9]\\d{0,2}(,\\d{3})*(\\.\\d{0,2})?|[1-9]\\d*(\\.\\d{0,2})?|0(\\.\\d{0,2})?|(\\.\\d{1,2}))";

    /**
     * Matches a whole string that is a money-like amount in one of three forms:
     * {@code $-1,234.50}, {@code -$1,234.50} or {@code ($1,234.50)}.
     * Compiled once because it is applied to every string cell.
     * The first alternative starts with the {@code ^} anchor; the previous
     * version escaped it ({@code \^}), which demanded a literal caret and left
     * that alternative unanchored at the start.
     */
    private static final Pattern MONEY_PATTERN = Pattern.compile(
            "^\\$?-?" + AMOUNT + "$"
                    + "|^-?\\$?" + AMOUNT + "$"
                    + "|^\\(\\$?" + AMOUNT + "\\)$");

    /**
     * Opens the workbook at the hard-coded path and prints each cell value on
     * its own line. Any failure is reported via its stack trace.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // FilePath from your device
        try (InputStream inputStream = new FileInputStream(new File("C:/Users/Nischal/Desktop/Qualtiy Assurance of Data Clener/Extra Large Files/update_fileName01-26-2021-6-34-49.XLSX"));
             // Workbook is Closeable; closing it lets the reader release what it holds.
             Workbook workbook = StreamingReader.builder().rowCacheSize(200).bufferSize(4096).open(inputStream)) {
            for (Sheet sheet : workbook) {
                for (Row row : sheet) {
                    for (Cell cell : row) {
                        String cellValue = getStringCellValue(cell);
                        System.out.println(cellValue);
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts a cell to text according to its cell type.
     *
     * @param cell the cell to convert
     * @return the numeric text for numeric cells (and formula cells whose value
     *         parses as a number), the trimmed text for string cells (reduced
     *         to digits and dots when it looks like a money amount), the
     *         boolean as text, {@code null} for error cells, and an empty
     *         string when reading the cell fails
     */
    private static String getStringCellValue(Cell cell) {
        try {
            switch (cell.getCellType()) {
                case FORMULA:
                    try {
                        return NumberToTextConverter.toText(cell.getNumericCellValue());
                    } catch (NumberFormatException e) {
                        // Formula result is not a number: fall back to its text.
                        return cell.getStringCellValue();
                    }
                case NUMERIC:
                    return NumberToTextConverter.toText(cell.getNumericCellValue());
                case STRING:
                    String cellValue = cell.getStringCellValue().trim();
                    if (MONEY_PATTERN.matcher(cellValue).find()) {
                        // NOTE(review): this also strips '-' and parentheses, so
                        // negative amounts lose their sign - confirm that is intended.
                        return cellValue.replaceAll("[^\\d.]", "");
                    }
                    return cellValue;
                case BOOLEAN:
                    return String.valueOf(cell.getBooleanCellValue());
                case ERROR:
                    return null;
                default:
                    return cell.getStringCellValue();
            }
        } catch (Exception e) {
            // Best effort: an unreadable cell yields an empty string. The original
            // condition had no effect; it presumably guarded a warning log, so the
            // warning is emitted here when ConfigReader enables it.
            if (e.getLocalizedMessage() != null && ConfigReader.isDisplayWarnLog()) {
                System.err.println("WARN: could not read cell value: " + e.getLocalizedMessage());
            }
            return "";
        }
    }
}

无论Excel文件有多大,该代码都会逐个打印单元格的值。


网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接