在 Java 中读取巨大的 Excel 文件(500K 行)

时间:2017-02-22 00:50:35

标签: java excel apache-poi

我正在尝试读取一个很大的 XLSX 文件。该 Excel 文件大约有 500K 行,我需要读取第 2 列。

// Opens the workbook through the XSSF user model and walks every row of one sheet,
// printing the string value found at cell index 2 of each row past ROW_ESCAPE.
// row_num and ROW_ESCAPE are not declared in this snippet; they come from the surrounding code.
OPCPackage pkg;
pkg = OPCPackage.open("File path");
XSSFWorkbook myWorkBook = new XSSFWorkbook(pkg);
// NOTE(review): POI sheet indexes are 0-based, so index 2 would be the third sheet - confirm.
Sheet sheet = myWorkBook.getSheetAt(2);
Iterator<Row> rowIterator = sheet.iterator();
while (rowIterator.hasNext())
{
Row row = rowIterator.next();
// Only rows whose running counter is greater than ROW_ESCAPE are processed.
if (row_num > ROW_ESCAPE) 
{
   // NOTE(review): POI cell indexes are 0-based, so index 2 would be the third column - confirm
   // this is the intended "column 2". getCell presumably returns null for a missing cell,
   // which would make the next line throw a NullPointerException - verify.
   Cell cell = row.getCell(2);
  // This is the statement the stack trace points at (Main.java:484): the OutOfMemoryError
  // is raised inside getStringCellValue().
  if (!cell.getStringCellValue().toString().trim().isEmpty()) 
            {
                System.out.println(cell.getStringCellValue().toString());
            }
// Progress output: prints "hi" followed by the current row counter.
System.out.println("hi"+row_num);
        }
        row_num++;
 }

打印到第 39723 行之后,它抛出以下异常:

Exception in thread "AWT-EventQueue-0" java.lang.OutOfMemoryError: Java heap space
at java.util.regex.Matcher.<init>(Matcher.java:225)
at java.util.regex.Pattern.matcher(Pattern.java:1093)
at org.apache.poi.xssf.usermodel.XSSFRichTextString.utfDecode(XSSFRichTextString.java:482)
at org.apache.poi.xssf.usermodel.XSSFRichTextString.getString(XSSFRichTextString.java:297)
at org.apache.poi.xssf.usermodel.XSSFCell.getStringCellValue(XSSFCell.java:262)
at Main.get_titles(Main.java:484)
at Main.analyze_Importsheet(Main.java:461)
at Main.but_sel_imp_sheetActionPerformed(Main.java:220)
at Main.access$000(Main.java:40)
at Main$1.actionPerformed(Main.java:85)
at javax.swing.AbstractButton.fireActionPerformed(AbstractButton.java:2022)
at javax.swing.AbstractButton$Handler.actionPerformed(AbstractButton.java:2348)
at javax.swing.DefaultButtonModel.fireActionPerformed(DefaultButtonModel.java:402)
at javax.swing.DefaultButtonModel.setPressed(DefaultButtonModel.java:259)
at javax.swing.plaf.basic.BasicButtonListener.mouseReleased(BasicButtonListener.java:252)
at java.awt.Component.processMouseEvent(Component.java:6533)
at javax.swing.JComponent.processMouseEvent(JComponent.java:3324)
at java.awt.Component.processEvent(Component.java:6298)
at java.awt.Container.processEvent(Container.java:2236)
at java.awt.Component.dispatchEventImpl(Component.java:4889)
at java.awt.Container.dispatchEventImpl(Container.java:2294)
at java.awt.Component.dispatchEvent(Component.java:4711)
at java.awt.LightweightDispatcher.retargetMouseEvent(Container.java:4888)
at java.awt.LightweightDispatcher.processMouseEvent(Container.java:4525)
at java.awt.LightweightDispatcher.dispatchEvent(Container.java:4466)
at java.awt.Container.dispatchEventImpl(Container.java:2280)
at java.awt.Window.dispatchEventImpl(Window.java:2746)
at java.awt.Component.dispatchEvent(Component.java:4711)
at java.awt.EventQueue.dispatchEventImpl(EventQueue.java:758)
at java.awt.EventQueue.access$500(EventQueue.java:97)
at java.awt.EventQueue$3.run(EventQueue.java:709)
at java.awt.EventQueue$3.run(EventQueue.java:703)

Main.java:484 对应的代码是 `if (!cell.getStringCellValue().toString().trim().isEmpty())`。如果我删除该行、只打印行号,程序可以正常运行。我需要帮助:如何获取第 2 列的字符串值?

5 个答案:

答案 0 :(得分:3)

使用流式的事件模型(event model)API,而不是内存中的用户模型(user model)API。

请参阅Apache POI: How to use the HSSF Event API

答案 1 :(得分:0)

增加JVM的堆大小可能会修复OutOfMemoryError。有关如何增加JVM堆大小的信息,请参阅this stackoverflow post

答案 2 :(得分:0)

最简单的方法(无需改动读取逻辑)是增加堆大小。

如果这对您来说不是一个可行的选择,请使用流式读取。实际上,已经有一个现成的库可以方便地做到这一点。

https://github.com/monitorjbl/excel-streaming-reader

答案 3 :(得分:0)

您需要查看此https://github.com/monitorjbl/excel-streaming-reader

您可以像这样使用它:

InputStream is = new FileInputStream(new File("/path/to/workbook.xlsx"));
Workbook workbook = StreamingReader.builder()
        .rowCacheSize(100)    // number of rows to keep in memory (defaults to 10)
        .bufferSize(4096)     // buffer size to use when reading InputStream to file (defaults to 1024)
        .open(is);  

      // InputStream or File for XLSX file (required)

答案 4 :(得分:0)

该库可从 Maven Central 获得,您也可以选择自行安装。

<!-- Apache POI core module, used for parsing Excel files -->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi</artifactId>
        <version>4.1.2</version>
    </dependency>

    <!-- Apache POI OOXML module (support for the OOXML formats such as .xlsx) -->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml</artifactId>
        <version>4.1.2</version>
    </dependency>

    <!-- Streaming reader (com.monitorjbl:xlsx-streamer) for reading very large Excel files -->
    <dependency>
        <groupId>com.monitorjbl</groupId>
        <artifactId>xlsx-streamer</artifactId>
        <version>2.1.0</version>
    </dependency>

将上述依赖添加到您的 POM 之后,只需把下面的代码粘贴到名为 ReadLargeFile.java 的类中即可运行:

import code.axis.properties.ConfigReader;
import com.monitorjbl.xlsx.StreamingReader;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.util.NumberToTextConverter;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.regex.Pattern;

/**
 * Prints every cell of every sheet of a large XLSX workbook, one value per line.
 *
 * <p>The workbook is opened through {@code StreamingReader}, so rows are read through a
 * bounded row cache instead of being loaded all at once.
 */
public class ReadLargeFile {

    /** Workbook that is read when no path is passed on the command line. */
    private static final String DEFAULT_WORKBOOK_PATH =
            "C:/Users/Nischal/Desktop/Qualtiy Assurance of Data Clener/Extra Large Files/update_fileName01-26-2021-6-34-49.XLSX";

    /**
     * An unsigned amount: digits with optional thousands separators and at most two
     * decimals, e.g. {@code 1,234.50}, {@code 1234}, {@code 0.5} or {@code .75}.
     */
    private static final String AMOUNT =
            "([1-9]\\d{0,2}(,\\d{3})*(\\.\\d{0,2})?|[1-9]\\d*(\\.\\d{0,2})?|0(\\.\\d{0,2})?|(\\.\\d{1,2}))";

    /**
     * Matches a whole string that is a currency-style amount: {@code $-1.00}, {@code -$1.00}
     * or the accounting form {@code ($1.00)}. Compiled once because it is applied to every
     * string cell. Each alternative is anchored with {@code ^}; the previous version escaped
     * the first anchor ({@code \^}), which required a literal caret in the cell text.
     */
    private static final Pattern CURRENCY_PATTERN = Pattern.compile(
            "^\\$?-?" + AMOUNT + "$"
                    + "|^-?\\$?" + AMOUNT + "$"
                    + "|^\\(\\$?" + AMOUNT + "\\)$");

    /**
     * Streams the workbook and prints each cell value on its own line.
     *
     * @param args optional; {@code args[0]} is the path of the XLSX file to read. When absent,
     *             {@link #DEFAULT_WORKBOOK_PATH} is used.
     */
    public static void main(String[] args) {
        String workbookPath = args.length > 0 ? args[0] : DEFAULT_WORKBOOK_PATH;

        // Both resources are closed when the block exits; the workbook is closed first.
        try (InputStream inputStream = new FileInputStream(new File(workbookPath));
             Workbook workbook = StreamingReader.builder()
                     .rowCacheSize(200)
                     .bufferSize(4096)
                     .open(inputStream)) {
            for (Sheet sheet : workbook) {
                for (Row row : sheet) {
                    for (Cell cell : row) {
                        System.out.println(getStringCellValue(cell));
                    }
                }
            }
        } catch (Exception e) {
            // Top-level boundary of a command-line tool: report the failure and exit.
            e.printStackTrace();
        }
    }

    /**
     * Converts a cell to its textual representation.
     *
     * @param cell the cell to read
     * @return the cell text; numeric cells are rendered without formatting, currency-like
     *         strings are reduced to a plain number, and error cells or cells that cannot
     *         be read yield an empty string
     */
    private static String getStringCellValue(Cell cell) {
        try {
            switch (cell.getCellType()) {
                case FORMULA:
                    try {
                        return NumberToTextConverter.toText(cell.getNumericCellValue());
                    } catch (NumberFormatException | IllegalStateException e) {
                        // The formula result is not numeric; fall back to its text.
                        return cell.getStringCellValue();
                    }
                case NUMERIC:
                    return NumberToTextConverter.toText(cell.getNumericCellValue());
                case STRING:
                    return normalizeText(cell.getStringCellValue());
                case BOOLEAN:
                    return String.valueOf(cell.getBooleanCellValue());
                case ERROR:
                    // Consistent with the failure path below: never hand null to the caller.
                    return "";
                default:
                    return cell.getStringCellValue();
            }
        } catch (RuntimeException e) {
            // Best effort: a single unreadable cell must not abort the whole run.
            // NOTE(review): ConfigReader.isDisplayWarnLog() is assumed to be the switch for
            // warning output; confirm against the ConfigReader implementation.
            if (e.getLocalizedMessage() != null && ConfigReader.isDisplayWarnLog()) {
                System.err.println("Could not read cell value: " + e.getLocalizedMessage());
            }
            return "";
        }
    }

    /**
     * Trims a string cell value and, when it looks like a currency amount, strips the
     * currency symbol and thousands separators while keeping the sign.
     *
     * @param raw the raw cell text
     * @return the trimmed text, or the plain number (e.g. {@code "-1234.50"} for
     *         {@code "($1,234.50)"}) when the text is a currency amount
     */
    private static String normalizeText(String raw) {
        String text = raw.trim();
        if (!CURRENCY_PATTERN.matcher(text).find()) {
            return text;
        }
        String digits = text.replaceAll("[^\\d.]", "");
        // A leading minus or accounting parentheses both denote a negative amount.
        boolean negative = text.indexOf('-') >= 0 || text.startsWith("(");
        return negative ? "-" + digits : digits;
    }
}

无论excel文件多大,代码都会一一打印单元格值。