有没有一种简单的方法将Excel内联字符串转换为Java中的共享字符串表?

时间:2017-12-20 06:08:50

标签: java excel

我正在尝试创建一个简单的Java程序,将Excel文件从内联字符串转换为共享字符串表,以减小文件大小。

我知道Apache POI有一个SXSSFWorkbook类可以完成这项工作,但使用SAX XML解析器读取带有内联字符串的大型xlsx文件时仍然会因内存不足而崩溃,例如150,000行×50列的单元格。

有没有使用Apache POI库来完成简单工作的简单解决方案?有人知道吗?

2 个答案:

答案 0 :(得分:1)

在 Adding a row to a large xlsx file (Out of Memory) 这个回答中,我提供了一种方法,使用StAX将行写入Excel工作表,而无需打开整个工作簿。但是使用了共享字符串表。

所以这是一个稍微修改过的版本。

您将从一个类似下面这样的 ReadAndWriteTest.xlsx 文件开始:

enter image description here

每次运行代码时,将添加100,000行,其中A列包含随机字符串,B列包含随机双精度值。字符串由共享字符串表管理。因此,共享字符串表中唯一字符串的数量将少于工作表中字符串的总数。

我在生产环境中使用这种方法,当然生产代码更复杂、结构也更清晰,因为这个示例只是用简单的代码来展示思路。它运行良好,性能比SXSSF更高,并且同时支持读取和写入。

import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;

import org.apache.poi.xssf.model.SharedStringsTable;

import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRst;

import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.XMLEvent;

import javax.xml.namespace.QName;

import java.io.File;
import java.io.InputStream;
import java.io.OutputStream;

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import java.util.concurrent.ThreadLocalRandom;

/**
 * Appends rows to the first sheet of {@code ReadAndWriteTest.xlsx} by streaming the
 * sheet XML with StAX instead of loading the whole workbook.
 *
 * <p>Each run appends {@link #ROWS_PER_RUN} rows. Column A holds a random substring of
 * {@link #LOREM_IPSUM} stored through the shared strings table, column B holds a random
 * double.
 */
class StaxReadAndWriteTest {

 /** Text from which the random strings for column A are cut. */
 private static final String LOREM_IPSUM = "Lorem ipsum dolor sit amet ne mei euismod interpretaris est te iusto causae doctus.";

 /** Number of rows appended per run. */
 private static final int ROWS_PER_RUN = 100000;

 /**
  * Entry point: appends the rows to {@code ReadAndWriteTest.xlsx} in the working directory.
  * Any failure is reported on standard error.
  *
  * @param args ignored
  */
 public static void main(String[] args) {
  try {
   appendRandomRows(new File("ReadAndWriteTest.xlsx"), ROWS_PER_RUN);
  } catch (Exception ex) {
     ex.printStackTrace();
  }
 }

 /**
  * Opens the package, rewrites sheet1 with {@code rowCount} additional rows and stores the
  * updated shared strings table.
  *
  * <p>The package is saved (via {@code close()}) only when every step succeeded; on any
  * failure it is released via {@code revert()} so that a half-rewritten sheet is not
  * written back to the file.
  */
 private static void appendRandomRows(File file, int rowCount) throws Exception {
  OPCPackage opcpackage = OPCPackage.open(file);
  boolean succeeded = false;
  try {
   //if there are strings in the sheet data, we need the SharedStringsTable
   PackagePart sharedstringstablepart = requirePart(opcpackage, "/xl/sharedStrings.xml");
   SharedStringsTable sharedstringstable = new SharedStringsTable();
   try (InputStream in = sharedstringstablepart.getInputStream()) {
    sharedstringstable.readFrom(in);
   }

   PackagePart sheetpart = requirePart(opcpackage, "/xl/worksheets/sheet1.xml");
   copySheetAppendingRows(sheetpart, sharedstringstable, rowCount);

   //write the SharedStringsTable
   try (OutputStream out = sharedstringstablepart.getOutputStream()) {
    sharedstringstable.writeTo(out);
   }
   succeeded = true;
  } finally {
   if (succeeded) {
    opcpackage.close();
   } else {
    opcpackage.revert();
   }
  }
 }

 /** Returns the first part whose name matches the pattern, or fails with a descriptive message. */
 private static PackagePart requirePart(OPCPackage opcpackage, String partNameRegex) throws Exception {
  List<PackagePart> parts = opcpackage.getPartsByName(Pattern.compile(partNameRegex));
  if (parts.isEmpty()) {
   throw new IllegalStateException("Package contains no part matching " + partNameRegex);
  }
  return parts.get(0);
 }

 /**
  * Copies every XML event of the sheet part back into the same part and inserts
  * {@code rowCount} new rows immediately before the end of {@code sheetData}.
  *
  * <p>Keying on the {@code sheetData} end element (instead of peeking behind the last
  * {@code row}) also works when whitespace follows the last row or when the sheet has
  * no rows yet.
  */
 private static void copySheetAppendingRows(PackagePart sheetpart, SharedStringsTable sharedstringstable, int rowCount) throws Exception {
  XMLInputFactory inputFactory = XMLInputFactory.newInstance();
  //the workbook may come from an untrusted source: do not process DTDs or external entities
  inputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
  inputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
  XMLEventFactory eventFactory = XMLEventFactory.newInstance();

  //the input stream must be obtained before the output stream, as in the original flow
  try (InputStream in = sheetpart.getInputStream();
       OutputStream out = sheetpart.getOutputStream()) {
   XMLEventReader reader = inputFactory.createXMLEventReader(in);
   XMLEventWriter writer = XMLOutputFactory.newInstance().createXMLEventWriter(out);

   while (reader.hasNext()) { //loop over all XML in sheet1.xml
    XMLEvent event = reader.nextEvent();
    if (event.isEndElement()
        && "sheetData".equals(event.asEndElement().getName().getLocalPart())) {
     //all existing rows are written, so the new rows go here
     for (int i = 0; i < rowCount; i++) {
      writeRandomRow(writer, eventFactory, sharedstringstable);
     }
    }
    writer.add(event); //by default write each read event
   }

   writer.flush();
   writer.close();
   reader.close();
  }
 }

 /** Writes one {@code row} element with a shared-string cell (A) and a numeric cell (B). */
 private static void writeRandomRow(XMLEventWriter writer, XMLEventFactory eventFactory, SharedStringsTable sharedstringstable) throws javax.xml.stream.XMLStreamException {
  writer.add(eventFactory.createStartElement(new QName("row"), null, null));

  //create a random string from LOREM_IPSUM
  int length = ThreadLocalRandom.current().nextInt(5, 20);
  int index = ThreadLocalRandom.current().nextInt(0, LOREM_IPSUM.length() - length);
  CTRst ctstr = CTRst.Factory.newInstance();
  ctstr.setT(LOREM_IPSUM.substring(index, index + length).trim());
  //update SharedStringsTable with CTRst and get sRef as the ID of this string
  int sRef = sharedstringstable.addEntry(ctstr);

  //cell A: t="s" marks the value as an index into the shared strings table
  List<Attribute> sharedStringType = Arrays.asList(eventFactory.createAttribute("t", "s"));
  writer.add(eventFactory.createStartElement(new QName("c"), sharedStringType.iterator(), null));
  writeValue(writer, eventFactory, Integer.toString(sRef));
  writer.add(eventFactory.createEndElement(new QName("c"), null));

  //cell B: random double value below the string length
  writer.add(eventFactory.createStartElement(new QName("c"), null, null));
  writeValue(writer, eventFactory, Double.toString(ThreadLocalRandom.current().nextDouble((double) length)));
  writer.add(eventFactory.createEndElement(new QName("c"), null));

  writer.add(eventFactory.createEndElement(new QName("row"), null));
 }

 /** Writes a {@code v} element containing the given text. */
 private static void writeValue(XMLEventWriter writer, XMLEventFactory eventFactory, String text) throws javax.xml.stream.XMLStreamException {
  writer.add(eventFactory.createStartElement(new QName("v"), null, null));
  writer.add(eventFactory.createCharacters(text));
  writer.add(eventFactory.createEndElement(new QName("v"), null));
 }
}

答案 1 :(得分:1)

虽然先创建包含内联字符串的工作表、再用共享字符串替换这些内联字符串会非常低效,但我还是给出一个按问题要求实现的答案。

需要做的是:遍历工作表XML文件中的所有单元格,找出内联字符串;然后在sharedStrings.xml中查找该字符串是否已存在。如果已存在,则获取其ID;否则在sharedStrings.xml中创建一个新字符串并获取其ID。最后将该ID写入工作表XML文件的单元格中,以替换内联字符串值。

以下代码实现了这一操作。如果TestInlineStrings.xlsx的第一个工作表中有内联字符串,那么运行此代码后,这些内联字符串将被替换为共享字符串。

import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;

import org.apache.poi.xssf.model.SharedStringsTable;

import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRst;

import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.XMLEvent;

import javax.xml.namespace.QName;

import java.io.File;
import java.io.InputStream;
import java.io.OutputStream;

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
import java.util.Iterator;

/**
 * Replaces every inline-string cell ({@code t="inlineStr"}) in the first sheet of
 * {@code TestInlineStrings.xlsx} by a shared-string cell ({@code t="s"}), streaming the
 * sheet XML with StAX instead of loading the whole workbook.
 *
 * <p>The text of an inline string is the concatenation of all {@code t} elements inside
 * the cell, so rich-text runs are flattened to plain text (their formatting is not kept).
 * Text inside phonetic runs ({@code rPh}) is not part of the string and is skipped.
 */
class StaxReplaceInlineStrings {

 /**
  * Entry point: converts {@code TestInlineStrings.xlsx} in the working directory.
  * Any failure is reported on standard error.
  *
  * @param args ignored
  */
 public static void main(String[] args) {
  try {
   replaceInlineStrings(new File("TestInlineStrings.xlsx"));
  } catch (Exception ex) {
     ex.printStackTrace();
  }
 }

 /**
  * Opens the package, rewrites sheet1 and stores the updated shared strings table.
  *
  * <p>The package is saved (via {@code close()}) only when every step succeeded; on any
  * failure it is released via {@code revert()} so that a half-rewritten sheet is not
  * written back to the file.
  */
 private static void replaceInlineStrings(File file) throws Exception {
  OPCPackage opcpackage = OPCPackage.open(file);
  boolean succeeded = false;
  try {
   //if there are strings in the sheet data, we need the SharedStringsTable
   PackagePart sharedstringstablepart = requirePart(opcpackage, "/xl/sharedStrings.xml");
   SharedStringsTable sharedstringstable = new SharedStringsTable();
   try (InputStream in = sharedstringstablepart.getInputStream()) {
    sharedstringstable.readFrom(in);
   }

   PackagePart sheetpart = requirePart(opcpackage, "/xl/worksheets/sheet1.xml");
   copySheetReplacingInlineStrings(sheetpart, sharedstringstable);

   //write the SharedStringsTable
   try (OutputStream out = sharedstringstablepart.getOutputStream()) {
    sharedstringstable.writeTo(out);
   }
   succeeded = true;
  } finally {
   if (succeeded) {
    opcpackage.close();
   } else {
    opcpackage.revert();
   }
  }
 }

 /** Returns the first part whose name matches the pattern, or fails with a descriptive message. */
 private static PackagePart requirePart(OPCPackage opcpackage, String partNameRegex) throws Exception {
  List<PackagePart> parts = opcpackage.getPartsByName(Pattern.compile(partNameRegex));
  if (parts.isEmpty()) {
   throw new IllegalStateException("Package contains no part matching " + partNameRegex);
  }
  return parts.get(0);
 }

 /**
  * Copies every XML event of the sheet part back into the same part, except that each
  * inline-string cell is replaced by a cell referencing the shared strings table.
  */
 private static void copySheetReplacingInlineStrings(PackagePart sheetpart, SharedStringsTable sharedstringstable) throws Exception {
  XMLInputFactory inputFactory = XMLInputFactory.newInstance();
  //the workbook may come from an untrusted source: do not process DTDs or external entities
  inputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
  inputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
  XMLEventFactory eventFactory = XMLEventFactory.newInstance();

  //the input stream must be obtained before the output stream, as in the original flow
  try (InputStream in = sheetpart.getInputStream();
       OutputStream out = sheetpart.getOutputStream()) {
   XMLEventReader reader = inputFactory.createXMLEventReader(in);
   XMLEventWriter writer = XMLOutputFactory.newInstance().createXMLEventWriter(out);

   while (reader.hasNext()) { //loop over all XML in sheet1.xml
    XMLEvent event = reader.nextEvent();
    if (isInlineStringCellStart(event)) {
     StartElement cellStart = event.asStartElement(); //remember cell start
     String inlineString = readInlineStringToCellEnd(reader);

     //create shared string in shared strings table
     CTRst ctstr = CTRst.Factory.newInstance();
     ctstr.setT(inlineString);
     int sRef = sharedstringstable.addEntry(ctstr);

     writeSharedStringCell(writer, eventFactory, cellStart, sRef);
    } else {
     writer.add(event); //by default write each read event, except cell was replaced
    }
   }

   writer.flush();
   writer.close();
   reader.close();
  }
 }

 /** Tells whether the event is the start of a {@code c} element whose {@code t} attribute is {@code inlineStr}. */
 private static boolean isInlineStringCellStart(XMLEvent event) {
  if (!event.isStartElement()) {
   return false;
  }
  StartElement startElement = event.asStartElement();
  if (!startElement.getName().getLocalPart().equalsIgnoreCase("c")) {
   return false;
  }
  Iterator<?> attributeIterator = startElement.getAttributes();
  while (attributeIterator.hasNext()) {
   Attribute attribute = (Attribute) attributeIterator.next();
   if ("t".equals(attribute.getName().getLocalPart())) { //cell has type attribute
    return "inlineStr".equals(attribute.getValue());
   }
  }
  return false;
 }

 /**
  * Consumes all events up to and including the end element of the current cell and
  * returns the text found in its {@code t} elements.
  *
  * <p>Character data is accumulated across events because a parser may deliver one text
  * node in several pieces, and a {@code t} element may also be empty.
  *
  * @throws IllegalStateException if the document ends before the cell is closed
  */
 private static String readInlineStringToCellEnd(XMLEventReader reader) throws javax.xml.stream.XMLStreamException {
  StringBuilder text = new StringBuilder();
  boolean insidePhoneticRun = false;
  boolean insideText = false;
  while (reader.hasNext()) {
   XMLEvent event = reader.nextEvent();
   if (event.isStartElement()) {
    String name = event.asStartElement().getName().getLocalPart();
    if ("rPh".equals(name)) {
     insidePhoneticRun = true;
    } else if (name.equalsIgnoreCase("t") && !insidePhoneticRun) {
     insideText = true;
    }
   } else if (event.isEndElement()) {
    String name = event.asEndElement().getName().getLocalPart();
    if (name.equalsIgnoreCase("c")) { //end element of cell
     return text.toString();
    }
    if ("rPh".equals(name)) {
     insidePhoneticRun = false;
    } else if (name.equalsIgnoreCase("t")) {
     insideText = false;
    }
   } else if (insideText && event.isCharacters()) {
    text.append(event.asCharacters().getData());
   }
  }
  throw new IllegalStateException("Sheet XML ended inside an inline string cell");
 }

 /**
  * Writes a complete {@code c} element that references shared string {@code sRef}.
  * All attributes of the original cell are kept except its {@code t} attribute, which
  * is replaced by {@code t="s"}.
  */
 private static void writeSharedStringCell(XMLEventWriter writer, XMLEventFactory eventFactory, StartElement cellStart, int sRef) throws javax.xml.stream.XMLStreamException {
  List<Attribute> attributeList = new java.util.ArrayList<Attribute>();
  Iterator<?> attributeIterator = cellStart.getAttributes();
  while (attributeIterator.hasNext()) {
   Attribute attribute = (Attribute) attributeIterator.next();
   if (!"t".equals(attribute.getName().getLocalPart())) {
    attributeList.add(attribute);
   }
  }
  attributeList.add(eventFactory.createAttribute("t", "s"));

  writer.add(eventFactory.createStartElement(new QName("c"), attributeList.iterator(), null));
  writer.add(eventFactory.createStartElement(new QName("v"), null, null));
  writer.add(eventFactory.createCharacters(Integer.toString(sRef)));
  writer.add(eventFactory.createEndElement(new QName("v"), null));
  writer.add(eventFactory.createEndElement(new QName("c"), null));
 }
}