如何使用OpenXml从xlsx文件中提取文本

时间:2012-09-11 18:14:29

标签: c# openxml xlsx import-from-excel

我需要从xlsx文件中提取文本(放入数据库的全文索引)。 我使用以下代码:

// Walks every worksheet of the workbook in `stream` and appends its typed cells to `b`:
// one output line per row, cells separated by tabs. Cells that carry no DataType are
// skipped, so only cells explicitly typed in the file (shared strings, inline strings,
// booleans, ...) contribute to the extracted text.
using(SpreadsheetDocument d = SpreadsheetDocument.Open(stream, false)) {
 // Load the shared strings table. String cells usually hold an index into this table
 // rather than the text itself.
 SharedStringTablePart stringTable =
  d.WorkbookPart.GetPartsOfType<SharedStringTablePart>()
  .FirstOrDefault();
 // The part may be missing; keep a nullable handle instead of dereferencing it blindly.
 var sharedStrings = stringTable == null ? null : stringTable.SharedStringTable;
 if(sharedStrings == null) System.Diagnostics.Debug.WriteLine("Null string table");
 foreach(WorksheetPart part in d.WorkbookPart.WorksheetParts) {
  foreach(SheetData sheet in part.Worksheet.Elements<SheetData>()) {
   foreach(Row r in sheet.Elements<Row>()) {
    // Tracked per row: a sheet-wide flag would prefix the first cell of every later
    // row with a tab and emit blank lines for empty rows.
    bool added = false;
    foreach(Cell c in r.Elements<Cell>()) {
     if(c.DataType == null) continue;
     // CellValue is absent for inline strings (their text lives in the <is> child).
     string v = c.CellValue != null ? c.CellValue.Text : null;
     if(c.DataType.Value == CellValues.SharedString) {
      // v is an index into the shared string table; drop the cell when the index
      // cannot be resolved rather than indexing the raw number as text.
      string resolved = null;
      int index;
      if(v != null && sharedStrings != null &&
         int.TryParse(v, System.Globalization.NumberStyles.Integer,
                      System.Globalization.CultureInfo.InvariantCulture, out index)) {
       var tableEntry = sharedStrings.ElementAtOrDefault(index);
       if(tableEntry != null) resolved = tableEntry.InnerText;
      }
      v = resolved;
     } else if(v == null && c.InlineString != null) {
      v = c.InlineString.InnerText;
     }
     if(v != null) {
      if(added) b.Append('\t');
      b.Append(v);
      added = true;
     }
    }
    if(added) b.AppendLine();
   }
  }
 }
}
return b.ToString();

我在网上找到的例子没有提到共享字符串表 - 当我意识到没有输出字符串数据时,我发现了它。

我还应该知道其他问题吗?

对代码的其他批评总是受欢迎。

1 个答案:

答案 0 :(得分:1)

从单元格中提取实际数据有一些棘手的地方。有时值直接存储在单元格里(数字、日期、内联字符串),有时单元格只保存一个指向 SharedStringTable 的索引。我参考了很多现成的函数,最后整理出下面的代码(一部分是借鉴来的,一部分是我自己写的)。

有了它,你应该可以很容易地把它用到你代码中的这个循环里:

foreach(Cell c in r.Elements<Cell>()) {

像这样

string v = GetValueFromCell(c,d.WorkbookPart);

        /// <summary>
        /// Looks up an entry of the workbook's shared string table by its zero-based position.
        /// </summary>
        /// <param name="workbookPart">Workbook part whose shared string table is read.</param>
        /// <param name="id">Zero-based position of the entry among the table's items.</param>
        /// <returns>The <see cref="SharedStringItem"/> found at that position.</returns>
        public static SharedStringItem GetSharedStringItemById(WorkbookPart workbookPart, int id)
        {
            var sharedStrings = workbookPart.SharedStringTablePart.SharedStringTable;
            var items = sharedStrings.Elements<SharedStringItem>();
            return items.ElementAt(id);
        }

        /// <summary>
        /// Returns the text a cell represents: the referenced shared string, "TRUE"/"FALSE"
        /// for booleans, a "MM/dd/yyyy" date for untyped integer cells whose custom number
        /// format contains "/yy", and the raw stored text otherwise.
        /// </summary>
        /// <param name="cell">Spreadsheet cell to read.</param>
        /// <param name="workbookPart">Workbook part that owns the cell; used to reach the shared string table and the stylesheet.</param>
        /// <returns>String value of the cell; blank cells are returned unchanged.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="cell"/> is null.</exception>
        public static string GetValueFromCell(Cell cell, WorkbookPart workbookPart)
        {
            if (cell == null)
            {
                throw new ArgumentNullException("cell");
            }

            string cellValue = ReadRawCellText(cell);
            if (cellValue.Trim().Length == 0)
            {
                return cellValue;
            }

            // Untyped cells hold plain numbers; a date is such a number plus a date style.
            if (cell.DataType == null)
            {
                return FormatIfDate(cell, workbookPart, cellValue);
            }

            if (cell.DataType.Value == CellValues.SharedString)
            {
                int id;
                bool parsed = Int32.TryParse(
                    cellValue,
                    System.Globalization.NumberStyles.Integer,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out id);
                if (!parsed || id < 0)
                {
                    // Not a usable index: return the stored text instead of silently
                    // resolving to shared string 0.
                    return cellValue;
                }

                SharedStringItem item = GetSharedStringItemById(workbookPart, id);
                // Prefer the item's direct text child when it has one; otherwise fall
                // back to the concatenated text of all its children.
                return item.Text != null ? item.Text.Text : item.InnerText;
            }

            if (cell.DataType.Value == CellValues.Boolean)
            {
                return cellValue == "0" ? "FALSE" : "TRUE";
            }

            return cellValue;
        }

        /// <summary>
        /// Reads the text stored in the cell without interpreting it.
        /// </summary>
        private static string ReadRawCellText(Cell cell)
        {
            // Read the value child directly: cell.InnerText would also include the text
            // of a formula child and glue it in front of the cached value.
            if (cell.CellValue != null)
            {
                return cell.CellValue.Text ?? string.Empty;
            }

            if (cell.InlineString != null)
            {
                return cell.InlineString.InnerText;
            }

            return cell.InnerText;
        }

        /// <summary>
        /// Converts an integer serial date to "MM/dd/yyyy" when the cell's style refers to a
        /// custom number format containing "/yy"; otherwise returns the text unchanged.
        /// </summary>
        private static string FormatIfDate(Cell cell, WorkbookPart workbookPart, string cellValue)
        {
            int excelDate;
            bool isInteger = Int32.TryParse(
                cellValue,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out excelDate);
            if (!isInteger)
            {
                return cellValue;
            }

            // Style index, styles part and both format collections are all optional.
            if (cell.StyleIndex == null || workbookPart == null || workbookPart.WorkbookStylesPart == null)
            {
                return cellValue;
            }

            var stylesheet = workbookPart.WorkbookStylesPart.Stylesheet;
            if (stylesheet == null || stylesheet.CellFormats == null || stylesheet.NumberingFormats == null)
            {
                return cellValue;
            }

            var styleIndex = (int)cell.StyleIndex.Value;
            var cellFormat = stylesheet.CellFormats.Elements<CellFormat>().ElementAtOrDefault(styleIndex);
            if (cellFormat == null || cellFormat.NumberFormatId == null)
            {
                return cellValue;
            }

            var numberFormatId = cellFormat.NumberFormatId.Value;
            var numberingFormat = stylesheet.NumberingFormats
                .Elements<NumberingFormat>()
                .FirstOrDefault(f => f.NumberFormatId != null && f.NumberFormatId.Value == numberFormatId);

            // TODO: format codes are locale dependent; "/yy" only recognizes slash-separated dates.
            // NOTE(review): only formats listed in NumberingFormats are inspected here.
            if (numberingFormat == null
                || numberingFormat.FormatCode == null
                || numberingFormat.FormatCode.Value == null
                || !numberingFormat.FormatCode.Value.Contains("/yy"))
            {
                return cellValue;
            }

            try
            {
                DateTime dt = DateTime.FromOADate(excelDate);
                // Invariant culture keeps the '/' separators literal on every machine.
                return dt.ToString("MM/dd/yyyy", System.Globalization.CultureInfo.InvariantCulture);
            }
            catch (ArgumentException)
            {
                // Serial number outside the range FromOADate accepts: keep the raw text.
                return cellValue;
            }
        }