我需要从xlsx文件中提取文本(放入数据库的全文索引)。 我使用以下代码:
// Collects the text of every typed cell of the workbook read from `stream` into `b`
// (`b` is declared outside this snippet; it is used like a StringBuilder).
// Cells of one row are separated by a tab and every row that produced text ends
// with a newline. Cells without an explicit data type are skipped.
using (SpreadsheetDocument d = SpreadsheetDocument.Open(stream, false)) {
    // Load the shared strings table once and flatten it into a list, so that each
    // cell lookup is an O(1) index instead of an ElementAt walk over the XML.
    SharedStringTablePart stringTable =
        d.WorkbookPart.GetPartsOfType<SharedStringTablePart>()
        .FirstOrDefault();
    var sharedStrings = (stringTable != null && stringTable.SharedStringTable != null)
        ? stringTable.SharedStringTable.Elements<SharedStringItem>()
            .Select(item => item.InnerText)
            .ToList()
        : null;
    if (sharedStrings == null) {
        System.Diagnostics.Debug.WriteLine("Null string table");
    }

    foreach (WorksheetPart part in d.WorkbookPart.WorksheetParts) {
        foreach (SheetData sheet in part.Worksheet.Elements<SheetData>()) {
            foreach (Row r in sheet.Elements<Row>()) {
                // Tracks whether this row has already produced text, so that tabs
                // are only written between values and empty rows emit nothing.
                bool added = false;
                foreach (Cell c in r.Elements<Cell>()) {
                    if (c.DataType == null) {
                        continue;
                    }

                    // A typed cell is not guaranteed to have a <v> child:
                    // inline strings keep their text in <is> instead.
                    string v = c.CellValue != null ? c.CellValue.Text : null;
                    if (c.DataType.Value == CellValues.InlineString && c.InlineString != null) {
                        v = c.InlineString.InnerText;
                    }
                    else if (v != null && c.DataType.Value == CellValues.SharedString) {
                        // <v> holds an index into the shared strings table. If it
                        // cannot be resolved, drop the cell rather than crash or
                        // index the bare number as if it were text.
                        int index;
                        bool resolved =
                            sharedStrings != null
                            && int.TryParse(
                                v,
                                System.Globalization.NumberStyles.Integer,
                                System.Globalization.CultureInfo.InvariantCulture,
                                out index)
                            && index >= 0
                            && index < sharedStrings.Count;
                        v = resolved ? sharedStrings[index] : null;
                    }

                    if (v != null) {
                        if (added) {
                            b.Append('\t');
                        }
                        b.Append(v);
                        added = true;
                    }
                }
                if (added) {
                    b.AppendLine();
                }
            }
        }
    }
}
return b.ToString();
我在网上找到的例子没有提到共享字符串表 - 当我意识到没有输出字符串数据时,我发现了它。
我还应该知道其他问题吗?
对代码的其他批评总是受欢迎。
答案 0 :(得分:1)
从单元格中提取实际数据有一些棘手的地方:有时值直接存储在单元格中(数字、日期、内联字符串),有时单元格只保存一个指向 SharedStringTable 的索引。我查阅了许多现有的实现,最终整理出下面的函数(部分是借鉴的,部分是我自己写的)。
之后,你应该可以很容易地把它用到你的代码里:在 foreach(Cell c in r.Elements<Cell>()) { 循环中,
像这样调用:string v = GetValueFromCell(c, d.WorkbookPart);
/// <summary>
/// Looks up an entry of the workbook's shared string table by its position.
/// </summary>
/// <param name="workbookPart">Workbook part whose shared string table is read.</param>
/// <param name="id">Zero-based position of the wanted entry among the table's SharedStringItem elements.</param>
/// <returns>The SharedStringItem found at position <paramref name="id"/>.</returns>
public static SharedStringItem GetSharedStringItemById(WorkbookPart workbookPart, int id)
{
    var table = workbookPart.SharedStringTablePart.SharedStringTable;
    var entries = table.Elements<SharedStringItem>();
    return entries.ElementAt(id);
}
/// <summary>
/// Returns the text of a spreadsheet cell: shared strings are resolved through the
/// workbook's shared string table, booleans become "TRUE"/"FALSE", and untyped
/// whole numbers whose custom number format contains "/yy" are rendered as
/// MM/dd/yyyy dates. Every other value is returned as stored.
/// </summary>
/// <param name="cell">Spreadsheet cell to read.</param>
/// <param name="workbookPart">Workbook part that owns the cell; used to reach the shared strings and the styles.</param>
/// <returns>
/// String value of the cell. When a shared-string index or a date format cannot be
/// resolved (missing part, malformed index, no style), the raw stored value is returned.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="cell"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// The cell refers to a shared string position beyond the end of the table
/// (thrown by <see cref="GetSharedStringItemById"/>).
/// </exception>
public static string GetValueFromCell(Cell cell, WorkbookPart workbookPart)
{
    if (cell == null)
    {
        throw new ArgumentNullException("cell");
    }

    // Prefer the <v> child: InnerText concatenates the text of ALL children, so for
    // a formula cell it would glue the formula to its cached result. InnerText is
    // kept as the fallback for cells without <v> (e.g. inline strings).
    string cellValue = (cell.CellValue != null && cell.CellValue.Text != null)
        ? cell.CellValue.Text
        : cell.InnerText;

    if (cellValue.Trim().Length == 0)
    {
        return cellValue;
    }

    if (cell.DataType == null)
    {
        // Untyped cells hold numbers; a whole number may be a date serial.
        return FormatIfDateSerial(cell, workbookPart, cellValue);
    }

    switch (cell.DataType.Value)
    {
        case CellValues.SharedString:
            return ResolveSharedString(workbookPart, cellValue);
        case CellValues.Boolean:
            return cellValue == "0" ? "FALSE" : "TRUE";
        default:
            return cellValue;
    }
}

/// <summary>
/// Resolves a shared-string index to its text; returns <paramref name="rawIndex"/>
/// unchanged when the index is malformed or the workbook has no shared string table.
/// </summary>
private static string ResolveSharedString(WorkbookPart workbookPart, string rawIndex)
{
    int id;
    bool parsed = Int32.TryParse(
        rawIndex,
        System.Globalization.NumberStyles.Integer,
        System.Globalization.CultureInfo.InvariantCulture,
        out id);

    // A failed parse must not fall through with id == 0: that would silently
    // return the first shared string for a cell it does not belong to.
    if (!parsed
        || id < 0
        || workbookPart == null
        || workbookPart.SharedStringTablePart == null
        || workbookPart.SharedStringTablePart.SharedStringTable == null)
    {
        return rawIndex;
    }

    SharedStringItem item = GetSharedStringItemById(workbookPart, id);

    // Plain entries carry a single text child; rich-text entries are split into
    // runs, for which InnerText yields the concatenated text.
    return item.Text != null ? item.Text.Text : item.InnerText;
}

/// <summary>
/// Renders <paramref name="cellValue"/> as MM/dd/yyyy when it is a whole number and the
/// cell's style points at a custom number format containing "/yy"; otherwise returns it unchanged.
/// </summary>
private static string FormatIfDateSerial(Cell cell, WorkbookPart workbookPart, string cellValue)
{
    // Whole-number bounds accepted by DateTime.FromOADate; anything outside throws.
    const int MinOADate = -657434;
    const int MaxOADate = 2958465;

    int excelDate;
    if (!Int32.TryParse(
            cellValue,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out excelDate)
        || excelDate < MinOADate
        || excelDate > MaxOADate)
    {
        return cellValue;
    }

    // Cells without a style attribute and workbooks without a styles part or
    // without custom number formats are normal; none of them is a date here.
    if (cell.StyleIndex == null
        || workbookPart == null
        || workbookPart.WorkbookStylesPart == null)
    {
        return cellValue;
    }

    var stylesheet = workbookPart.WorkbookStylesPart.Stylesheet;
    if (stylesheet == null
        || stylesheet.CellFormats == null
        || stylesheet.NumberingFormats == null)
    {
        return cellValue;
    }

    var styleIndex = (int)cell.StyleIndex.Value;
    var cellFormat = stylesheet.CellFormats.Elements<CellFormat>().ElementAtOrDefault(styleIndex);
    if (cellFormat == null || cellFormat.NumberFormatId == null)
    {
        return cellValue;
    }

    // NOTE(review): only formats listed under NumberingFormats are inspected; built-in
    // date formats are presumably not listed there and would stay numeric - confirm.
    var numberFormatId = cellFormat.NumberFormatId.Value;
    var numberingFormat = stylesheet.NumberingFormats.Elements<NumberingFormat>()
        .FirstOrDefault(f => f.NumberFormatId != null && f.NumberFormatId.Value == numberFormatId);
    if (numberingFormat == null
        || numberingFormat.FormatCode == null
        || numberingFormat.FormatCode.Value == null
        || !numberingFormat.FormatCode.Value.Contains("/yy")) //TODO here i should think of locales
    {
        return cellValue;
    }

    // Invariant culture keeps '/' as the separator regardless of the machine's locale.
    DateTime dt = DateTime.FromOADate(excelDate);
    return dt.ToString("MM/dd/yyyy", System.Globalization.CultureInfo.InvariantCulture);
}