我有一个excel
(.xls
)文件需要解析为.csv
文件。我正在为NPOI
使用c#
库的最新稳定版本。问题是我正在变得粗糙CSV
而不是规范行大小。
What the data in excel file looks like
输出csv
文件是:
"姓""中间名""名字""普农"
"约翰"" L"" Doe的"" 555-555-5555"
"小""丁""蝙蝠"
"罗杰"" d""兔"" 123-456-7890"
我想要它发生的是在第二个数据行的末尾添加一个额外的分隔符(在" Bat"之后),如下所示:
"姓""中间名""名字""普农"
"约翰"" L"" Doe的"" 555-555-5555"
"小""丁""蝙蝠",
"罗杰"" d""兔"" 123-456-7890"
这是我的代码:
public override bool ParseFile()
{
FileStream iFile = new FileStream(InputFileName, FileMode.Open);
HSSFWorkbook wb = new HSSFWorkbook(iFile);
ExcelExtractor extractor = new ExcelExtractor(wb);
extractor.IncludeBlankCells = true;
bool result = true;
if (AllWorksheets)
{
for (int i = 0; i < wb.NumberOfSheets; i++)
result = result && ParseWorksheet(wb, i);
}
else
{
result = ParseWorksheet(wb, 0);
}
return result;
}
protected char c = '"';
public static string FormatValue(string s, bool AddQuotes, char quoteChar)
{
if (AddQuotes)
{
return quoteChar + s + quoteChar;
}
return s;
}
private bool ParseWorksheet(HSSFWorkbook wb, int SheetIndex)
{
bool result = true;
HSSFSheet sheet = (HSSFSheet)wb.GetSheetAt(SheetIndex);
if (sheet.FirstRowNum == sheet.LastRowNum && sheet.LastRowNum == 0) return result;
System.IO.StreamWriter sw = new StreamWriter(OutputFileName, true);
for (int i = sheet.FirstRowNum; i <= sheet.LastRowNum; i++)
{
string OutputRow = String.Empty;
HSSFRow row = (HSSFRow)sheet.GetRow(i);
int Column = 0;
int MaxCol = 0;
int temp = 0;
for (int j = 0; j < row.LastCellNum; j++)
{
temp = row.LastCellNum;
if (temp > MaxCol)
{
MaxCol = temp;
}
}
for (int j = 0; j < MaxCol; j++)
{
if (j == row.Cells[Column].ColumnIndex)
{
switch (row.Cells[Column].CellType)
{
case NPOI.SS.UserModel.CellType.Boolean:
OutputRow += FormatValue(row.Cells[Column].BooleanCellValue.ToString(), AddQuotes, c) + Delimiter.ToString();
break;
case NPOI.SS.UserModel.CellType.Formula:
OutputRow += FormatValue(row.Cells[Column].CachedFormulaResultType.ToString(), AddQuotes, c) + Delimiter.ToString();
break;
case NPOI.SS.UserModel.CellType.Numeric:
OutputRow += FormatValue((NPOI.SS.UserModel.DateUtil.IsCellDateFormatted(row.Cells[Column]) ? row.Cells[Column].DateCellValue.ToShortDateString() : row.Cells[Column].NumericCellValue.ToString()), AddQuotes, c) + Delimiter.ToString();
break;
case NPOI.SS.UserModel.CellType.Blank:
OutputRow += Delimiter.ToString();
break;
case NPOI.SS.UserModel.CellType.String:
OutputRow += FormatValue(row.Cells[Column].StringCellValue.ToString().Replace('\n', ' ').TrimEnd(), AddQuotes, c) + Delimiter.ToString();//replace the new line character to space due to formatting issue.
break;
default:
result = false;
break;
}
Column++;
}
else
{
OutputRow += Delimiter.ToString();
}
}
OutputRow = OutputRow.Remove(OutputRow.Length - 1);
sw.WriteLine(OutputRow);
}
sw.Flush();
sw.Close();
return result;
}`
我们非常感谢任何建议。
答案 0 :(得分:2)
这里有一些问题导致了这个问题。
首先,您要重新计算每一行的MaxCol
。如果您想要一个非参差不齐的右边缘,那么您需要先找到MaxCol
所有行,然后然后生成输出。
其次,您正在使用row.Cells[]
尝试获取该行的特定单元格。 Cells[]
忽略空值。因此,如果您碰巧在行中某处有一个空白单元格,那么所有剩余的值都将向左移动,并且数组的长度将小于MaxCol
。如果您尝试在至少有一个空值的行上访问row.Cells[MaxCol - 1]
,则会导致异常
此问题的解决方案是使用row.GetCell(index)
方法。此方法返回列index
(从0开始)的单元格,如果该单元格为空,则返回null
。使用起来要简单得多,并且允许您消除代码中的特殊逻辑,该逻辑根据您的循环索引ColumnIndex
检查当前单元格的j
,以确保您获得的单元格是真的在你期望的专栏中。
作为一个额外的建议,我建议在内循环内只检索一次当前单元格并将其分配给变量,而不是多次重新检索它。这将使您的代码更高效,更易于阅读。
以下是ParseWorksheet
方法的修订代码,其中包含以上所有更改:
private bool ParseWorksheet(HSSFWorkbook wb, int SheetIndex)
{
bool result = true;
HSSFSheet sheet = (HSSFSheet)wb.GetSheetAt(SheetIndex);
if (sheet.FirstRowNum == sheet.LastRowNum && sheet.LastRowNum == 0) return result;
StreamWriter sw = new StreamWriter(OutputFileName, true);
int MaxCol = 0;
for (int i = sheet.FirstRowNum; i <= sheet.LastRowNum; i++)
{
HSSFRow row = (HSSFRow)sheet.GetRow(i);
MaxCol = Math.Max(MaxCol, row.LastCellNum);
}
for (int i = sheet.FirstRowNum; i <= sheet.LastRowNum; i++)
{
string OutputRow = String.Empty;
HSSFRow row = (HSSFRow)sheet.GetRow(i);
for (int j = 0; j < MaxCol; j++)
{
HSSFCell cell = (HSSFCell)row.GetCell(j);
if (cell != null)
{
switch (cell.CellType)
{
case NPOI.SS.UserModel.CellType.Boolean:
OutputRow += FormatValue(cell.BooleanCellValue.ToString(), AddQuotes, c) + Delimiter;
break;
case NPOI.SS.UserModel.CellType.Formula:
OutputRow += FormatValue(cell.CachedFormulaResultType.ToString(), AddQuotes, c) + Delimiter;
break;
case NPOI.SS.UserModel.CellType.Numeric:
OutputRow += FormatValue((NPOI.SS.UserModel.DateUtil.IsCellDateFormatted(cell) ? cell.DateCellValue.ToShortDateString() : cell.NumericCellValue.ToString()), AddQuotes, c) + Delimiter;
break;
case NPOI.SS.UserModel.CellType.Blank:
OutputRow += Delimiter;
break;
case NPOI.SS.UserModel.CellType.String:
OutputRow += FormatValue(cell.StringCellValue.ToString().Replace('\n', ' ').TrimEnd(), AddQuotes, c) + Delimiter; //replace the new line character to space due to formatting issue.
break;
default:
result = false;
break;
}
}
else
{
OutputRow += Delimiter;
}
}
OutputRow = OutputRow.Remove(OutputRow.Length - 1);
sw.WriteLine(OutputRow);
}
sw.Flush();
sw.Close();
return result;
}