我正在开发一个将PDF文档转换为Excel文件的程序,大约有1000个PDF文件需要转换。但是在处理到第234个文件时,程序报错了。以下是关键代码。
/// <summary>
/// Converts every "*.pdf" file in the folder selected in folderBrowserDialog1 to text and
/// writes one row per file into the first worksheet of a new Excel workbook
/// (column A: file name, column B: extracted text). The workbook is then saved as
/// "Training Data.xlsx" and a confirmation message box is shown.
/// </summary>
/// <remarks>
/// While running, label4 shows the file that was just processed, label6 the number of
/// files remaining and label5 the elapsed time. Exceptions from Excel or from the PDF
/// conversion propagate to the caller unchanged.
/// </remarks>
private void getFullFileContent()
{
    Excel.Application objExcel = new Excel.Application();
    objExcel.Visible = true;
    Excel.Workbook objBook = objExcel.Workbooks.Add(System.Reflection.Missing.Value);
    // The target worksheet never changes, so look it up once instead of once per file.
    Excel.Worksheet objSheet = (Excel.Worksheet)objBook.Worksheets.get_Item(1);
    DateTime startDate = System.DateTime.Now;
    string[] pdfFiles = Directory.GetFiles(folderBrowserDialog1.SelectedPath, "*.pdf");

    // Excel rows are 1-based, so row i holds pdfFiles[i - 1].
    for (int i = 1; i <= pdfFiles.Length; i++)
    {
        string pdfPath = pdfFiles[i - 1];
        string pdfName = Path.GetFileName(pdfPath);

        WriteCellText(objSheet, "A" + i, pdfName);
        // NOTE(review): Excel limits a cell to 32,767 characters; a very large PDF may
        // need truncating or splitting before it is written here - confirm.
        WriteCellText(objSheet, "B" + i, ConvertPdfToText(pdfPath));

        // The original indexed pdfFiles[i] here, which is one past the current file and
        // throws IndexOutOfRangeException on the last iteration.
        label4.Text = pdfName;
        label6.Text = "Remaining File: " + (pdfFiles.Length - i).ToString();

        TimeSpan elapsed = System.DateTime.Now - startDate;
        label5.Text = "Total time: " + elapsed.Hours.ToString() + ":" + elapsed.Minutes.ToString() + ":" + elapsed.Seconds.ToString();
        // No GC.Collect() here: WriteCellText releases each Range COM wrapper
        // deterministically, so forcing a full collection per file is unnecessary.
    }

    // A try/catch that only rethrows adds nothing; let any exception propagate directly.
    objBook.SaveAs("Training Data.xlsx");
    MessageBox.Show("Your PDF files converted and printed into \"Training Data.xlsx\"");
}

/// <summary>
/// Writes <paramref name="value"/> into the single cell at <paramref name="address"/>
/// and releases the Range COM wrapper immediately afterwards.
/// </summary>
private static void WriteCellText(Excel.Worksheet sheet, string address, string value)
{
    Excel.Range cell = sheet.get_Range(address, System.Reflection.Missing.Value);
    try
    {
        cell.set_Value(System.Reflection.Missing.Value, value);
    }
    finally
    {
        // Each get_Range call creates a new runtime callable wrapper; release it now so
        // thousands of them do not accumulate over a long batch.
        System.Runtime.InteropServices.Marshal.ReleaseComObject(cell);
    }
}
/// <summary>
/// Extracts the text of every page of the PDF at <paramref name="path"/> and returns it
/// concatenated in page order.
/// </summary>
/// <param name="path">Path of the PDF file to read.</param>
/// <returns>
/// The concatenated text of all pages, or an empty string when the file does not exist.
/// </returns>
private string ConvertPdfToText(string path)
{
    if (!File.Exists(path))
    {
        return "";
    }

    // Accumulate in a StringBuilder; the original also appended to one but then returned
    // a string built with += inside the loop, which copies the whole text on every page.
    StringBuilder text = new StringBuilder();
    PdfReader pdfReader = new PdfReader(path);
    try
    {
        for (int page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            // A fresh strategy per page, as in the original.
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

            // The extracted text is already a .NET (UTF-16) string. The former round-trip
            // through Encoding.Default was dropped because it is lossy for characters
            // outside the system ANSI code page.
            text.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
        }
    }
    finally
    {
        // Close the reader even when extraction throws, so the file is not left open.
        pdfReader.Close();
    }

    return text.ToString();
}
答案 0 :(得分:0)
您的文件有多大?也许您可以先用较小的批次运行,并使用RedGate ANTS或JetBrains dotTrace这样的内存分析器来确定内存泄漏的位置。
另外,下面这行代码可以移到getFullFileContent()的for循环之外:
objSheet = (Excel.Worksheet)objBook.Worksheets.get_Item(1)