I have a PDF of more than 10,000 pages that I'm trying to split into smaller PDFs based on delimiter pages. My current implementation works well until you start throwing 10k pages at it at once. After roughly the 50th PDF is created (each around 100 pages), it starts to slow down noticeably, my memory usage climbs to about 2 GB, and eventually I get an OutOfMemoryException. I have very little experience with memory management, but I have done a lot of research. I'm only asking here because this is time-sensitive, so I apologize if it looks like I haven't done reasonable research of my own.
My initial read of the original PDF:
var pdfDictionary = PDFHelper.ParsePDFByPage(_workItem.FileName);
//Code behind
public static Dictionary<int, string> ParsePDFByPage(string filePath)
{
    var retVal = new Dictionary<int, string>();
    PdfReader reader = new PdfReader(filePath);
    for (int page = 1; page <= reader.NumberOfPages; page++)
    {
        retVal.Add(page, PdfTextExtractor.GetTextFromPage(reader, page, new StructuredTextExtractionStrategy()));
    }
    reader.Close();
    reader.Dispose();
    return retVal;
}
After that read, I find which pages are delimiters and, for each page range that needs to be split out of the original, create an instance of HMPdf (defined below):
var pdfsToCreate = pdfDictionary.Where(x => x.Value.Contains("DELIMITER"));
var pdfList = new List<HMPdf>();
foreach (var item in pdfsToCreate) //pdfsToCreate = Dictionary<int,string>
{
    //Parsing logic (most removed, just know that this part works fine)
    //After parsing, create new instance of HMPdf and add it to the list
    var pdf = new HMPdf(startPage, endPage, fileName);
    pdfList.Add(pdf);
}
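For context, the parsing I stripped out boils down to walking the delimiter pages in order and turning each consecutive pair into a range. The snippet below is only an illustration of that shape (ParseFileName is a stand-in for my real filename logic), not my exact code:
var delimiterPages = pdfsToCreate.Select(x => x.Key).OrderBy(p => p).ToList();
for (int i = 0; i < delimiterPages.Count; i++)
{
    int startPage = delimiterPages[i];
    //Each range runs up to the page before the next delimiter (or to the last page)
    int endPage = i + 1 < delimiterPages.Count
        ? delimiterPages[i + 1] - 1
        : pdfDictionary.Count;
    string fileName = ParseFileName(pdfDictionary[startPage]); //stand-in for the real parsing
    pdfList.Add(new HMPdf(startPage, endPage, fileName));
}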
After parsing, the PDFs are created:
foreach (var hmpdf in pdfList)
{
    //I've tried forcing the GC to collect after every 10 pdfs created
    string error = string.Empty;
    if (!hmpdf.TryCreate(sourcePath, destinationPath, out error))
    {
        throw new Exception("Error creating new PDF - " + error);
    }
}
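For reference, the forced collection mentioned in the comment above looked roughly like this inside the loop (createdCount is just an illustrative counter); it never seemed to free anything:
createdCount++;
if (createdCount % 10 == 0)
{
    GC.Collect();
    GC.WaitForPendingFinalizers();
}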
The HMPdf code-behind:
public class HMPdf
{
    private string _path;
    private string _fileName;
    private PdfCopy _pdfCopy = null;
    private PdfReader _reader = null;
    private Document _sourceDocument = null;
    private PdfImportedPage _importedPage = null;
    private int _pageFrom;
    private int _pageTo;
    private FileStream _fileStream;

    public HMPdf(int pageFrom, int pageTo, string fileName)
    {
        _pageFrom = pageFrom;
        _pageTo = pageTo;
        _fileName = fileName;
    }

    public bool TryCreate(string sourcePath, string destinationPath, out string errorMessage)
    {
        errorMessage = string.Empty; //out parameter must be assigned on every path
        try
        {
            _reader = new PdfReader(sourcePath);
            _sourceDocument = new Document(_reader.GetPageSizeWithRotation(_pageFrom));
            _fileStream = new System.IO.FileStream(
                Path.Combine(destinationPath, _fileName.ToLower().Contains(".pdf") ? _fileName : _fileName + ".pdf"),
                System.IO.FileMode.Create);
            _pdfCopy = new PdfCopy(_sourceDocument, _fileStream);
            _sourceDocument.Open();
            for (int i = _pageFrom; i <= _pageTo; i++)
            {
                _importedPage = _pdfCopy.GetImportedPage(_reader, i);
                _pdfCopy.AddPage(_importedPage);
                _importedPage = null;
            }
            return true;
        }
        catch (Exception ex)
        {
            errorMessage = ex.Message;
            return false;
        }
        finally
        {
            if (_reader != null)
            {
                _reader.Close();
                _reader.Dispose();
                _reader = null;
            }
            if (_sourceDocument != null)
            {
                _sourceDocument.Close();
                _sourceDocument.Dispose();
                _sourceDocument = null;
            }
            if (_pdfCopy != null)
            {
                _pdfCopy.Close();
                _pdfCopy.Dispose();
                _pdfCopy = null;
            }
            if (_fileStream != null)
            {
                _fileStream.Close();
                _fileStream.Dispose();
                _fileStream = null;
            }
        }
    }
}
As you can tell, I close/dispose all the open file streams, readers, etc. (right?). I have tried forcing the garbage collector to run after every 10 PDFs created, but it doesn't clean anything up. I've run Telerik JustTrace and, with my limited knowledge of memory management, found a couple of things: between several snapshots there were 0 disposed objects, and in the last snapshot the pdfList object was taking up nearly a gigabyte of memory.
Am I missing something completely obvious?
Sorry for the lengthy post.
Answer (score 2):
You may be demonstrating The Dangers of the Large Object Heap...
Try to improve the logic so that it uses less memory; one idea is sketched after the code below.
Reduce variable scope wherever possible, i.e. don't create unnecessary class-level fields; make them local variables instead.
Try something like the following, which narrows the scope of those variables:
public bool TryCreate(string sourcePath, string destinationPath, out string errorMessage)
{
    errorMessage = string.Empty;
    try
    {
        using (var reader = new PdfReader(sourcePath))
        using (var sourceDocument = new Document(reader.GetPageSizeWithRotation(_pageFrom)))
        using (var fileStream = new System.IO.FileStream(
            Path.Combine(destinationPath, _fileName.ToLower().Contains(".pdf") ? _fileName : _fileName + ".pdf"),
            System.IO.FileMode.Create))
        using (var pdfCopy = new PdfCopy(sourceDocument, fileStream))
        {
            sourceDocument.Open();
            for (int i = _pageFrom; i <= _pageTo; i++)
            {
                pdfCopy.AddPage(pdfCopy.GetImportedPage(reader, i));
            }
        }
        return true;
    }
    catch (Exception ex)
    {
        errorMessage = ex.Message;
        return false;
    }
}
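The using blocks guarantee the reader, document, copy and stream are disposed even if an exception is thrown part-way through, without the null checks your finally block needed.
Beyond scoping, also look at what ParsePDFByPage returns: a Dictionary<int, string> holding the extracted text of all 10,000+ pages, which stays alive for the whole run. If the delimiter check (Where(x => x.Value.Contains("DELIMITER"))) is the only reason you keep all of that text, something along the lines of the sketch below (FindDelimiterPages is just an illustrative name, not tested) keeps only the delimiter pages' text and lets everything else become garbage page by page:
public static Dictionary<int, string> FindDelimiterPages(string filePath, string delimiter)
{
    var delimiterPages = new Dictionary<int, string>();
    using (var reader = new PdfReader(filePath))
    {
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            //Extract the page text, but only keep it when the page is a delimiter;
            //text for every other page can be collected immediately
            string text = PdfTextExtractor.GetTextFromPage(reader, page, new StructuredTextExtractionStrategy());
            if (text.Contains(delimiter))
            {
                delimiterPages.Add(page, text);
            }
        }
    }
    return delimiterPages;
}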