Question

我对C＃比较陌生，对于我得到的OOM错误我感到很困惑。我正在尝试创建一个稀疏矩阵，因此收集三元组（行索引，列索引，值）。在进行for循环时，最终发生的事情是该进程使用的实际物理内存（根据资源管理器，我认为Windows术语“工作集”）相对固定在3.5GB左右。但是，提交（我相信是虚拟内存）不断增加和增加，直到达到提交限制，我的程序因OOM错误而崩溃。

相关代码如下：

// Builds the similarity matrix: for every phrase index i it collects (row, column, value)
// triplets in rows/cols/vals and finally hands them to the SparseDoubleArray constructor
// whose result is stored in simMat. simMethod is not read anywhere in this constructor.
public SimMatrix(string sparseMethod, string simMethod, Phrases phrases, DistrFeature features, int topK) {
        // Triplet buffers, pre-sized for topK entries per phrase.
        List<int> rows = new List<int>(phrases.uniquePhraseCount*topK);
        List<int> cols = new List<int>(phrases.uniquePhraseCount*topK);
        List<double> vals = new List<double>(phrases.uniquePhraseCount*topK);
        if (sparseMethod.Equals("invIdx")) {
            // NOTE(review): nonzeros, neighbors, sortedIdx and sortedSim are each reassigned to a
            // newly created list inside the loop below, so the pre-sized lists allocated here are
            // dropped on first assignment. Only simVals keeps the buffer allocated here.
            List<int> nonzeros = new List<int>(features.inverted_idx.Count());
            List<int> neighbors = new List<int>(phrases.uniquePhraseCount);
            List<double> simVals = new List<double>(phrases.uniquePhraseCount);
            List<int> sortedIdx = new List<int>(phrases.uniquePhraseCount);
            List<double> sortedSim = new List<double>(phrases.uniquePhraseCount);                
            for (int i = 0; i < phrases.uniquePhraseCount; i++) { //loop through all phrases
                // The using block disposes the row fetched for phrase i at the end of each iteration.
                using (SparseDoubleArray row = phrases.feature_values.GetRowSparse(i)) 
                {
                    if (phrases.feature_values.RowLength(i) > 0) { //i.e., at least one feature fired for phrase                                                    
                        // Second index of every element in the row, materialized into a new list.
                        nonzeros = (from pmi in row.Elements select pmi.IndexList[1]).ToList();                            
                        neighbors = generateNeighbors(nonzeros, features.inverted_idx);
                        // NOTE(review): the object returned by GetRowSparse(neighbor) is never disposed
                        // here, unlike `row` above. If SparseDoubleArray owns native resources this
                        // leaks one per neighbor - confirm against its Dispose implementation.
                        foreach (int neighbor in neighbors)
                            simVals.Add(cosineSimilarity(row, phrases.feature_values.GetRowSparse(neighbor)));
                        // Pairs each neighbor with its similarity and orders by similarity, highest first.
                        // This is a deferred query: every enumeration below (both ToList() calls and
                        // each Count() call) runs the zip and the sort again.
                        var sortedIdxSim = neighbors.Zip(simVals, (a, b) => new { idx = a, sim = b }).OrderByDescending(pair => pair.sim);                            
                        sortedIdx = sortedIdxSim.Select(pair => pair.idx).ToList();                            
                        sortedSim = sortedIdxSim.Select(pair => pair.sim).ToList();                            
                        // Keep at most topK entries for this row.
                        int topN = (sortedIdxSim.Count() < topK) ? sortedIdxSim.Count() : topK;
                        // Each ToList() below builds a temporary list that is copied by AddRange and then dropped.
                        rows.AddRange(Enumerable.Repeat(i, topN).ToList());
                        cols.AddRange(sortedIdx.Take(topN).ToList());
                        vals.AddRange(sortedSim.Take(topN).ToList());
                        // Empties the per-iteration lists; all but simVals were created during this iteration.
                        nonzeros.Clear();
                        neighbors.Clear();
                        simVals.Clear();
                        sortedIdx.Clear();
                        sortedSim.Clear();
                    }
                    else { //just add self similarity
                        rows.Add(i);
                        cols.Add(i);
                        vals.Add(1);
                    }
                    Console.WriteLine("{0} phrases done", i + 1);                         
                }
            }
        }
        else { Console.WriteLine("Sorry, no other sparsification method implemented thus far"); }
        // Runs for every sparseMethod; when the method was not "invIdx" the triplet lists are empty.
        simMat = new SparseDoubleArray(phrases.uniquePhraseCount, phrases.uniquePhraseCount, rows, cols, vals); 
    }

    /// <summary>
    /// Collects the distinct values stored in the inverted index under the given feature indices.
    /// </summary>
    /// <param name="idx">Feature indices to look up; each one must be a key of <paramref name="inverted_idx"/>.</param>
    /// <param name="inverted_idx">Maps a feature index to the list of indices recorded for it.</param>
    /// <returns>A new list of the distinct values, in order of first occurrence.</returns>
    /// <exception cref="KeyNotFoundException">A feature in <paramref name="idx"/> is not a key of the dictionary.</exception>
    static private List<int> generateNeighbors(List<int> idx, Dictionary<int, List<int>> inverted_idx) {
        // One pass with a set for the "already seen?" check. Appending to the list only on the
        // first sighting yields the same first-occurrence order as Enumerable.Distinct, without
        // rebuilding the whole accumulated list once per feature.
        HashSet<int> seen = new HashSet<int>();
        List<int> neighbors = new List<int>();
        foreach (int feature in idx) {
            foreach (int candidate in inverted_idx[feature]) {
                if (seen.Add(candidate))
                    neighbors.Add(candidate);
            }
        }
        return neighbors;
    }

    /// <summary>
    /// Cosine similarity of two sparse profiles, with non-positive results reported as 0.
    /// </summary>
    /// <param name="profile1">First profile.</param>
    /// <param name="profile2">Second profile.</param>
    /// <returns>Dot product divided by the product of the two norms when that is greater than 0; otherwise 0.</returns>
    static public double cosineSimilarity(SparseDoubleArray profile1, SparseDoubleArray profile2) {
        double dot = profile1.Dot(profile2);
        double denominator = profile1.Norm() * profile2.Norm();
        double similarity = dot / denominator;
        // Only a strictly positive quotient is returned as-is; every other outcome of the
        // comparison (including a NaN quotient, for which "> 0" is false) becomes 0.
        return (similarity > 0) ? similarity : 0;
    }

请注意，代码使用了一些内部库（例如，SparseDoubleArray对象）。基本要点是我循环遍历所有条目（由i索引），并且对于每个条目，我找出非零列索引，通过“generateNeighbors”函数从中生成潜在邻居列表。一旦我有一个候选邻居列表，我计算每个潜在邻居的余弦相似度。然后我同时对索引和相似度值进行排序，选择topN索引/相似度值，并将这些值与索引i（对应于行索引）一起添加到维护稀疏矩阵索引和值的列表中。

代码在for循环中崩溃的位置似乎并不固定：有时在i = 25,000处崩溃，有时在i = 2000处。我甚至还没有进入初始化稀疏矩阵的阶段。

任何见解或帮助将不胜感激。

更新（2013年6月10日）

由于提供的响应，我已经设法大幅减少代码的已提交内存。下面是更新的代码，您会注意到它与问题的答案不完全相同，我将详细说明需要更改的内容。

/// <summary>
/// Builds the similarity matrix: collects (row, column, value) triplets for every phrase index
/// and stores the resulting SparseDoubleArray in simMat.
/// </summary>
/// <param name="sparseMethod">Sparsification strategy; only "invIdx" is implemented.</param>
/// <param name="simMethod">Not read by this constructor.</param>
/// <param name="phrases">Supplies uniquePhraseCount and the feature_values rows.</param>
/// <param name="features">Supplies inverted_idx, which is indexed by the second index of each row element.</param>
/// <param name="topK">Maximum number of entries kept per row.</param>
public SimMatrix(string sparseMethod, string simMethod, Phrases phrases, DistrFeature features, int topK) {
        // Triplet buffers, pre-sized for topK entries per phrase.
        List<int> rows = new List<int>(phrases.uniquePhraseCount*topK);
        List<int> cols = new List<int>(phrases.uniquePhraseCount*topK);
        List<double> vals = new List<double>(phrases.uniquePhraseCount*topK);
        if (sparseMethod.Equals("invIdx")) {
            for (int i = 0; i < phrases.uniquePhraseCount; i++) { //loop through all phrases
                using (SparseDoubleArray row = phrases.feature_values.GetRowSparse(i))
                {
                    if (phrases.feature_values.RowLength(i) > 0) { //i.e., at least one feature fired for phrase
                        IEnumerable<int> nonzeros = from pmi in row.Elements select pmi.IndexList[1];
                        IEnumerable<int> neighbors = nonzeros.SelectMany(x => features.inverted_idx[x]).Distinct();
                        // Score each neighbor in the same pass that enumerates it. The deferred
                        // `neighbors` query is therefore walked exactly once per row, instead of once
                        // for the indices and a second time for the similarities. OrderByDescending
                        // is stable, so ties keep their enumeration order.
                        var sortedIdxSim = neighbors
                            .Select(x => new { idx = x, sim = cosineSimilarity(row, x, phrases) })
                            .OrderByDescending(pair => pair.sim)
                            .ToList();
                        // Keep at most topK entries for this row and copy them in a single loop.
                        int topN = Math.Min(sortedIdxSim.Count, topK);
                        for (int k = 0; k < topN; k++) {
                            rows.Add(i);
                            cols.Add(sortedIdxSim[k].idx);
                            vals.Add(sortedIdxSim[k].sim);
                        }
                    }
                    else { //just add self similarity
                        rows.Add(i);
                        cols.Add(i);
                        vals.Add(1);
                    }
                    // Progress message once every 1000 phrases.
                    if ((i % 1000) == 0)
                        Console.WriteLine("{0} phrases done;", i + 1);
                }
            }
        }
        else { Console.WriteLine("Sorry, no other sparsification method implemented thus far"); }
        // Runs for every sparseMethod; when the method was not "invIdx" the triplet lists are empty.
        simMat = new SparseDoubleArray(phrases.uniquePhraseCount, phrases.uniquePhraseCount, rows, cols, vals);
    }

    /// <summary>
    /// Cosine similarity between a profile and the feature row at a given index, with
    /// non-positive results reported as 0.
    /// </summary>
    /// <param name="profile1">First profile.</param>
    /// <param name="profile2idx">Row index passed to phrases.feature_values.GetRowSparse to fetch the second profile.</param>
    /// <param name="phrases">Owner of the feature_values rows.</param>
    /// <returns>Dot product divided by the product of the two norms when that is greater than 0; otherwise 0.</returns>
    static public double cosineSimilarity(SparseDoubleArray profile1, int profile2idx, Phrases phrases) {
        // The fetched row is disposed as soon as the similarity has been computed.
        using (SparseDoubleArray other = phrases.feature_values.GetRowSparse(profile2idx)) {
            double dot = profile1.Dot(other);
            double denominator = profile1.Norm() * other.Norm();
            double similarity = dot / denominator;
            // Only a strictly positive quotient is returned as-is; anything else becomes 0.
            return (similarity > 0) ? similarity : 0;
        }
    }

首先，我被迫将var sortedIdxSim从IEnumerable转换为List；这是因为我需要知道这个列表中的元素数量，而在IEnumerable上调用.Count()似乎会清除IEnumerable中保存的数据？同样，在IEnumerable<int> sortedIdx上调用.Take()（例如，按照Gjeltema最初的建议）似乎会清除IEnumerable<double> sortedSim中的数据。这是延迟执行造成的吗？我不太熟悉惰性求值/延迟执行，所以也许我误解了这里需要做什么。

但是，老实说，目前的改动已大大减少了我的已提交内存，使程序实际上可以运行完成，所以非常感谢！如果有人可以帮我澄清上述问题，那就太棒了。

Answer 1

一个问题是你过早地声明了一堆临时集合，并用看起来远远超出实际需要的大小来初始化它们。然后你又通过给它们赋其他值，丢弃了为它们分配的内存。这可能不是你的主要问题，因为你几乎立即就丢弃了初始化的集合（使其可以被垃圾回收），但我确信这没有帮助。

例如，您像这样初始化neighbors：

List<int> neighbors = new List<int>(phrases.uniquePhraseCount);

然后在neighbors的第一次使用时，为它分配一个新的集合，丢弃你为它分配的内存：

neighbors = generateNeighbors(nonzeros, features.inverted_idx);

所以，首先，你要想要摆脱所有那些你没有使用的早期初始化，这可能会占用相当大的一块内存。

接下来，你大量使用了Linq语句，大概是为了便于获取你想要的数据，这很好。但是，你对所有内容都调用了.ToList()，而不是让它们延迟加载，因此没有利用Linq的一项功能（在内存紧张的情况下尤其有用）。

我已经检查了你的功能并删除了我上面提到的初始化，并且也改变了合理的延迟加载（即删除了.ToList()次调用）。

（请注意，我保留了neighbors初始化处的.ToList()调用，因为去掉它不会带来太多收益（而且很难判断neighbors会有多大）。如果您仍然遇到内存问题，我建议您将generateNeighbors()的返回类型更改为IEnumerable，删除其中的.ToList()，并尝试此操作。）

// Side note - your simMethod argument doesn't seem to be used.
public SimMatrix(string sparseMethod, string simMethod, Phrases phrases, DistrFeature features, int topK) {
    List<int> rows = new List<int>(phrases.uniquePhraseCount * topK);
    List<int> cols = new List<int>(phrases.uniquePhraseCount * topK);
    List<double> vals = new List<double>(phrases.uniquePhraseCount * topK);
    if (sparseMethod.Equals("invIdx")) {
        for (int i = 0; i < phrases.uniquePhraseCount; i++) { //loop through all phrases
            using (SparseDoubleArray row = phrases.feature_values.GetRowSparse(i)) {
                if (phrases.feature_values.RowLength(i) > 0) { //i.e., at least one feature fired for phrase
                    // Declare your temporary collections when they're initialized
                    IEnumerable<int> nonzeros = row.Elements.Select(pmi => pmi.IndexList[1]);
                    var neighbors = generateNeighbors(nonzeros, features.inverted_idx);
                    IEnumerable<double> simVals = neighbors.Select(x => cosineSimilarity(row, phrases.feature_values.GetRowSparse(x)));
                    var sortedIdxSim = neighbors.Zip(simVals, (a, b) => new { idx = a, sim = b }).OrderByDescending(pair => pair.sim);
                    IEnumerable<int> sortedIdx = sortedIdxSim.Select(pair => pair.idx);
                    IEnumerable<double> sortedSim = sortedIdxSim.Select(pair => pair.sim);
                    int sortedInxSimCount = sortedIdxSim.Count();
                    int topN = (sortedInxSimCount < topK) ? sortedInxSimCount : topK;
                    rows.AddRange(Enumerable.Repeat(i, topN));
                    cols.AddRange(sortedIdx.Take(topN));
                    vals.AddRange(sortedSim.Take(topN));
                }
                else { //just add self similarity
                    rows.Add(i);
                    cols.Add(i);
                    vals.Add(1);
                }
                Console.WriteLine("{0} phrases done", i + 1);
            }
        }
    }
    else { Console.WriteLine("Sorry, no other sparsification method implemented thus far"); }
    simMat = new SparseDoubleArray(phrases.uniquePhraseCount, phrases.uniquePhraseCount, rows, cols, vals);
}

static private List<int> generateNeighbors(IEnumerable<int> idx, Dictionary<int, List<int>> inverted_idx) {
    // Doing it this way will reduce memory usage since you won't be creating a bunch of temporary
    // collections, adding them to an existing collection, then creating a brand new collection
    // from it that is smaller... I think that may have been spiking your memory usage quite a bit.
    return inverted_idx.Where(x => idx.Contains(x.Key)).SelectMany(x => x.Value).Distinct().ToList();
}

这会大大降低您的峰值内存使用量。如果您仍然遇到内存问题，请回过头来更新您的问题 - 我可能需要查看更多代码并获取有关此时运行此类型号的更多信息。

最后一点注意事项 - 您似乎至少在向rows中添加看起来相同的值，cols和vals可能也是如此。您是否需要在这些集合中保留重复值（看起来您可能确实需要）？如果它们没有超过各自的初始化容量（phrases.uniquePhraseCount * topK），这不是真正的问题；但如果超过了，那么这可能就是你的主要内存问题。

编辑：我刚注意到别的东西。SparseDoubleArray是什么类，GetRowSparse()又做了什么？

具体来说，我想知道为什么你对这个类使用using，就像这样：

using (SparseDoubleArray row = phrases.feature_values.GetRowSparse(i))

这会使它在块完成后释放其本机资源。但是，你也在这里调用了它：

simVals.Add(cosineSimilarity(row, phrases.feature_values.GetRowSparse(neighbor)));

但没有对其调用Dispose()。那个函数和那个类里发生了什么？using是不需要的，还是确实需要？如果需要，那可能就是你的内存泄漏。

.NET内存管理 - 提交增长导致OOM

1 个答案: