I am writing a method to copy files evenly across multiple partitions for processing. What I am doing now seems to work fine, but I feel there may be a better way to do it.
I have the following questions about this process:
1. Is there a better way than the one I am using (that you know of) to distribute files evenly across partitions by size?
2. Because I am copying to multiple partitions on multiple servers, would implementing multithreading for the file copies benefit me, or would I still be limited by the output of the disk transferring these files?
My method for creating the equal file groupings is as follows:
/// <summary>
/// Distributes a list of files into groups based on their size.
/// </summary>
/// <param name="files">The list of files to distribute.</param>
/// <param name="partitionCount">The number of partitions to distribute across.</param>
/// <returns>A balanced array of file lists for each partition.</returns>
public List<SourceFile>[] Distribute(List<SourceFile> files, int partitionCount)
{
    long totalSize = files.Sum(sf => sf.Size);
    long groupGoal = totalSize / partitionCount;
    List<SourceFile> sourceFilesSorted = files.OrderByDescending(sf => sf.Size).ToList();
    List<SourceFile>[] groups = Enumerable.Range(0, partitionCount).Select(l => new List<SourceFile>()).ToArray();

    // First pass: place files, largest first, skipping any group the file would push over the goal.
    int nGroup = 0, attempt = 1;
    long acceptedGoal = groupGoal;
    while (sourceFilesSorted.Count > 0)
    {
        WriteLine("Attempt {0} at initial file grouping, tolerance {1}...", attempt++, acceptedGoal);
        bool anySuccess = false;
        foreach (SourceFile sf in sourceFilesSorted.ToList())
        {
            // An empty group always takes the file.
            if (groups[nGroup].Count == 0)
            {
                groups[nGroup].Add(sf);
                sourceFilesSorted.Remove(sf);
                anySuccess = true;
                continue;
            }

            // Otherwise look for a group that can take the file without exceeding the goal.
            bool satisfied = false;
            while (!satisfied && nGroup < groups.Length)
            {
                if (groups[nGroup].Sum(gf => gf.Size) + sf.Size <= acceptedGoal)
                {
                    groups[nGroup].Add(sf);
                    sourceFilesSorted.Remove(sf);
                    anySuccess = true;
                    satisfied = true;
                }

                if (!satisfied)
                    nGroup++;
            }

            if (++nGroup >= groups.Length)
            {
                nGroup = 0;
            }
        }

        // Keep the smallest groups first so leftover files land in them on the next pass.
        if (sourceFilesSorted.Count > 0)
            groups = groups.OrderBy(g => g.Sum(gf => gf.Size)).ToArray();

        // Nothing fit this pass, so widen the tolerance.
        if (!anySuccess)
            acceptedGoal += groupGoal;
    }

    groups = groups.OrderByDescending(g => g.Sum(gf => gf.Size)).ToArray();

    // Second pass: move files out of groups above tolerance into groups that can still accept them.
    attempt = 1;
    acceptedGoal = groupGoal;
    bool hasMove = true;
    while (hasMove)
    {
        WriteLine("Attempt {0} at moving larger group files into smaller groups...", attempt);
        WriteLine("There are {0} groups above tolerance: {1}", groups.Where(g => (g.Sum(gf => gf.Size) > acceptedGoal)).Count(), acceptedGoal);

        // Begin moving files in groups where acceptable.
        List<SourceFile>[] move = Enumerable.Range(0, groups.Length).Select(l => new List<SourceFile>()).ToArray();
        for (int i = 0; i < groups.Length && i < move.Length; i++)
        {
            // WriteLine("Group {0} sum: {1}", i + 1, groups[i].Sum(sf => sf.Size));
            if (groups[i].Sum(sf => sf.Size) <= acceptedGoal)
                continue;

            foreach (SourceFile file in groups[i])
            {
                if (groups.Where(g => (g.Sum(gf => gf.Size) + file.Size <= acceptedGoal)).Any())
                {
                    move[i].Add(file);
                }
            }
        }

        long moves = move.Sum(m => m.Count);
        hasMove = move.Any(m => m.Any());
        WriteLine("Found {0} moves, {1}", moves, hasMove ? "attempting to redistribute..." : "process complete.");

        for (int i = 0; i < groups.Length; i++)
        {
            for (int j = 0; j < move.Length; j++)
            {
                foreach (SourceFile file in move[j].ToList())
                {
                    if (groups[i].Sum(sf => sf.Size) + file.Size <= acceptedGoal)
                    {
                        groups[i].Add(file);
                        groups[j].Remove(file);
                        move[j].Remove(file);
                    }
                }
            }
        }

        // Once no moves remain at the original goal, lower the tolerance to the average of the groups already under it.
        if (!hasMove && acceptedGoal == groupGoal)
        {
            var acceptedGroups = groups.Where(g => (g.Sum(gf => gf.Size) <= acceptedGoal));
            acceptedGoal = acceptedGroups.Sum(g => g.Sum(gf => gf.Size)) / acceptedGroups.Count();
            WriteLine("Lowering tolerance to {0} for {1} groups, continue distribution...", acceptedGoal, acceptedGroups.Count());
            hasMove = true;
        }
    }

    return groups;
}
First, I specify acceptedGoal, which is the target size I want to hit on each server. This is simply the total size of all the files divided by the number of partitions, which would give a perfect distribution.
After that, I sort the list of files by size, descending, and start adding them to each group, skipping any group where adding the file would push its total size past acceptedGoal.
Once a pass makes no successful additions, acceptedGoal is increased by the initial goal, which essentially just widens the tolerance each round. Before every pass begins, the groups are sorted from smallest to largest so that new files go into the currently smallest group, keeping the overall variance as low as possible.
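As a purely illustrative example (not real data): with file sizes {8, 7, 6, 5, 4, 2} spread over two partitions, groupGoal works out to 16, and the first pass puts 8, 6 and 2 in one group and 7, 5 and 4 in the other, so both groups land exactly on the goal without the tolerance ever having to widen.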
Update: I dug into this a little deeper and now make a second pass over the list. This pass computes a new, lower tolerance, which is the average size of the groups that already sit below the initial accepted tolerance.
The process then keeps trying to move files out of the groups that exceed the tolerance and into the groups below it.
So far I have gotten this down to a very small deviation from the target goal, but I am still not sure whether there is a better way.
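For reference, this is roughly how the method gets called. SourceFile here is a simplified stand-in for my real type (only Name and Size matter), and FileDistributor is just an illustrative name for the class that holds Distribute:

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;

// Simplified stand-in for the real SourceFile type; only Name and Size are assumed.
public class SourceFile
{
    public string Name { get; set; }
    public long Size { get; set; }
}

public class Example
{
    public static void Main()
    {
        // Gather the files to distribute (the path is illustrative).
        List<SourceFile> files = new DirectoryInfo(@"C:\Source")
            .EnumerateFiles("*", SearchOption.AllDirectories)
            .Select(fi => new SourceFile { Name = fi.FullName, Size = fi.Length })
            .ToList();

        // Split across four partitions and report how balanced the result is.
        // FileDistributor is a placeholder name for the class containing Distribute.
        List<SourceFile>[] groups = new FileDistributor().Distribute(files, 4);
        for (int i = 0; i < groups.Length; i++)
            Console.WriteLine("Partition {0}: {1} files, {2:N0} bytes",
                i + 1, groups[i].Count, groups[i].Sum(f => f.Size));
    }
}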
Update 2: Thanks to @Enigmativity I was able to refactor all of this again into a very clean IEnumerable method:
/// <summary>
/// Distributes a list of files into groups based on their size.
/// </summary>
/// <param name="files">The list of files to distribute.</param>
/// <param name="partitionCount">The number of partitions to distribute across.</param>
/// <returns>A balanced array of file lists for each partition.</returns>
public IEnumerable<List<SourceFile>> Distribute(List<SourceFile> files, int partitionCount)
{
    // Calculate the max fileSize tolerance per partition (the "perfect" distribution size across each disk).
    long tolerance = files.Sum(sf => sf.Size) / partitionCount;
    List<List<SourceFile>> groups = Enumerable.Range(0, partitionCount).Select(l => new List<SourceFile>()).ToList();

    // Process each file, large to small.
    foreach (var file in files.OrderByDescending(sf => sf.Size))
    {
        // Add file to the smallest current group.
        groups.OrderBy(g => g.Sum(f => f.Size)).First().Add(file);

        // If this group exceeds tolerance, return it now so we can begin processing immediately.
        List<List<SourceFile>> returnGroups = groups.Where(g => g.Sum(sf => sf.Size) > tolerance).ToList();
        foreach (var retGroup in returnGroups)
        {
            groups.Remove(retGroup);
            yield return retGroup;
        }
    }

    // Remember to return the rest of the groups, large to small.
    foreach (var retGroup in groups.OrderByDescending(g => g.Sum(sf => sf.Size)))
        yield return retGroup;
}
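Walking through the same illustrative sizes {8, 7, 6, 5, 4, 2} over two partitions: tolerance is again 16, the group that ends up holding 8, 5 and 4 (total 17) is yielded as soon as the 4 pushes it over the tolerance, and the remaining group with 7, 6 and 2 (total 15) is yielded at the end. The split is slightly less exact than the two-pass version, but each group becomes available for copying as soon as it fills up.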
Now I plan to iterate over this sequence and start a copy onto each partition as each list is created.
I am still curious whether multithreading would help this go faster, since the copies go to multiple partitions on different servers. Would I/O still be limited by the other copy processes, given that the source disk only needs to read the data and send it out over the network to the other partitions? From there, each destination partition would be using the write speed of its own disk (I think), which leads me to believe multithreading is a good idea. Does that sound right, or am I way off?
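To make the question concrete, this is roughly the consumer I have in mind (it needs System.IO and System.Threading.Tasks, and reuses the files list from the sketch above); the partition paths and the inline File.Copy loop are placeholders for illustration, not the real copy routine:

// Placeholder destinations; the real targets are partitions on other servers.
string[] partitions = { @"\\server1\d$\data", @"\\server1\e$\data", @"\\server2\d$\data" };

var copies = new List<Task>();
int next = 0;

// Start copying each group as soon as it is yielded, one task per partition.
foreach (List<SourceFile> group in Distribute(files, partitions.Length))
{
    string target = partitions[next++];
    copies.Add(Task.Run(() =>
    {
        foreach (SourceFile file in group)
            File.Copy(file.Name, Path.Combine(target, Path.GetFileName(file.Name)), true);
    }));
}

Task.WaitAll(copies.ToArray());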
如果有人有任何资源可以研究,那也会非常感激。我在网上找不到太多关于我真正理解的话题。
Answer (score: 1)
I think this does what you need:
public List<SourceFile>[] Distribute(List<SourceFile> files, int partitionCount)
{
    List<SourceFile> sourceFilesSorted =
        files
            .OrderByDescending(sf => sf.Size)
            .ToList();

    List<SourceFile>[] groups =
        Enumerable
            .Range(0, partitionCount)
            .Select(l => new List<SourceFile>())
            .ToArray();

    // Place each file, largest first, into whichever group is currently smallest.
    foreach (var f in sourceFilesSorted)
    {
        groups
            .Select(grp => new { grp, size = grp.Sum(x => x.Size) })
            .OrderBy(grp => grp.size)
            .First()
            .grp
            .Add(f);
    }

    return groups;
}
The loop is fairly inefficient, but the results for 10,000 files come back in under a second, so I expect it is fast enough. It would not be hard to keep a running array of group sizes if needed, but if the efficiency is not required it would only complicate the code.
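For completeness, a sketch of that running-sizes idea, keeping each group's total in a long[] so the inner loop never has to re-sum the lists; it is only worth doing if the simple version above ever becomes too slow:

public List<SourceFile>[] Distribute(List<SourceFile> files, int partitionCount)
{
    List<SourceFile>[] groups =
        Enumerable
            .Range(0, partitionCount)
            .Select(l => new List<SourceFile>())
            .ToArray();

    // Running total for each group, updated as files are added.
    long[] sizes = new long[partitionCount];

    foreach (var f in files.OrderByDescending(sf => sf.Size))
    {
        // Find the index of the currently smallest group without re-summing.
        int smallest = 0;
        for (int i = 1; i < sizes.Length; i++)
            if (sizes[i] < sizes[smallest])
                smallest = i;

        groups[smallest].Add(f);
        sizes[smallest] += f.Size;
    }

    return groups;
}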