Question

我想以最大吞吐量处理某些文件。文件的路径保存在数据库中。我需要从数据库获取文件路径，将其状态更改为正在处理，对其进行处理，然后将其状态更改为已完成或失败。

目前，我分批获取文件（每批100个文件）以减少查询次数，并以并行方式处理（并行度为10）。但是这样一来，每个批次接近结束时吞吐量都会下降：当批次中剩余的文件少于10个时，并行度就不再是10，而是随之降低。

这是我所拥有的：

/// <summary>
/// Repeatedly fetches batches of pending files for the given source path and
/// processes each batch in parallel until a fetch returns no files.
/// </summary>
/// <remarks>
/// For every batch: the files are marked in progress and saved, processed with
/// PLINQ, and then their resulting statuses are saved. The PLINQ <c>ForAll</c>
/// call blocks until the whole batch has finished, so the next batch is only
/// fetched once the slowest file of the current batch is done.
/// </remarks>
/// <param name="sourcePath">Source path whose <c>Id</c> selects the files to fetch.</param>
/// <param name="options">Options passed through to <c>ProcessFile</c> for every file.</param>
/// <returns>A task that completes once a fetched batch is empty.</returns>
private async Task CopyPendingFilesAsync(SourcePath sourcePath, Options options)
{
    while (true)
    {
        var fileBatch = _sourceFileService.GetSourceFileBatchBySourcePathId(
            sourcePath.Id, _dataSourceExportConfig.FileCopyBatchSize, Status.Pending);
        if (fileBatch.Count == 0)
            return;

        await SetInProgressStatusForBatch(fileBatch)
            .ConfigureAwait(false);

        // NOTE(review): destinationBase is not declared in this snippet;
        // presumably it is a field of the enclosing class - confirm.
        fileBatch
            .AsParallel()
            .WithDegreeOfParallelism(_dataSourceExportConfig.FileCopyDegreeOfParallelism)
            .ForAll(file => ProcessFile(file, destinationBase, options));

        // Persist the Success/Failed statuses that ProcessFile set on each file.
        await _sourceFileService
            .UpdateSourceFilesStatusAsync(fileBatch)
            .ConfigureAwait(false);
    }
}

/// <summary>
/// Marks every file in the batch as in progress and then saves the batch
/// through the source file service.
/// </summary>
/// <param name="fileBatch">Files whose status is set to <c>Status.InProgress</c>.</param>
/// <returns>A task that completes when the status update has been awaited.</returns>
private async Task SetInProgressStatusForBatch(IEnumerable<SourceFile> fileBatch)
{
    foreach (var sourceFile in fileBatch)
    {
        sourceFile.Status = Status.InProgress;
    }

    var saveOperation = _sourceFileService.UpdateSourceFilesStatusAsync(fileBatch);
    await saveOperation.ConfigureAwait(false);
}

/// <summary>
/// Processes a single file and records the outcome on the file itself:
/// <c>Status.Success</c> with a cleared exception message, or
/// <c>Status.Failed</c> with the exception's message.
/// </summary>
/// <param name="file">File to process; its status and exception message are overwritten.</param>
/// <param name="destinationBase">Not used by the code visible in this snippet.</param>
/// <param name="options">Not used by the code visible in this snippet.</param>
private void ProcessFile(SourceFile file, string destinationBase, Options options)
{
    try
    {
        //do something ...

        file.ExceptionMessage = null;
        file.Status = Status.Success;
    }
    catch (Exception failure)
    {
        // Failures are logged and stored on the file instead of being rethrown.
        _logger.Error(failure);
        file.ExceptionMessage = failure.Message;
        file.Status = Status.Failed;
    }
}

如何最大化吞吐量？我阅读了有关BlockingCollection，TPL Dataflow和Rx的生产者-消费者模式的信息，我很确定自己想要实现的目标可以通过上述任何一种实现，但是到目前为止我还不能做到。使用生产者－消费者模式，与消费者相比，我的生产者非常快，有了TPL Dataflow，我陷入了BatchBlock的困境，而我没有尝试过Rx。有人可以指出正确的方向吗？

更新：这是一个最小，完整和可验证的示例：

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Threading;

namespace ConsoleApp1
{
    /// <summary>
    /// Console entry point: processes all pending files, prints each file's id
    /// and final status, then prints the elapsed time.
    /// </summary>
    internal static class Program
    {
        private static void Main()
        {
            Console.WriteLine("Processing files");

            // Timing starts before the service is constructed.
            var timer = Stopwatch.StartNew();

            var service = new FileService();
            service.ProcessPendingFiles();

            foreach (var processed in service.SourceFiles)
            {
                Console.WriteLine($"{processed.Id} {processed.Status}");
            }

            Console.WriteLine(timer.Elapsed);

            // Wait for Enter before the program exits.
            Console.ReadLine();
        }
    }

    /// <summary>
    /// Minimal stand-in for the real file service: processes pending files in
    /// fixed-size batches, running each batch in parallel.
    /// </summary>
    public class FileService
    {
        private const int BatchSize = 100;
        private const int DegreeOfParallelism = 10;

        /// <summary>
        /// In-memory replacement for the Sqlite database where the data is actually
        /// stored: 1000 files, all initially pending.
        /// </summary>
        public ICollection<SourceFile> SourceFiles =
            Enumerable
                .Range(0, 1000)
                .Select(id =>
                    new SourceFile
                    {
                        Id = id,
                        Path = "source file path",
                        Status = Status.Pending,
                    })
                .ToList();

        /// <summary>
        /// Fetches and processes batches of pending files until none are left.
        /// Each batch is fully processed and saved before the next one is fetched.
        /// </summary>
        public void ProcessPendingFiles()
        {
            ICollection<SourceFile> pendingBatch;
            while ((pendingBatch = GetSourceFileBatch(BatchSize, Status.Pending)).Count > 0)
            {
                SetInProgressStatusForBatch(pendingBatch);

                // ForAll blocks until every file of this batch has been processed.
                pendingBatch
                    .AsParallel()
                    .WithDegreeOfParallelism(DegreeOfParallelism)
                    .ForAll(ProcessFile);

                UpdateSourceFiles(pendingBatch);
            }
        }

        /// <summary>
        /// Returns up to <paramref name="batchSize"/> files that currently have the
        /// given status, as a new list.
        /// </summary>
        private ICollection<SourceFile> GetSourceFileBatch(int batchSize, Status status)
        {
            var matching = from candidate in SourceFiles
                           where candidate.Status == status
                           select candidate;

            return matching.Take(batchSize).ToList();
        }

        /// <summary>
        /// Sets the status to in progress for all files in the batch and saves the
        /// change to the "database". In the application this is actually done with
        /// a bulk update and the method is async.
        /// </summary>
        private void SetInProgressStatusForBatch(IEnumerable<SourceFile> fileBatch)
        {
            foreach (var batchFile in fileBatch)
            {
                batchFile.Status = Status.InProgress;

                // Look the record up by id and copy the status onto it.
                SourceFiles.First(stored => stored.Id == batchFile.Id).Status = batchFile.Status;
            }
        }

        /// <summary>
        /// Saves status and exception message for all files in the batch to the
        /// "database". In the application this is actually done with a bulk update
        /// and the method is async.
        /// </summary>
        private void UpdateSourceFiles(IEnumerable<SourceFile> fileBatch)
        {
            foreach (var batchFile in fileBatch)
            {
                var stored = SourceFiles.First(candidate => candidate.Id == batchFile.Id);
                stored.Status = batchFile.Status;
                stored.ExceptionMessage = batchFile.ExceptionMessage;
            }
        }

        /// <summary>
        /// Simulates processing one file with a 20 ms sleep and records success or
        /// failure on the file.
        /// </summary>
        private void ProcessFile(SourceFile file)
        {
            try
            {
                //do something ...
                Thread.Sleep(20);

                file.ExceptionMessage = null;
                file.Status = Status.Success;
            }
            catch (Exception failure)
            {
                file.ExceptionMessage = failure.Message;
                file.Status = Status.Failed;
            }
        }
    }

    /// <summary>
    /// A file record tracked by <c>FileService</c>, carrying its processing state.
    /// </summary>
    public class SourceFile
    {
        /// <summary>Identifier used to look the record up in <c>FileService.SourceFiles</c>.</summary>
        public int Id { get; set; }

        /// <summary>Path of the source file.</summary>
        public string Path { get; set; }

        /// <summary>Current processing status of the file.</summary>
        public Status Status { get; set; }

        /// <summary>
        /// Message of the exception caught while processing; set to null when
        /// processing succeeds.
        /// </summary>
        public string ExceptionMessage { get; set; }
    }

    /// <summary>
    /// Processing state of a <c>SourceFile</c>.
    /// </summary>
    public enum Status
    {
        // Initial state; only files in this state are fetched for processing.
        Pending,

        // Set on every file of a batch before the batch is processed.
        InProgress,

        // Set when processing the file completed without an exception.
        Success,

        // Set when processing the file threw an exception.
        Failed,
    }
}

Answer 1

我知道您可能会讨厌这个答案，但最终，这取决于...

我不确定这些文件是什么、存放在哪里，也不确定“处理”具体指什么。我的回答假设您对当前的峰值处理速度感到满意，只是需要一种更好的方法来保证性能始终如一，而不会在每次操作接近尾声时下降。我将尝试通过把生产者-消费者模式与BlockingCollection结合使用来回答您更直接的问题，而不是改变整个方法。

我认为您已经明白速度为什么会下降——因为只有在当前批次全部完成之后才会获取下一批项目——只是不确定该如何解决。（不用说，这可能是一个适合使用消息队列而不是SQL的场景，但那是另一个话题，偏离了您的主要问题。）

以下问题已经得到了相当详细的回答：

classic producer consumer pattern using blockingcollection and tasks .net 4 TPL

/// <summary>
/// Producer-consumer skeleton: callers enqueue items with <see cref="Produce"/>
/// and a single background thread, started in the constructor, consumes them.
/// </summary>
public class YourCode
{
  // Queue shared between the producers and the consumer thread.
  // readonly: the consumer thread must always see the same instance.
  private readonly BlockingCollection<object> queue = new BlockingCollection<object>();

  /// <summary>
  /// Starts the consumer on a background thread, so it does not keep the
  /// process alive on its own.
  /// </summary>
  public YourCode()
  {
    var thread = new Thread(StartConsuming) { IsBackground = true };
    thread.Start();
  }

  /// <summary>Adds an item to the queue for the consumer thread.</summary>
  /// <param name="item">Item to enqueue.</param>
  public void Produce(object item)
  {
    queue.Add(item);
  }

  /// <summary>
  /// Consumer loop: blocks while the queue is empty and handles items one at a time.
  /// </summary>
  private void StartConsuming()
  {
    // GetConsumingEnumerable removes items as they are enumerated and ends the
    // loop once the collection is marked complete and drained, whereas
    // Take() inside while (true) could only stop by throwing.
    foreach (object item in queue.GetConsumingEnumerable())
    {
      // Add your code to process the item here.
      // Do not start another task or thread. 
    }
  }
}

然后您可以有一个生产者有多个消费者（因为您确实指出生产速度比消费速度快）

Answer 2

该操作当然可以通过您提到的TPL-Dataflow来完成，但是很难知道您是否真正看到了吞吐量的增长。使用任何性能指标，您最好的办法就是尝试不同的方法并衡量结果。

此示例包括最相关的选项，以调整数据流的行为，以便您可以进行实验。该结构大致基于您的示例代码，并带有一些假设。

一个SourcePath产生一批SourceFile
更新SourceFile状态是异步操作
处理SourceFile是同步操作

示例：

/// <summary>
/// TPL Dataflow pipeline with four linked stages: fetch a batch of files for a
/// source path, mark the batch in progress, process it, and mark it completed.
/// </summary>
public class ProcessFilesFlow
{
    private readonly TransformBlock<SourcePath, IEnumerable<SourceFile>> _getSourceFileBatch;
    private readonly TransformBlock<IEnumerable<SourceFile>, IEnumerable<SourceFile>> _setStatusToProcessing;
    private readonly TransformBlock<IEnumerable<SourceFile>, IEnumerable<SourceFile>> _processFiles;
    private readonly ActionBlock<IEnumerable<SourceFile>> _setStatusToComplete;

    /// <summary>
    /// Creates the four blocks and links them so that completion propagates
    /// from the first block to the last.
    /// </summary>
    public ProcessFilesFlow()
    {
        // Per-stage options. Each stage has its own instance so that capacity,
        // parallelism and ordering can be tuned independently for throughput.
        var batchFetchOptions = new ExecutionDataflowBlockOptions
        {
            BoundedCapacity = 10,        // source paths queued at one time
            MaxDegreeOfParallelism = 10, // source paths fetched concurrently
            EnsureOrdered = false        // emit batches as soon as they are ready
        };
        var markInProgressOptions = new ExecutionDataflowBlockOptions
        {
            BoundedCapacity = 10,        // batches queued at one time
            MaxDegreeOfParallelism = 10, // batches whose status is updated concurrently
            EnsureOrdered = false        // emit batches as soon as they are ready
        };
        var processingOptions = new ExecutionDataflowBlockOptions
        {
            BoundedCapacity = 10,        // batches queued at one time
            MaxDegreeOfParallelism = 10, // batches processed concurrently
            EnsureOrdered = false        // emit batches as soon as they are ready
        };
        var markCompleteOptions = new ExecutionDataflowBlockOptions
        {
            BoundedCapacity = 10,        // batches queued at one time
            MaxDegreeOfParallelism = 10, // batches updated concurrently
            EnsureOrdered = false        // handle batches as soon as they are ready
        };

        // Pipeline stages.
        _getSourceFileBatch = new TransformBlock<SourcePath, IEnumerable<SourceFile>>(
            path => GetSourceFileBatch(path),
            batchFetchOptions);
        _setStatusToProcessing = new TransformBlock<IEnumerable<SourceFile>, IEnumerable<SourceFile>>(
            batch => SetStatusToProcessingAsync(batch),
            markInProgressOptions);
        _processFiles = new TransformBlock<IEnumerable<SourceFile>, IEnumerable<SourceFile>>(
            batch => ProcessFiles(batch),
            processingOptions);
        _setStatusToComplete = new ActionBlock<IEnumerable<SourceFile>>(
            batch => SetStatusToCompleteAsync(batch),
            markCompleteOptions);

        // Wire the stages together; completion flows downstream.
        var propagateCompletion = new DataflowLinkOptions { PropagateCompletion = true };
        _getSourceFileBatch.LinkTo(_setStatusToProcessing, propagateCompletion);
        _setStatusToProcessing.LinkTo(_processFiles, propagateCompletion);
        _processFiles.LinkTo(_setStatusToComplete, propagateCompletion);
    }

    /// <summary>
    /// Sends every source path into the pipeline, signals that no more input is
    /// coming, and waits for the final stage to complete.
    /// </summary>
    /// <param name="sourcePaths">Source paths to feed into the first block.</param>
    /// <returns>A task that completes when the last block has completed.</returns>
    public async Task ProcessAll(IEnumerable<SourcePath> sourcePaths)
    {
        foreach (var sourcePath in sourcePaths)
        {
            await _getSourceFileBatch.SendAsync(sourcePath);
        }

        _getSourceFileBatch.Complete();
        await _setStatusToComplete.Completion;
    }

    /// <summary>Placeholder for fetching the batch of files that belongs to a source path.</summary>
    private IEnumerable<SourceFile> GetSourceFileBatch(SourcePath sourcePath)
    {
        //Get batch of files based on sourcePath
        return Enumerable.Empty<SourceFile>();
    }

    /// <summary>
    /// Updates each file's status to "In Progress", one file at a time, and
    /// passes the same batch on to the next stage.
    /// </summary>
    private async Task<IEnumerable<SourceFile>> SetStatusToProcessingAsync(IEnumerable<SourceFile> sourceFiles)
    {
        foreach (var sourceFile in sourceFiles)
        {
            await sourceFile.UpdateStatusAsync("In Progress");
        }

        return sourceFiles;
    }

    /// <summary>
    /// Processes each file of the batch synchronously and passes the same batch on.
    /// </summary>
    private IEnumerable<SourceFile> ProcessFiles(IEnumerable<SourceFile> sourceFiles)
    {
        foreach (var sourceFile in sourceFiles)
        {
            sourceFile.Process();
        }

        return sourceFiles;
    }

    /// <summary>Updates each file's status to "Completed", one file at a time.</summary>
    private async Task SetStatusToCompleteAsync(IEnumerable<SourceFile> sourceFiles)
    {
        foreach (var sourceFile in sourceFiles)
        {
            await sourceFile.UpdateStatusAsync("Completed");
        }
    }
}

其他选项也可用，例如用TransformManyBlock拆分批处理以及并行处理批处理中的单个文件。

Answer 3

这是磁盘操作，并行化并不能很好地解决这类问题。磁盘的物理吞吐量是有限的，用大量请求轰炸它只会给整个计算增加寻道时间。虽然有NCQ之类的功能会尝试减轻这种影响，但这些功能也有其局限性。

至少对于网络而言，并行化会带来一些效果：

在一个请求处于“协议开销”阶段时，让其他请求使用传输介质
避开可能存在的“每个连接”限制

但即使在那儿，也有硬性限制。

要获得快速的磁盘操作，最好的办法是不要使用糟糕的后端磁盘，即不使用机械（旋转）磁盘，或者至少以RAID 0或类似结构来组织它们。

Answer 4

工作者模式应该为您简化事情，并确保您始终并行处理一致数量的工作单元。

例如，如果您先创建10个任务，并让每个任务不断领取新的工作项，直到没有剩余工作为止，那么您就不再需要等待所有线程或任务全部完成之后才能开始下一批。

/// <summary>
/// Worker-pattern controller: a fixed number of workers each keep pulling the
/// next file from the source file service until none are left.
/// </summary>
class WorkController
{
    // Number of workers started by CopyPendingFilesAsync.
    private const int WorkerCount = 10;

    private DataSourceExportConfig _dataSourceExportConfig;
    private SourceFileService _sourceFileService;
    private string destinationBase;

    /// <summary>
    /// Starts <c>WorkerCount</c> workers and waits for all of them to finish.
    /// </summary>
    /// <param name="sourcePath">Passed through to each worker.</param>
    /// <param name="options">Passed through to each worker.</param>
    /// <returns>A task that completes when every worker has completed.</returns>
    public async Task CopyPendingFilesAsync(SourcePath sourcePath, Options options)
    {
        // Materialize the sequence so that all workers are started before waiting.
        var workers = Enumerable
            .Range(0, WorkerCount)
            .Select(_ => Worker(sourcePath, options))
            .ToList();

        await Task.WhenAll(workers);
    }

    /// <summary>
    /// Runs one worker loop on the thread pool: takes files from the service
    /// and processes them until <c>GetNextFile</c> returns false.
    /// </summary>
    /// <remarks>
    /// The loop is synchronous. An <c>async</c> method without an <c>await</c>
    /// would run to completion on the caller's thread before returning, so
    /// several workers started in a row would execute one after another.
    /// <c>Task.Run</c> makes them run concurrently.
    /// NOTE(review): this assumes GetNextFile is safe to call from several
    /// threads at once - confirm against SourceFileService.
    /// </remarks>
    /// <param name="sourcePath">Not used by the code visible in this snippet.</param>
    /// <param name="options">Passed through to <c>ProcessFile</c>.</param>
    /// <returns>A task representing the running worker loop.</returns>
    public Task Worker(SourcePath sourcePath, Options options)
    {
        return Task.Run(() =>
        {
            SourceFile file;
            while (_sourceFileService.GetNextFile(out file))
            {
                ProcessFile(file, destinationBase, options);
            }
        });
    }

    // Placeholder for the per-file work.
    private void ProcessFile(SourceFile file, string destinationBase, Options options)
    {
    }
}

如何最大化进程吞吐量（C＃）？

4 个答案: