我正在尝试优化一个针对大型集合(约100万个元素)的处理过程。我正在考虑使用Parallel.ForEach
,但就我的情况而言,这似乎并不一定能让处理变快。
基本上,该过程有两个方面:
我想到的一种优化是通过BlockingCollection
传递集合中已经处理完的元素。
理论上,用Parallel.ForEach
处理第1步比顺序处理要快;而IO部分只能在CPU密集型操作产出结果之后进行(并且由于相关的数据库锁定,这一部分必须顺序完成),因此至少第1步得到了优化,速度更快。
/// <summary>
/// Benchmark host comparing a producer/consumer pipeline
/// (<c>Parallel.ForEach</c> feeding a <c>BlockingCollection</c>) with a purely
/// sequential loop over the same items.
/// </summary>
public static class Program
{
    /// <summary>
    /// BenchmarkDotNet benchmark class. Each benchmark runs a small computation per
    /// item and then writes the result to an in-memory stream, one item at a time.
    /// </summary>
    [CoreJob]
    [RPlotExporter, RankColumn]
    public class Paralleling
    {
        private IEnumerable<int> _items;

        /// <summary>Creates the 1000-item input sequence shared by all benchmarks.</summary>
        [GlobalSetup]
        public void Setup()
        {
            _items = Enumerable.Range(0, 1000);
        }

        /// <summary>
        /// Computes the Ackermann function; used here as the per-item computation.
        /// </summary>
        /// <param name="m">First argument; must be non-negative.</param>
        /// <param name="n">Second argument; must be non-negative.</param>
        /// <returns>The value of A(m, n).</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="m"/> or <paramref name="n"/> is negative.
        /// </exception>
        public static long Ackermann(long m, long n)
        {
            if (m < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(m));
            }

            if (n < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(n));
            }

            if (m == 0)
            {
                return n + 1;
            }

            if (n == 0)
            {
                return Ackermann(m - 1, 1);
            }

            return Ackermann(m - 1, Ackermann(m, n - 1));
        }

        /// <summary>
        /// Runs the computation in parallel on a background task and hands the results,
        /// through a <c>BlockingCollection</c>, to a single sequential writer.
        /// </summary>
        /// <returns>A task that completes when every result has been written.</returns>
        [Benchmark]
        public async Task ParallelAndSequential()
        {
            using (var blockingCollection = new BlockingCollection<(bool, int, int)>())
            {
                // Keep the producer task so that its failures are observed below
                // instead of being silently dropped.
                var producer = Task.Run(() =>
                {
                    try
                    {
                        Parallel.ForEach(_items, item =>
                        {
                            Ackermann(0, 2);
                            var dummy = item % 2 == 0;
                            blockingCollection.Add((dummy, item, Thread.CurrentThread.ManagedThreadId));
                        });
                    }
                    finally
                    {
                        // Always signal completion; otherwise a failing producer would
                        // leave the consumer blocked in GetConsumingEnumerable forever.
                        blockingCollection.CompleteAdding();
                    }
                });

                try
                {
                    using (var streamWriter = new StreamWriter(new MemoryStream()))
                    {
                        // Sequential consumer: one write plus a 10 ms delay per result.
                        foreach (var result in blockingCollection.GetConsumingEnumerable())
                        {
                            await streamWriter.WriteLineAsync(result.ToString());
                            await Task.Delay(10);
                        }
                    }
                }
                finally
                {
                    // Wait for the producer before the collection is disposed and
                    // surface any exception it threw.
                    await producer;
                }
            }
        }

        /// <summary>
        /// Baseline: computes and writes every item one after another on the calling flow.
        /// </summary>
        /// <returns>A task that completes when every result has been written.</returns>
        [Benchmark]
        public async Task AllSequential()
        {
            using (var streamWriter = new StreamWriter(new MemoryStream()))
            {
                foreach (var item in _items)
                {
                    Ackermann(0, 2);
                    var dummy = item % 2 == 0;
                    var result = (dummy, item, Thread.CurrentThread.ManagedThreadId);
                    await streamWriter.WriteLineAsync(result.ToString());
                    await Task.Delay(10);
                }
            }
        }
    }

    /// <summary>Entry point: runs all benchmarks declared on <see cref="Paralleling"/>.</summary>
    /// <param name="args">Command-line arguments (unused).</param>
    public static void Main(params string[] args)
    {
        BenchmarkRunner.Run<Paralleling>();
    }
}
基准测试结果
// Validating benchmarks:
// ***** BenchmarkRunner: Start *****
// ***** Found 2 benchmark(s) in total *****
// ***** Building 1 exe(s) in Parallel: Start *****
// start dotnet restore /p:UseSharedCompilation=false /p:BuildInParallel=false /m:1 in C:\Users\eperret\Desktop\Playground\ConsoleApp\ConsoleApp\ConsoleApp\bin\Release\netcoreapp2.2\e6babe6d-16ff-42cd-aa3e-d457250f812c
// command took 1.83s and exited with 0
// start dotnet build -c Release --no-restore /p:UseSharedCompilation=false /p:BuildInParallel=false /m:1 in C:\Users\eperret\Desktop\Playground\ConsoleApp\ConsoleApp\ConsoleApp\bin\Release\netcoreapp2.2\e6babe6d-16ff-42cd-aa3e-d457250f812
c
// command took 3.48s and exited with 0
// ***** Done, took 00:00:05 (5.46 sec) *****
// Found 2 benchmarks:
// Paralleling.ParallelAndSequential: Core(Runtime=Core)
// Paralleling.AllSequential: Core(Runtime=Core)
Setup power plan (GUID: 8c5e7fda-e8bf-4a96-9a85-a6e23a8c635c FriendlyName: High performance)// **************************
// Benchmark: Paralleling.ParallelAndSequential: Core(Runtime=Core)
// *** Execute ***
// Launch: 1 / 1
// Execute: dotnet "e6babe6d-16ff-42cd-aa3e-d457250f812c.dll" --benchmarkName "ConsoleApp.Program+Paralleling.ParallelAndSequential" --job "Core" --benchmarkId 0 in C:\Users\eperret\Desktop\Playground\ConsoleApp\ConsoleApp\ConsoleApp\bin\Re
lease\netcoreapp2.2\e6babe6d-16ff-42cd-aa3e-d457250f812c\bin\Release\netcoreapp2.2
// BeforeAnythingElse
// Benchmark Process Environment Information:
// Runtime=.NET Core 2.2.3 (CoreCLR 4.6.27414.05, CoreFX 4.6.27414.05), 64bit RyuJIT
// GC=Concurrent Workstation
// Job: Core(Runtime=Core)
OverheadJitting 1: 1 op, 595300.00 ns, 595.3000 us/op
WorkloadJitting 1: 1 op, 15646340800.00 ns, 15.6463 s/op
WorkloadWarmup 1: 1 op, 15623246700.00 ns, 15.6232 s/op
WorkloadWarmup 2: 1 op, 15633394200.00 ns, 15.6334 s/op
WorkloadWarmup 3: 1 op, 15621610400.00 ns, 15.6216 s/op
WorkloadWarmup 4: 1 op, 15623904400.00 ns, 15.6239 s/op
WorkloadWarmup 5: 1 op, 15628894600.00 ns, 15.6289 s/op
WorkloadWarmup 6: 1 op, 15619927500.00 ns, 15.6199 s/op
// BeforeActualRun
WorkloadActual 1: 1 op, 15622656400.00 ns, 15.6227 s/op
WorkloadActual 2: 1 op, 15625515000.00 ns, 15.6255 s/op
WorkloadActual 3: 1 op, 15615469600.00 ns, 15.6155 s/op
WorkloadActual 4: 1 op, 15631936300.00 ns, 15.6319 s/op
WorkloadActual 5: 1 op, 15619036800.00 ns, 15.6190 s/op
WorkloadActual 6: 1 op, 15622770800.00 ns, 15.6228 s/op
WorkloadActual 7: 1 op, 15625282100.00 ns, 15.6253 s/op
WorkloadActual 8: 1 op, 15621714600.00 ns, 15.6217 s/op
WorkloadActual 9: 1 op, 15641690200.00 ns, 15.6417 s/op
WorkloadActual 10: 1 op, 15661029200.00 ns, 15.6610 s/op
WorkloadActual 11: 1 op, 15625002000.00 ns, 15.6250 s/op
WorkloadActual 12: 1 op, 15614647200.00 ns, 15.6146 s/op
WorkloadActual 13: 1 op, 15630444900.00 ns, 15.6304 s/op
WorkloadActual 14: 1 op, 15620751600.00 ns, 15.6208 s/op
WorkloadActual 15: 1 op, 15639731400.00 ns, 15.6397 s/op
// AfterActualRun
WorkloadResult 1: 1 op, 15622656400.00 ns, 15.6227 s/op
WorkloadResult 2: 1 op, 15625515000.00 ns, 15.6255 s/op
WorkloadResult 3: 1 op, 15615469600.00 ns, 15.6155 s/op
WorkloadResult 4: 1 op, 15631936300.00 ns, 15.6319 s/op
WorkloadResult 5: 1 op, 15619036800.00 ns, 15.6190 s/op
WorkloadResult 6: 1 op, 15622770800.00 ns, 15.6228 s/op
WorkloadResult 7: 1 op, 15625282100.00 ns, 15.6253 s/op
WorkloadResult 8: 1 op, 15621714600.00 ns, 15.6217 s/op
WorkloadResult 9: 1 op, 15641690200.00 ns, 15.6417 s/op
WorkloadResult 10: 1 op, 15625002000.00 ns, 15.6250 s/op
WorkloadResult 11: 1 op, 15614647200.00 ns, 15.6146 s/op
WorkloadResult 12: 1 op, 15630444900.00 ns, 15.6304 s/op
WorkloadResult 13: 1 op, 15620751600.00 ns, 15.6208 s/op
WorkloadResult 14: 1 op, 15639731400.00 ns, 15.6397 s/op
GC: 0 0 0 0 0
// AfterAll
Mean = 15.6255 s, StdErr = 0.0022 s (0.01%); N = 14, StdDev = 0.0081 s
Min = 15.6146 s, Q1 = 15.6208 s, Median = 15.6239 s, Q3 = 15.6304 s, Max = 15.6417 s
IQR = 0.0097 s, LowerFence = 15.6062 s, UpperFence = 15.6450 s
ConfidenceInterval = [15.6164 s; 15.6346 s] (CI 99.9%), Margin = 0.0091 s (0.06% of Mean)
Skewness = 0.66, Kurtosis = 2.36, MValue = 2
// **************************
// Benchmark: Paralleling.AllSequential: Core(Runtime=Core)
// *** Execute ***
// Launch: 1 / 1
// Execute: dotnet "e6babe6d-16ff-42cd-aa3e-d457250f812c.dll" --benchmarkName "ConsoleApp.Program+Paralleling.AllSequential" --job "Core" --benchmarkId 1 in C:\Users\eperret\Desktop\Playground\ConsoleApp\ConsoleApp\ConsoleApp\bin\Release\ne
tcoreapp2.2\e6babe6d-16ff-42cd-aa3e-d457250f812c\bin\Release\netcoreapp2.2
// BeforeAnythingElse
// Benchmark Process Environment Information:
// Runtime=.NET Core 2.2.3 (CoreCLR 4.6.27414.05, CoreFX 4.6.27414.05), 64bit RyuJIT
// GC=Concurrent Workstation
// Job: Core(Runtime=Core)
OverheadJitting 1: 1 op, 313300.00 ns, 313.3000 us/op
WorkloadJitting 1: 1 op, 15627659000.00 ns, 15.6277 s/op
WorkloadWarmup 1: 1 op, 15618290800.00 ns, 15.6183 s/op
WorkloadWarmup 2: 1 op, 15615060100.00 ns, 15.6151 s/op
WorkloadWarmup 3: 1 op, 15640535400.00 ns, 15.6405 s/op
WorkloadWarmup 4: 1 op, 15627643200.00 ns, 15.6276 s/op
WorkloadWarmup 5: 1 op, 15618477200.00 ns, 15.6185 s/op
WorkloadWarmup 6: 1 op, 15630480200.00 ns, 15.6305 s/op
WorkloadWarmup 7: 1 op, 15618496000.00 ns, 15.6185 s/op
// BeforeActualRun
WorkloadActual 1: 1 op, 15643436500.00 ns, 15.6434 s/op
WorkloadActual 2: 1 op, 15633023800.00 ns, 15.6330 s/op
WorkloadActual 3: 1 op, 15622361000.00 ns, 15.6224 s/op
WorkloadActual 4: 1 op, 15624673600.00 ns, 15.6247 s/op
WorkloadActual 5: 1 op, 15622833000.00 ns, 15.6228 s/op
WorkloadActual 6: 1 op, 15631459600.00 ns, 15.6315 s/op
WorkloadActual 7: 1 op, 15637421400.00 ns, 15.6374 s/op
WorkloadActual 8: 1 op, 15623196600.00 ns, 15.6232 s/op
WorkloadActual 9: 1 op, 15640573100.00 ns, 15.6406 s/op
WorkloadActual 10: 1 op, 15621312000.00 ns, 15.6213 s/op
WorkloadActual 11: 1 op, 15633047100.00 ns, 15.6330 s/op
WorkloadActual 12: 1 op, 15624742400.00 ns, 15.6247 s/op
WorkloadActual 13: 1 op, 15626075700.00 ns, 15.6261 s/op
WorkloadActual 14: 1 op, 15622062500.00 ns, 15.6221 s/op
WorkloadActual 15: 1 op, 15627008400.00 ns, 15.6270 s/op
// AfterActualRun
WorkloadResult 1: 1 op, 15643436500.00 ns, 15.6434 s/op
WorkloadResult 2: 1 op, 15633023800.00 ns, 15.6330 s/op
WorkloadResult 3: 1 op, 15622361000.00 ns, 15.6224 s/op
WorkloadResult 4: 1 op, 15624673600.00 ns, 15.6247 s/op
WorkloadResult 5: 1 op, 15622833000.00 ns, 15.6228 s/op
WorkloadResult 6: 1 op, 15631459600.00 ns, 15.6315 s/op
WorkloadResult 7: 1 op, 15637421400.00 ns, 15.6374 s/op
WorkloadResult 8: 1 op, 15623196600.00 ns, 15.6232 s/op
WorkloadResult 9: 1 op, 15640573100.00 ns, 15.6406 s/op
WorkloadResult 10: 1 op, 15621312000.00 ns, 15.6213 s/op
WorkloadResult 11: 1 op, 15633047100.00 ns, 15.6330 s/op
WorkloadResult 12: 1 op, 15624742400.00 ns, 15.6247 s/op
WorkloadResult 13: 1 op, 15626075700.00 ns, 15.6261 s/op
WorkloadResult 14: 1 op, 15622062500.00 ns, 15.6221 s/op
WorkloadResult 15: 1 op, 15627008400.00 ns, 15.6270 s/op
GC: 0 0 0 0 0
// AfterAll
Mean = 15.6289 s, StdErr = 0.0019 s (0.01%); N = 15, StdDev = 0.0072 s
Min = 15.6213 s, Q1 = 15.6228 s, Median = 15.6261 s, Q3 = 15.6330 s, Max = 15.6434 s
IQR = 0.0102 s, LowerFence = 15.6075 s, UpperFence = 15.6484 s
ConfidenceInterval = [15.6212 s; 15.6366 s] (CI 99.9%), Margin = 0.0077 s (0.05% of Mean)
Skewness = 0.69, Kurtosis = 1.99, MValue = 2
Successfully reverted power plan (GUID: 8c5e7fda-e8bf-4a96-9a85-a6e23a8c635c FriendlyName: High performance)
// ***** BenchmarkRunner: Finish *****
// * Export *
BenchmarkDotNet.Artifacts\results\ConsoleApp.Program.Paralleling-report.csv
BenchmarkDotNet.Artifacts\results\ConsoleApp.Program.Paralleling-report-github.md
BenchmarkDotNet.Artifacts\results\ConsoleApp.Program.Paralleling-report.html
BenchmarkDotNet.Artifacts\results\ConsoleApp.Program.Paralleling-measurements.csv
BuildPlots.R
RPlotExporter couldn't find Rscript.exe in your PATH and no R_HOME environment variable is defined
// * Detailed results *
Paralleling.ParallelAndSequential: Core(Runtime=Core)
Runtime = .NET Core 2.2.3 (CoreCLR 4.6.27414.05, CoreFX 4.6.27414.05), 64bit RyuJIT; GC = Concurrent Workstation
Mean = 15.6255 s, StdErr = 0.0022 s (0.01%); N = 14, StdDev = 0.0081 s
Min = 15.6146 s, Q1 = 15.6208 s, Median = 15.6239 s, Q3 = 15.6304 s, Max = 15.6417 s
IQR = 0.0097 s, LowerFence = 15.6062 s, UpperFence = 15.6450 s
ConfidenceInterval = [15.6164 s; 15.6346 s] (CI 99.9%), Margin = 0.0091 s (0.06% of Mean)
Skewness = 0.66, Kurtosis = 2.36, MValue = 2
-------------------- Histogram --------------------
[15.612 s ; 15.645 s) | @@@@@@@@@@@@@@
---------------------------------------------------
Paralleling.AllSequential: Core(Runtime=Core)
Runtime = .NET Core 2.2.3 (CoreCLR 4.6.27414.05, CoreFX 4.6.27414.05), 64bit RyuJIT; GC = Concurrent Workstation
Mean = 15.6289 s, StdErr = 0.0019 s (0.01%); N = 15, StdDev = 0.0072 s
Min = 15.6213 s, Q1 = 15.6228 s, Median = 15.6261 s, Q3 = 15.6330 s, Max = 15.6434 s
IQR = 0.0102 s, LowerFence = 15.6075 s, UpperFence = 15.6484 s
ConfidenceInterval = [15.6212 s; 15.6366 s] (CI 99.9%), Margin = 0.0077 s (0.05% of Mean)
Skewness = 0.69, Kurtosis = 1.99, MValue = 2
-------------------- Histogram --------------------
[15.619 s ; 15.646 s) | @@@@@@@@@@@@@@@
---------------------------------------------------
// * Summary *
BenchmarkDotNet=v0.11.5, OS=Windows 10.0.17134.407 (1803/April2018Update/Redstone4)
Intel Core i7-7820HQ CPU 2.90GHz (Kaby Lake), 1 CPU, 8 logical and 4 physical cores
.NET Core SDK=2.2.202
[Host] : .NET Core 2.2.3 (CoreCLR 4.6.27414.05, CoreFX 4.6.27414.05), 64bit RyuJIT
Core : .NET Core 2.2.3 (CoreCLR 4.6.27414.05, CoreFX 4.6.27414.05), 64bit RyuJIT
Job=Core Runtime=Core
| Method | Mean | Error | StdDev | Rank |
|---------------------- |--------:|---------:|---------:|-----:|
| ParallelAndSequential | 15.63 s | 0.0091 s | 0.0081 s | 1 |
| AllSequential | 15.63 s | 0.0077 s | 0.0072 s | 1 |
// * Hints *
Outliers
Paralleling.ParallelAndSequential: Core -> 1 outlier was removed (15.66 s)
// * Legends *
Mean : Arithmetic mean of all measurements
Error : Half of 99.9% confidence interval
StdDev : Standard deviation of all measurements
Rank : Relative position of current benchmark mean among all benchmarks (Arabic style)
1 s : 1 Second (1 sec)
// ***** BenchmarkRunner: End *****
// ** Remained 0 benchmark(s) to run **
Run time: 00:11:44 (704.57 sec), executed benchmarks: 2
Global total time: 00:11:50 (710.04 sec), executed benchmarks: 2
// * Artifacts cleanup *
Process finished with exit code 0.
我真的不明白为什么Parallel.ForEach
并不比整个顺序策略快,是因为BlockingCollection
是真正的瓶颈吗?
还有另一种策略可以更好地利用Parallel.ForEach
吗?
答案 0 :(得分:2)
您的计时结果表明,写入内存是主要的耗时部分。事实上,我的处理器在运行期间的利用率几乎没有达到20%。当您注释掉stream.WriteLine
时,您会发现这些任务过于简单,无法从并行化中获益:
| Method | Mean | Error | StdDev | Rank |
|---------------------- |----------:|-----------:|---------:|-----:|
| ParallelAndSequential | 564.50 us | 1,759.4 us | 96.44 us | 2 |
| AllSequential | 54.31 us | 400.0 us | 21.92 us | 1 |
一个好的做法是:当您觉得程序运行缓慢时,甚至在编写基准测试之前,就先对它做性能分析(profiling)。分析结果会显示哪个部分/哪一行代码是瓶颈。
答案 1 :(得分:1)
我决定采纳建议改用Dataflow,并重构了示例代码,使其更侧重于CPU密集型的部分(基本上就是用老办法Thread.Sleep
来模拟)。
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using System.Threading.Tasks.Dataflow;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
namespace ConsoleApp
{
/// <summary>Extension helpers for <see cref="IEnumerable{T}"/>.</summary>
public static class EnumerableExtensions
{
    /// <summary>
    /// Invokes <paramref name="action"/> once for every element of
    /// <paramref name="source"/>, in enumeration order.
    /// </summary>
    /// <typeparam name="TSource">Element type of the sequence.</typeparam>
    /// <param name="source">Sequence to enumerate.</param>
    /// <param name="action">
    /// Synchronous callback run for each element. An <c>async</c> lambda passed here
    /// becomes <c>async void</c>: it is not awaited and its exceptions cannot be caught
    /// by the caller.
    /// </param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="source"/> or <paramref name="action"/> is <c>null</c>.
    /// </exception>
    public static void ForEach<TSource>(this IEnumerable<TSource> source, Action<TSource> action)
    {
        // Fail fast with a descriptive exception instead of a NullReferenceException
        // raised from inside the loop (or not at all for an empty sequence).
        if (source == null)
        {
            throw new ArgumentNullException(nameof(source));
        }

        if (action == null)
        {
            throw new ArgumentNullException(nameof(action));
        }

        foreach (var item in source)
        {
            action(item);
        }
    }
}
/// <summary>
/// Benchmark host comparing three ways of running a slow per-item computation followed
/// by a sequential write: TPL Dataflow, <c>Parallel.ForEach</c> feeding a
/// <c>BlockingCollection</c>, and a purely sequential loop.
/// </summary>
public static class Program
{
    /// <summary>BenchmarkDotNet benchmark class holding the three variants.</summary>
    [ClrJob]
    [RPlotExporter, RankColumn]
    public class Paralleling
    {
        private IEnumerable<int> _items;

        /// <summary>Creates the 5000-item input sequence shared by all benchmarks.</summary>
        [GlobalSetup]
        public void Setup()
        {
            _items = Enumerable.Range(0, 5000);
        }

        /// <summary>
        /// Stand-in for an expensive computation: blocks the calling thread for 10 ms,
        /// then reports whether <paramref name="number"/> is even.
        /// </summary>
        /// <param name="number">Value to test.</param>
        /// <returns><c>true</c> when <paramref name="number"/> is even.</returns>
        private bool GetMeALongCpuBool(int number)
        {
            // Deliberate blocking sleep: it simulates work that occupies the thread.
            Thread.Sleep(10);
            return number % 2 == 0;
        }

        /// <summary>
        /// Dataflow pipeline: a parallel <c>TransformBlock</c> computes the results and a
        /// linked <c>ActionBlock</c> writes them to the stream.
        /// </summary>
        /// <returns>A task that completes when the writer block has finished.</returns>
        [Benchmark]
        public async Task DataFlow()
        {
            var transformBlock = new TransformBlock<int, (bool, int, int)>(item =>
            {
                var dummy = GetMeALongCpuBool(item);
                return (dummy, item, Thread.CurrentThread.ManagedThreadId);
            }, new ExecutionDataflowBlockOptions
            {
                MaxDegreeOfParallelism = Environment.ProcessorCount
            });

            using (var streamWriter = new StreamWriter(new MemoryStream()))
            {
                var actionBlock = new ActionBlock<(bool, int, int)>(async result =>
                    await streamWriter.WriteLineAsync(result.ToString()));

                // LinkTo takes DataflowLinkOptions; PropagateCompletion is defined there,
                // not on ExecutionDataflowBlockOptions.
                transformBlock.LinkTo(actionBlock, new DataflowLinkOptions
                {
                    PropagateCompletion = true
                });

                // Await every send before completing the block. Passing an async lambda
                // to an Action<T>-based ForEach would make it async void (fire-and-forget).
                foreach (var item in _items)
                {
                    await transformBlock.SendAsync(item);
                }

                transformBlock.Complete();
                await actionBlock.Completion;
            }
        }

        /// <summary>
        /// Runs the computation in parallel on a background task and hands the results,
        /// through a <c>BlockingCollection</c>, to a single sequential writer.
        /// </summary>
        /// <returns>A task that completes when every result has been written.</returns>
        [Benchmark]
        public async Task ParallelAndSequential()
        {
            using (var blockingCollection = new BlockingCollection<(bool, int, int)>())
            {
                // Keep the producer task so that its failures are observed below.
                var producer = Task.Run(() =>
                {
                    try
                    {
                        Parallel.ForEach(_items, item =>
                        {
                            var dummy = GetMeALongCpuBool(item);
                            blockingCollection.Add((dummy, item, Thread.CurrentThread.ManagedThreadId));
                        });
                    }
                    finally
                    {
                        // Always signal completion; otherwise a failing producer would
                        // leave the consumer blocked in GetConsumingEnumerable forever.
                        blockingCollection.CompleteAdding();
                    }
                });

                try
                {
                    using (var streamWriter = new StreamWriter(new MemoryStream()))
                    {
                        foreach (var result in blockingCollection.GetConsumingEnumerable())
                        {
                            await streamWriter.WriteLineAsync(result.ToString());
                        }
                    }
                }
                finally
                {
                    // Wait for the producer before the collection is disposed and
                    // surface any exception it threw.
                    await producer;
                }
            }
        }

        /// <summary>
        /// Baseline: computes and writes every item one after another.
        /// </summary>
        /// <returns>A task that completes when every result has been written.</returns>
        [Benchmark]
        public async Task AllSequential()
        {
            using (var streamWriter = new StreamWriter(new MemoryStream()))
            {
                foreach (var item in _items)
                {
                    var dummy = GetMeALongCpuBool(item);
                    var result = (dummy, item, Thread.CurrentThread.ManagedThreadId);
                    await streamWriter.WriteLineAsync(result.ToString());
                }
            }
        }
    }

    /// <summary>Entry point: runs all benchmarks declared on <see cref="Paralleling"/>.</summary>
    /// <param name="args">Command-line arguments (unused).</param>
    public static void Main(params string[] args)
    {
        BenchmarkRunner.Run<Paralleling>();
    }
}
}
出于某些原因,这次似乎是Parallel.ForEach
最快,Dataflow紧随其后(也许是其幕后的actor系统增加了一些开销,但它与async
/await
的集成更好);Parallel.ForEach似乎比仅使用实现相对简单的BlockingCollection.GetConsumingEnumerable
更为激进。
在这两种情况下(Parallel.ForEach
和Dataflow),它们仍然比普通的旧式顺序方法快得多,正如我最初所期望的那样。
新的基准测试结果如下:
(此处原有的基准测试结果表在本文中缺失。)
答案 2 :(得分:1)
这里发生了很多事情。
BlockingCollection
成为了争用点。BlockingCollection
没有容量上限,加之CPU密集型任务的工作量很小,这意味着几乎所有输出都先缓冲在BlockingCollection
中,然后才被IO密集型任务消费,导致大量内存分配。await未使用ConfigureAwait(false)
进行配置,从而导致多次上下文捕获和恢复。Parallel.ForEach
未配置MaxDegreeOfParallelism
选项,这加剧了工作负载的不平衡:该计算机的所有处理器都被用于执行CPU密集型任务,没有一个处理器空闲下来供唯一的IO密集型任务使用。对配置进行一些更改之后,我得到了ParallelAndSequential
方法胜过AllSequential
方法的结果,比率约为2/5。
// NOTE(review): the lines below appear to be isolated excerpts of the changes made to
// the question's benchmark, not a contiguous, compilable program.

// Input reduced to 100 items (the question's Setup used 1000).
_items = Enumerable.Range(0, 100);
// Heavier per-item computation than the question's Ackermann(0, 2).
Ackermann(m: 3, n: 6);
// The collection is created with a bounded capacity of 10 instead of being unbounded.
var blockingCollection = new BlockingCollection<(bool, int, int)>(boundedCapacity: 10);
// The write is awaited with ConfigureAwait(false).
await streamWriter.WriteLineAsync(result.ToString()).ConfigureAwait(false);
// The artificial 10 ms delay per written item is disabled.
//await Task.Delay(10);
// Degree of parallelism set to one less than the processor count.
// NOTE(review): presumably passed to Parallel.ForEach — the call site is not shown.
new ParallelOptions() {MaxDegreeOfParallelism = Environment.ProcessorCount - 1};