TParallel.For性能

时间:2014-12-17 21:20:00

标签: multithreading delphi parallel-processing delphi-xe7

鉴于以下在一维数组中寻找奇数的简单任务:

begin
  odds := 0;
  Ticks := TThread.GetTickCount;
  for i := 0 to MaxArr-1 do
      if ArrXY[i] mod 2 = 0 then
        Inc(odds);
  Ticks := TThread.GetTickCount - Ticks;
  writeln('Serial: ' + Ticks.ToString + 'ms, odds: ' + odds.ToString);
end;

看起来这将是并行处理的一个很好的候选者。因此可能会想要使用以下TParallel.For版本:

begin
  odds := 0;
  Ticks := TThread.GetTickCount;
  TParallel.For(0,  MaxArr-1, procedure(I:Integer)
  begin
    if ArrXY[i] mod 2 = 0 then
      inc(odds);
  end);
  Ticks := TThread.GetTickCount - Ticks;
  writeln('Parallel - false odds: ' + Ticks.ToString + 'ms, odds: ' + odds.ToString);
end;

这种并行计算的结果在两个方面有点令人惊讶:

  1. 计算的赔率数错误

  2. 执行时间比串行版

  3. 1)可以解释,因为我们没有保护并发访问的odds变量。因此,为了解决这个问题,我们应该使用TInterlocked.Increment(odds);代替。

    2)也可以解释:它表现出false sharing的效果。

    理想情况下,错误共享问题的解决方案是使用局部变量来存储中间结果,并且仅在所有并行任务结束时总结这些中间人。 这是我真正的问题,我无法理解:有没有办法让局部变量进入我的匿名方法?注意,简单地在匿名方法体中声明局部变量是行不通的,因为每次迭代都会调用匿名方法体。如果这在某种程度上是可行的,那么有没有办法在每个任务迭代结束时从匿名方法中得到我的中间结果?

    编辑:我实际上对计算赔率或者埃文斯并不感兴趣。我只用它来证明效果。

    为了完整起见,这里有一个控制台应用程序,展示了这些效果:

    program Project4;
    
    {$APPTYPE CONSOLE}
    
    {$R *.res}
    
    uses
      System.SysUtils, System.Threading, System.Classes, System.SyncObjs;
    
    const
      MaxArr = 100000000;
    
    var
      Ticks: Cardinal;
      i: Integer;
      odds: Integer;
      ArrXY: array of Integer;
    
    procedure FillArray;
    var
      i: Integer;
      j: Integer;
    begin
      SetLength(ArrXY, MaxArr);
      for i := 0 to MaxArr-1 do
          ArrXY[i]:=Random(MaxInt);
    end;
    
    procedure Parallel;
    begin
      odds := 0;
      Ticks := TThread.GetTickCount;
      TParallel.For(0,  MaxArr-1, procedure(I:Integer)
      begin
        if ArrXY[i] mod 2 = 0 then
          TInterlocked.Increment(odds);
      end);
      Ticks := TThread.GetTickCount - Ticks;
      writeln('Parallel: ' + Ticks.ToString + 'ms, odds: ' + odds.ToString);
    end;
    
    procedure ParallelFalseResult;
    begin
      odds := 0;
      Ticks := TThread.GetTickCount;
      TParallel.For(0,  MaxArr-1, procedure(I:Integer)
      begin
        if ArrXY[i] mod 2 = 0 then
          inc(odds);
      end);
      Ticks := TThread.GetTickCount - Ticks;
      writeln('Parallel - false odds: ' + Ticks.ToString + 'ms, odds: ' + odds.ToString);
    end;
    
    procedure Serial;
    begin
      odds := 0;
      Ticks := TThread.GetTickCount;
      for i := 0 to MaxArr-1 do
          if ArrXY[i] mod 2 = 0 then
            Inc(odds);
      Ticks := TThread.GetTickCount - Ticks;
      writeln('Serial: ' + Ticks.ToString + 'ms, odds: ' + odds.ToString);
    end;
    
    begin
      try
        FillArray;
        Serial;
        ParallelFalseResult;
        Parallel;
      except
        on E: Exception do
          Writeln(E.ClassName, ': ', E.Message);
      end;
      Readln;
    end.
    

5 个答案:

答案 0 :(得分:11)

此问题的关键是尽可能少地正确分区和共享。

使用此代码,它的运行速度几乎是串行代码的4倍。

const 
  WorkerCount = 4;

function GetWorker(index: Integer; const oddsArr: TArray<Integer>): TProc;
var
  min, max: Integer;
begin
  min := MaxArr div WorkerCount * index;
  if index + 1 < WorkerCount then
    max := MaxArr div WorkerCount * (index + 1) - 1
  else
    max := MaxArr - 1;
  Result :=
    procedure
    var
      i: Integer;
      odds: Integer;
    begin
      odds := 0;
      for i := min to max do
        if Odd(ArrXY[i]) then
          Inc(odds);
      oddsArr[index] := odds;
    end;
end;

procedure Parallel;
var
  i: Integer;
  oddsArr: TArray<Integer>;
  workers: TArray<ITask>;
begin
  odds := 0;
  Ticks := TThread.GetTickCount;
  SetLength(oddsArr, WorkerCount);
  SetLength(workers, WorkerCount);

  for i := 0 to WorkerCount-1 do
    workers[i] := TTask.Run(GetWorker(i, oddsArr));
  TTask.WaitForAll(workers);

  for i := 0 to WorkerCount-1 do
    Inc(odds, oddsArr[i]);
  Ticks := TThread.GetTickCount - Ticks;
  writeln('Parallel: ' + Ticks.ToString + 'ms, odds: ' + odds.ToString);
end;

您可以使用TParallel.For编写类似的代码,但它仍然比使用TTask运行速度慢(比串行速度快3倍)。

顺便说一句,我使用该函数返回工作者TProc以使索引捕获正确。如果在同一例程中循环运行它,则捕获循环变量。

更新19.12.2014:

由于我们发现关键的事情是正确的分区,因此可以非常容易地将其放入并行for循环中,而无需将其锁定在特定的数据结构上:

procedure ParallelFor(lowInclusive, highInclusive: Integer;
  const iteratorRangeEvent: TProc<Integer, Integer>);

  procedure CalcPartBounds(low, high, count, index: Integer;
    out min, max: Integer);
  var
    len: Integer;
  begin
    len := high - low + 1;
    min := (len div count) * index;
    if index + 1 < count then
      max := len div count * (index + 1) - 1
    else
      max := len - 1;
  end;

  function GetWorker(const iteratorRangeEvent: TProc<Integer, Integer>;
    min, max: Integer): ITask;
  begin
    Result := TTask.Run(
      procedure
      begin
        iteratorRangeEvent(min, max);
      end)
  end;

var
  workerCount: Integer;
  workers: TArray<ITask>;
  i, min, max: Integer;
begin
  workerCount := TThread.ProcessorCount;
  SetLength(workers, workerCount);
  for i := 0 to workerCount - 1 do
  begin
    CalcPartBounds(lowInclusive, highInclusive, workerCount, i, min, max);
    workers[i] := GetWorker(iteratorRangeEvent, min, max);
  end;
  TTask.WaitForAll(workers);
end;

procedure Parallel4;
begin
  odds := 0;
  Ticks := TThread.GetTickCount;
  ParallelFor(0, MaxArr-1,
    procedure(min, max: Integer)
    var
      i, n: Integer;
    begin
      n := 0;
      for i := min to max do
        if Odd(ArrXY[i]) then
          Inc(n);
      AtomicIncrement(odds, n);
    end);
  Ticks := TThread.GetTickCount - Ticks;
  writeln('ParallelEx: Stefan Glienke ' + Ticks.ToString + ' ms, odds: ' + odds.ToString);
end;

关键是要使用局部变量进行计数,并且最后只使用共享变量一次来添加子总数。

答案 1 :(得分:4)

使用SVN的OmniThreadLibrary(这还没有包含在任何正式版本中),您可以用不需要互锁访问共享计数器的方式编写它。

function CountParallelOTL: integer;
var
  counters: array of integer;
  numCores: integer;
  i: integer;
begin
  numCores := Environment.Process.Affinity.Count;
  SetLength(counters, numCores);
  FillChar(counters[0], Length(counters) * SizeOf(counters[0]), 0);

  Parallel.For(0, MaxArr - 1)
    .NumTasks(numCores)
    .Execute(
      procedure(taskIndex, value: integer)
      begin
        if Odd(ArrXY[value]) then
          Inc(counters[taskIndex]);
      end);

  Result := counters[0];
  for i := 1 to numCores - 1 do
    Inc(Result, counters[i]);
end;

然而,这仍然与顺序循环相提并论,最坏的情况要慢一些。

我将它与Stefan的解决方案(XE7任务)和一个简单的XE7 Parallel.For进行了比较,具有互锁增量(XE7 for)。

笔记本上有4个超线程核心的结果:

  

序列号:在543 ms内找到49999640个奇数元素

     

并行(OTL):在555 ms中找到49999640个奇数元素

     

并行(XE7任务):在136毫秒内找到49999640个奇数元素

     

并行(XE7 for):在1667 ms中找到49999640个奇数元素

我的工作站有12个超线程核心的结果:

  

序列号:在685毫秒内找到50005291个奇数元素

     

并行(OTL):在1309 ms中找到50005291个奇数元素

     

并行(XE7任务):在62毫秒内找到50005291个奇数元素

     

并行(XE7 for):在3379 ms中找到50005291个奇数元素

与System.Threading Paralell相比有了很大的改进。因为没有互锁的增量,但手工制作的解决方案要快得多。

完整的测试程序:

program ParallelCount;

{$APPTYPE CONSOLE}

{$R *.res}

uses
  System.SyncObjs,
  System.Classes,
  System.SysUtils,
  System.Threading,
  DSiWin32,
  OtlCommon,
  OtlParallel;

const
  MaxArr = 100000000;

var
  Ticks: Cardinal;
  i: Integer;
  odds: Integer;
  ArrXY: array of Integer;

procedure FillArray;
var
  i: Integer;
  j: Integer;
begin
  SetLength(ArrXY, MaxArr);
  for i := 0 to MaxArr-1 do
    ArrXY[i]:=Random(MaxInt);
end;

function CountSerial: integer;
var
  odds: integer;
begin
  odds := 0;
  for i := 0 to MaxArr-1 do
      if Odd(ArrXY[i]) then
        Inc(odds);
  Result := odds;
end;

function CountParallelOTL: integer;
var
  counters: array of integer;
  numCores: integer;
  i: integer;
begin
  numCores := Environment.Process.Affinity.Count;
  SetLength(counters, numCores);
  FillChar(counters[0], Length(counters) * SizeOf(counters[0]), 0);

  Parallel.For(0, MaxArr - 1)
    .NumTasks(numCores)
    .Execute(
      procedure(taskIndex, value: integer)
      begin
        if Odd(ArrXY[value]) then
          Inc(counters[taskIndex]);
      end);

  Result := counters[0];
  for i := 1 to numCores - 1 do
    Inc(Result, counters[i]);
end;

function GetWorker(index: Integer; const oddsArr: TArray<Integer>; workerCount: integer): TProc;
var
  min, max: Integer;
begin
  min := MaxArr div workerCount * index;
  if index + 1 < workerCount then
    max := MaxArr div workerCount * (index + 1) - 1
  else
    max := MaxArr - 1;
  Result :=
    procedure
    var
      i: Integer;
      odds: Integer;
    begin
      odds := 0;
      for i := min to max do
        if Odd(ArrXY[i]) then
          Inc(odds);
      oddsArr[index] := odds;
    end;
end;

function CountParallelXE7Tasks: integer;
var
  i: Integer;
  oddsArr: TArray<Integer>;
  workers: TArray<ITask>;
  workerCount: integer;
begin
  workerCount := Environment.Process.Affinity.Count;
  odds := 0;
  Ticks := TThread.GetTickCount;
  SetLength(oddsArr, workerCount);
  SetLength(workers, workerCount);

  for i := 0 to workerCount-1 do
    workers[i] := TTask.Run(GetWorker(i, oddsArr, workerCount));
  TTask.WaitForAll(workers);

  for i := 0 to workerCount-1 do
    Inc(odds, oddsArr[i]);
  Result := odds;
end;

function CountParallelXE7For: integer;
var
  odds: integer;
begin
  odds := 0;
  TParallel.For(0,  MaxArr-1, procedure(I:Integer)
  begin
    if Odd(ArrXY[i]) then
      TInterlocked.Increment(odds);
  end);
  Result := odds;
end;

procedure Count(const name: string; func: TFunc<integer>);
var
  time: int64;
  cnt: integer;
begin
  time := DSiTimeGetTime64;
  cnt := func();
  time := DSiElapsedTime64(time);
  Writeln(name, ': ', cnt, ' odd elements found in ', time, ' ms');
end;

begin
  try
    FillArray;

    Count('Serial', CountSerial);
    Count('Parallel (OTL)', CountParallelOTL);
    Count('Parallel (XE7 tasks)', CountParallelXE7Tasks);
    Count('Parallel (XE7 for)', CountParallelXE7For);

    Readln;
  except
    on E: Exception do
      Writeln(E.ClassName, ': ', E.Message);
  end;
end.

答案 2 :(得分:2)

我认为我们在讨论OmniThreadLibrary之前已经讨论过这个问题。多线程解决方案时间较长的主要原因是TParallel.For的开销与实际计算所需的时间相比。

本地变量在这里没有任何帮助,而全局threadvar可能会解决错误共享问题。唉,在完成循环后,你可能找不到总结所有这些踏板的方法。

IIRC,最好的方法是在合理的部分中完成任务,并为每次迭代处理一系列数组条目,并增加专用于该部分的变量。仅凭这一点就无法解决错误共享问题,即使它们碰巧只是同一个缓存行的一部分,即使使用不同的变量也会出现这种情况。

另一种解决方案可能是编写一个以串行方式处理数组给定切片的类,并行处理该类的多个实例,然后评估结果。

顺便说一句:你的代码不计算赔率 - 它算上了平均值。

并且:有一个名为Odd的内置函数通常比您正在使用的mod代码具有更好的性能。

答案 3 :(得分:2)

好的,在Stefan Glienke的回答的启发下,我起草了一个更可重用的TParalleEx类,而不是ITasks使用IFutures。该类在某种程度上也是在带有聚合委托的C#TPL之后建模的。这只是初稿,但展示了如何相对容易地扩展现有的PPL。这个版本现在可以在我的系统上完美扩展 - 如果其他人可以在不同的配置上测试它,我会很高兴。感谢所有人的丰富回答和评论。

program Project4;

{$APPTYPE CONSOLE}

{$R *.res}

uses
  System.SysUtils, System.Threading, System.Classes, System.SyncObjs;

const
  MaxArr = 100000000;

var
  Ticks: Cardinal;
  i: Integer;
  odds: Integer;
  ArrXY: TArray<Integer>;

type

TParallelEx<TSource, TResult> = class
  private
    class function GetWorker(body: TFunc<TArray<TSource>, Integer, Integer, TResult>; source: TArray<TSource>; min, max: Integer): TFunc<TResult>;
  public
    class procedure &For(source: TArray<TSource>;
                         body: TFunc<TArray<TSource>, Integer, Integer, TResult>;
                         aggregator: TProc<TResult>);
  end;

procedure FillArray;
var
  i: Integer;
  j: Integer;
begin
  SetLength(ArrXY, MaxArr);
  for i := 0 to MaxArr-1 do
      ArrXY[i]:=Random(MaxInt);
end;

procedure Parallel;
begin
  odds := 0;
  Ticks := TThread.GetTickCount;
  TParallel.For(0,  MaxArr-1, procedure(I:Integer)
  begin
    if ArrXY[i] mod 2 <> 0 then
      TInterlocked.Increment(odds);
  end);
  Ticks := TThread.GetTickCount - Ticks;
  writeln('Parallel: ' + Ticks.ToString + 'ms, odds: ' + odds.ToString);
end;

procedure Serial;
begin
  odds := 0;
  Ticks := TThread.GetTickCount;
  for i := 0 to MaxArr-1 do
      if ArrXY[i] mod 2 <> 0 then
        Inc(odds);
  Ticks := TThread.GetTickCount - Ticks;
  writeln('Serial: ' + Ticks.ToString + 'ms, odds: ' + odds.ToString);
end;

const
  WorkerCount = 4;

function GetWorker(index: Integer; const oddsArr: TArray<Integer>): TProc;
var
  min, max: Integer;
begin
  min := MaxArr div WorkerCount * index;
  if index + 1 < WorkerCount then
    max := MaxArr div WorkerCount * (index + 1) - 1
  else
    max := MaxArr - 1;
  Result :=
    procedure
    var
      i: Integer;
      odds: Integer;
    begin
      odds := 0;
      for i := min to max do
        if ArrXY[i] mod 2 <> 0 then
          Inc(odds);
      oddsArr[index] := odds;
    end;
end;

procedure Parallel2;
var
  i: Integer;
  oddsArr: TArray<Integer>;
  workers: TArray<ITask>;
begin
  odds := 0;
  Ticks := TThread.GetTickCount;
  SetLength(oddsArr, WorkerCount);
  SetLength(workers, WorkerCount);

  for i := 0 to WorkerCount-1 do
    workers[i] := TTask.Run(GetWorker(i, oddsArr));
  TTask.WaitForAll(workers);

  for i := 0 to WorkerCount-1 do
    Inc(odds, oddsArr[i]);
  Ticks := TThread.GetTickCount - Ticks;
  writeln('Parallel: Stefan Glienke ' + Ticks.ToString + ' ms, odds: ' + odds.ToString);
end;

procedure parallel3;
var
  sum: Integer;
begin
  Ticks := TThread.GetTickCount;
  TParallelEx<Integer, Integer>.For( ArrXY,
     function(Arr: TArray<Integer>; min, max: Integer): Integer
      var
        i: Integer;
        res: Integer;
      begin
        res := 0;
        for i := min to max do
          if Arr[i] mod 2 <> 0 then
            Inc(res);
        Result := res;
      end,
      procedure(res: Integer) begin sum := sum + res; end );
  Ticks := TThread.GetTickCount - Ticks;
  writeln('ParallelEx: Markus Joos ' + Ticks.ToString + ' ms, odds: ' + odds.ToString);
end;

{ TParallelEx<TSource, TResult> }

class function TParallelEx<TSource, TResult>.GetWorker(body: TFunc<TArray<TSource>, Integer, Integer, TResult>; source: TArray<TSource>; min, max: Integer): TFunc<TResult>;
begin
  Result := function: TResult
  begin
    Result := body(source, min, max);
  end;
end;

class procedure TParallelEx<TSource, TResult>.&For(source: TArray<TSource>;
  body: TFunc<TArray<TSource>, Integer, Integer, TResult>;
  aggregator: TProc<TResult>);
var
  I: Integer;
  workers: TArray<IFuture<TResult>>;
  workerCount: Integer;
  min, max: integer;
  MaxIndex: Integer;
begin
  workerCount := TThread.ProcessorCount;
  SetLength(workers, workerCount);
  MaxIndex := length(source);
  for I := 0 to workerCount -1 do
  begin
    min := (MaxIndex div WorkerCount) * I;
    if I + 1 < WorkerCount then
      max := MaxIndex div WorkerCount * (I + 1) - 1
    else
      max := MaxIndex - 1;
    workers[i]:= TTask.Future<TResult>(GetWorker(body, source, min, max));
  end;
  for i:= 0 to workerCount-1 do
  begin
    aggregator(workers[i].Value);
  end;
end;

begin
  try
    FillArray;
    Serial;
    Parallel;
    Parallel2;
    Parallel3;
  except
    on E: Exception do
      Writeln(E.ClassName, ': ', E.Message);
  end;
  Readln;
end.

答案 4 :(得分:-1)

关于使用局部变量收集总和然后在最后收集它们的任务,您可以为此目的使用单独的数组:

var
  sums: array of Integer;
begin
  SetLength(sums, MaxArr);
  for I := 0 to MaxArr-1 do
    sums[I] := 0;

  Ticks := TThread.GetTickCount;
  TParallel.For(0, MaxArr-1,
    procedure(I:Integer)
    begin
      if ArrXY[i] mod 2 = 0 then
        Inc(sums[I]);
    end
  );
  Ticks := TThread.GetTickCount - Ticks;

  odds := 0;
  for I := 0 to MaxArr-1 do
    Inc(odds, sums[i]);

  writeln('Parallel - false odds: ' + Ticks.ToString + 'ms, odds: ' + odds.ToString);
end;