Question

我有一个记录类型和一个由该记录类型组成的动态数组。我将它传递给mergesort例程并尝试设置其中一个字段属性，其布尔值为true但似乎没有生效。

我研究了通过其他方式对记录数组进行排序（请参阅此快速排序以获取customrecord数组：http://en.wikibooks.org/wiki/Algorithm_Implementation/Sorting/Quicksort#Delphi）或此处：Best way to sort an array（我无法从这里获得这些建议，主要是因为创建一个共同的功能）。这个问题：Sorting of Arrays Alphabetically?很有帮助并且有效，但这种排序速度极慢。

CODE：

type    
       TCustomRecord = Record
        fLine     : AnsiString; //full line
        fsubLine     : AnsiString; // part of full line
        isDuplicate : boolean;  //is that subline duplicate in another line
        isRefrence     : boolean; // is this line from a refrence file or the one being deduped
        fIndex    : Cardinal; // original order line was loaded
       end;
      TCustomRecordArray = array of TCustomRecord; 

function Merge2(var Vals: array of TCustomRecord ):Integer;
var
  AVals: array of TCustomRecord;

   //returns index of the last valid element
  function Merge(I0, I1, J0, J1: Integer):Integer;
  var
    i, j, k, LC:Integer;
  begin
    LC := I1 - I0;
    for i := 0 to LC do
      AVals[i]:=Vals[i + I0];
      //copy lower half or Vals into temporary array AVals

    k := I0;
    i := 0;
    j := J0;
    while ((i <= LC) and (j <= J1)) do
    if (AVals[i].fsubLine < Vals[j].fsubLine) then
    begin
      Vals[k] := AVals[i];
      if Vals[k].isRefrence = False then
        Vals[k].isDuplicate := False;
      inc(i);
      inc(k);
    end
    else if (AVals[i].fsubLine > Vals[j].fsubLine) then
    begin
      Vals[k]:=Vals[j];
      if Vals[k].isRefrence = False then
        Vals[k].isDuplicate := False;
      inc(k);
      inc(j);
    end else
    begin //duplicate
      Vals[k] := AVals[i];
      if Vals[k].isRefrence = False then
        Vals[k].isDuplicate := True;
      inc(i);
      inc(j);
      inc(k);
    end;

    //copy the rest
    while i <= LC do begin
      Vals[k] := AVals[i];
      inc(i);
      inc(k);
    end;

    if k <> j then
      while j <= J1 do begin
        Vals[k]:=Vals[j];
        inc(k);
        inc(j);
      end;

    Result := k - 1;
  end;

 //returns index of the last valid element

  function PerformMergeSort(ALo, AHi:Integer): Integer; //returns
  var
    AMid, I1, J1:Integer;
  begin

  //It would be wise to use Insertion Sort when (AHi - ALo) is small (about 32-100)
    if (ALo < AHi) then
    begin
      AMid:=(ALo + AHi) shr 1;
      I1 := PerformMergeSort(ALo, AMid);
      J1 := PerformMergeSort(AMid + 1, AHi);
      Result := Merge(ALo, I1, AMid + 1, J1);
    end else
      Result := ALo;
  end;

begin
  //SetLength(AVals, Length(Vals) + 1 div 2);
  SetLength(AVals, Length(Vals) div 2 + 1);
  Result := 1 + PerformMergeSort(0, High(Vals));
end;

问题：我如何有效排序，最好使用mergesort，这个记录数组并根据那种排序设置一些属性？谢谢。

更新：我添加了一个指针类型，并在指针数组上做了一个修改过的mergesort。事实证明这是对记录数组进行排序的非常快速的方法。我还添加了一个比较例程，它添加了我需要的标志。我无法做的唯一部分是根据它们属于文件A或参考文件为重复项添加标记。

CODE：

    type    
          PCustomRecord = ^TCustomRecord; 
          TCustomRecord = Record
            fLine     : AnsiString; //full line
            fsubLine  : AnsiString; // part of full line
            isDuplicate : boolean;  //is that subline duplicate in another line
            isRefrence     : boolean; // line from a refrence file or the one being deduped
            isUnique  : boolean; //flag to set if not refrence and not dupe
            fIndex    : Cardinal; // original order line was loaded
           end;
          TCustomRecordArray = array of TCustomRecord;
          PCustomRecordList = ^TCustomRecordArray;

//set up actual array
//set up pointer array to point at actual array
//sort by mergesort first
// then call compare function - this can be a procedure obviously

function Compare(var PRecords: array of PCustomRecord; iLength: int64): Integer;
var
  i : Integer;
begin
  for i := 0 to High(PRecords) do
  begin
    Result := AnsiCompareStr(PRecords[i]^.fsubline, PRecords[i+1]^.fsubline);
    if Result=0 then
    begin
      if (PRecords[i].isrefrence = False) then
        PRecords[i].isduplicate := True
      else if (PRecords[i+1].isrefrence = False) then
        PRecords[i+1].isduplicate := True;
    end;
  end;
end; 

procedure MergeSort(var Vals:array of PCustomRecord;ACount:Integer);
var AVals:array of PCustomRecord;

  procedure Merge(ALo,AMid,AHi:Integer);
  var i,j,k,m:Integer;
  begin
    i:=0;
    for j:=ALo to AMid do
    begin
      AVals[i]:=Vals[j];
      inc(i);
      //copy lower half or Vals into temporary array AVals
    end;

    i:=0;j:=AMid + 1;k:=ALo;//j could be undefined after the for loop!
    while ((k < j) and (j <= AHi)) do
    if (AVals[i].fsubline) <= (Vals[j].fsubline) then
    begin
      Vals[k]:=AVals[i];
      inc(i);inc(k);
    end
    else if (AVals[i].fsubline) > (Vals[j].fsubline) then
    begin
      Vals[k]:=Vals[j];
      inc(k);inc(j);
    end;

    {locate next greatest value in Vals or AVals and copy it to the
     right position.}

    for m:=k to j - 1 do
    begin
      Vals[m]:=AVals[i];
      inc(i);
    end;
    //copy back any remaining, unsorted, elements
  end;

  procedure PerformMergeSort(ALo,AHi:Integer);
  var AMid:Integer;
  begin
    if (ALo < AHi) then
    begin
      AMid:=(ALo + AHi) shr 1;
      PerformMergeSort(ALo,AMid);
      PerformMergeSort(AMid + 1,AHi);
      Merge(ALo,AMid,AHi);
    end;
  end;

begin
  SetLength(AVals, ACount div 2 + 1);
  PerformMergeSort(0,ACount - 1);
end;

对于不到一秒的小文件，这一切都非常快。对数组中带有重复标记而不是引用标志的项进行重新配置是非常具有挑战性的。由于mergesort是一个稳定的类型，我试图通过布尔标志求助但没有达到我的预期。我使用TStringlist来查看我之前的标志是否正确设置并且完美运行。时间从1秒增加到6秒。我知道必须有一种简单的方法来标记isUnique标志，不用 TStringlist。

以下是我的尝试：

function DeDupe(var PRecords: array of PCustomRecord; iLength: int64): Integer;
var
  i : Integer;
begin
  for i := 0 to High(PRecords) do
  begin
    if (PRecords[i]^.isrefrence = False) and (PRecords[i+1]^.isrefrence = false)then
    begin
      Result := AnsiCompareStr(PRecords[i]^.isduplicate, PRecords[i+1]^.isduplicate);
      if Result = 0 then PRecords[i]^.isUnique := True;
    end
    else
    begin
      Continue;
    end;
  end;
end;

这并没有得到所有的值，我没有看到它的差别，因为我仍然看到很多重复。我认为逻辑是错误的。

感谢所有伟大的灵魂帮助。对于所有人，请允许我获得我可能已经知道如何派生TObject以及如何使用TStringList的好处，以便将重点放在数组上。

问题：帮我做上面的功能或程序，用以下标记重复项目： isRefrence = false且isDuplicate = True且唯一

编辑3：我能够通过使用布尔标志来消除重复。这有助于保持阵列稳定而不改变阵列的大小。我相信它比使用TList后代或TStringList要快得多。使用诸如阵列之类的基本容器在编码的容易性方面存在局限性，但是非常有效，所以我不会传递它。指针使分拣变得轻而易举。当我使用指针数组完全像我使用我的常规数组时，我不确定如何将指针设置到我的数组之后。无论我是否退缩，它都没有任何区别。我这样设置了指针数组：

  iLength := Length(Custom_array); //get length of actual array
  SetLength(pcustomRecords, iLength); // make pointer array equal + 1

  for M := Low(Custom_array) to High(Custom_array) do //set up pointers
  begin
    pcustomRecords[M] := @Custom_array[M]; 
  end;

我尝试从尽可能多的实际数据中分类排序，但我确信可以有所改进。

///////////////////////////////////////////////////////////////////
function Comparesubstring(Item1, Item2: PCustomRecord): Integer;
begin
  Result := AnsiCompareStr(item1^.fsubline, item2^.fsubline);
end;
///////////////////////////////////////////////////////////////////
function CompareLine(Item1, Item2: PCustomRecord): Integer;
begin
  Result := AnsiCompareStr(item1^.fLine, item2^.fLine);
end;
///////////////////////////////////////////////////////////////////
function Compare(var PRecords: array of PCustomRecord; iLength: int64): Integer;
var
  M, i : Integer;
begin
  M := Length(PRecords);
  for i := 1 to M-1 do
  begin
    Result := Comparesubstring(PRecords[i-1], PRecords[i]);
    if Result=0 then
    begin
      if (PRecords[i-1].isRefrence = False) then
        PRecords[i-1].isduplicate := True
      else if (PRecords[i].isRefrence = False) then
        PRecords[i].isduplicate := True;
    end;
  end;
end;
///////////////////////////////////////////////////////////////////

Answer 1

1）不要复制数据！使用指针。您应该创建指向这些数据记录的指针的列表/数组，并改为排序指针。排序完成后 - 只需基于指针数组创建新的数据数组。指针移动是单CPU命令。 SizeOf（您的记录）是＆gt;＆gt; SizeOf（指针），移动时速度慢很多。

2）Mergesort晃动巨大的数据量，不适合记忆。如果您有10千兆字节的数据，则无法在Win32程序允许的2GB内存中对它们进行排序。因此，您必须在磁盘上对它们进行排序。这是Mergesort的利基。如果所有数据都在内存中，为什么不使用现成的QuickSort例程呢？

所以创建一个TList，用type PCustomRecord = ^TCustomRecord;指针填充它，实现正确的比较函数，并通过TList.Sort方法调用checked quicksort。

http://docwiki.embarcadero.com/CodeExamples/XE2/en/TListSort_(Delphi)

列表排序后 - 创建并填充新的数据数组。在创建新数组之后 - 释放列表并删除旧的源数组。

如果可能 - 检查数据是否适合内存。如果内存不足，则仅驻留在磁盘上搜索。它会变慢，慢得多。

我在学校做过...... Mergesort不是递归的。这是非常基本的循环。我实现它是因为它简单。我仍然没有QuickSort的直觉，与之比较。

在伪代码中，它看起来像

FrameSize := 1;
Loop start:
  Phase 1: splitting
     Loop until not empty TempMergedDataFile:
        Read record by record from TempMergedDataFile 
            and write each of them into TempSplitDataFile-1
            up to FrameSize times
        Read record by record from TempMergedDataFile 
            and write each of them into TempSplitDataFile-2
            up to FrameSize times
     Loop end
     Delete TempMergedDataFile 
  Phase 2: sorting-merging
     Loop until not empty TempSplitDataFile-1 and TempSplitDataFile-2:
        Read record by record from both TempSplitDataFile-1 and TempSplitDataFile-2
          up to FrameSize each (2xFrameSize in total in each iteration)
          write them sorted into TempMergedDataFile
     end loop
     delete TempSplitDataFile-1 and TempSplitDataFile-2
  Phase 3: update expectations
     FrameSize := FrameSize * 2
     if FrameSize > actual number of records - then exit loop, sort complete
End loop

小心第2阶段的实施。如果一个文件超过了帧，则与实际值或nil进行比较。好吧，这个想法很明显，可能已经在某个地方进行了演示。在这部分只是迂腐。可能FSM实施可能很容易。

显而易见的优化：

将所有文件放在不同的物理专用硬盘上，因此每个硬盘将处于线性读/写模式
合并阶段1和阶段2：使TempMergedDataFile成为虚拟，实际上由TempSplitDataFile-3和TempSplitDataFile-4组成。在写入数据时将数据拆分为下一个大小的帧。
如果使用SSD或闪存卡进行存储，则数据复制会耗尽硬件。最好为实际排序排序某种“指针”或“索引”。还有一个小机会，虽然完整的数据帧超过RAM，但仅仅是“索引数组”适合。但是对于没有测试的实际硬盘驱动器，我最好坚持使用天真的“复制和复制并再次复制”方法。 / LI>

Answer 2

要做的第一个评论是你的基本设计非常薄弱。您已将排序代码和比较/交换代码混合在一起。如果您需要对不同的数据进行排序，那么您将不得不重新开始。您需要将排序代码与理解数据的代码分离。

实现这种解耦的方法是实现一个对数据一无所知的泛型排序例程。相反，它需要知道的是如何比较两个元素，以及如何交换两个元素。所有常见的内存中排序例程都可以通过这种方式有效地实现。

我猜你遇到的另一个问题是，你的代码会花费大量时间来复制数据。而不是这样做，使用一层间接。我的意思是你不应该尝试修改原始数组。而是在数据数组中创建一个索引数组，并对索引数组而不是数据数组进行排序。

为了让您了解这一点，以下是它的外观：

var
  Data: array of TData;
  Indices: array of Integer;

function CompareIndices(Index1, Index2: Integer): Integer;
begin
  Result := CompareData(Data[Indices[Index1]], Data[Indices[Index2]]);
end;

procedure SwapIndices(Index1, Index2: Integer);
var
  Temp: Integer;
begin
  Temp := Indices[Index1];
  Indices[Index1] := Indices[Index2];
  Indices[Index2] := Temp;
end;

然后，为了对数组进行排序，你可以这样做：

N := Length(Data);
SetLength(Indices, N);
for i := 0 to high(Indices) do 
  Indices[i] := i;
Sort(CompareIndices, SwapIndices, N);

或者，作为另一种替代方法，使用指向数据数组元素的指针数组而不是索引数组。

现在，为了清晰起见，我在这里使用了全局变量。实际上，您可能希望将其包装到类中，或者至少使比较和交换函数成为对象的方法。这就是我在Delphi 6代码库中的做法。界面看起来像这样：

type
  TCompareIndicesFunction = function(Index1, Index2: Integer): Integer of object;
  TExchangeIndicesProcedure = procedure(Index1, Index2: Integer) of object;

procedure QuickSort(Compare: TCompareIndicesFunction; 
  Exchange: TExchangeIndicesProcedure; const Count: Integer);

一旦你掌握了将排序算法与数据分开的概念，你就会取得一些进展。然后将一个排序算法替换为另一个排序算法变得微不足道。你可以轻松地比较它们。您可以轻松衡量间接方法是否值得。等等。

所以，对我而言，我绝对的第一条忠告是抛弃问题中的代码，并按照自然意图将数据处理中的分类分开。

Answer 3

我没有直接回答这个问题，但是如果数据适合内存 - 就像你使用数组一样。

我会抛弃所有这些，创建一些对象，将它们放在TObjectList中。排序使用TObjectList.Sort（@myComparefunction）进行自己的比较。您可以声明多个排序例程。在Sort函数中，您可以随意设置其他对象属性。这很快，可以挽救你似乎遭受的痛苦：）

将动态数组记录传递给Delphi中的mergesort

3 个答案: