我有一个整数和字符串字段数组。为了使它不同,我目前逐行复制到新数组中,并且每条记录检查记录是否已经存在于new中,如果不是我复制它。最后,我将新数组复制回原始数据。
它有效,但速度很慢。有没有更快,更简单的方法来做到这一点?
TArrayMixed = record
Field1: integer;
Field2: integer;
Field3: integer;
Field4: string;
Field5: string;
Field6: string;
end;
procedure TForm1.Button10Click(Sender: TObject);
var
ArrayMixed, ArrayMixed_tmp: array of TArrayMixed;
i, j, vIdx: integer;
vExists: boolean;
begin
SetLength(ArrayMixed, 100000);
for i := 0 to 99999 do
begin
ArrayMixed[i].Field1 := 1 + Random(5);
ArrayMixed[i].Field2 := 1 + Random(5);
ArrayMixed[i].Field3 := 1 + Random(5);
ArrayMixed[i].Field4 := 'String';
ArrayMixed[i].Field5 := 'Another string';
ArrayMixed[i].Field6 := 'New string';
end;
// Sort
TArray.Sort<TArrayMixed > (ArrayMixed, TComparer<TArrayMixed > .Construct(function(const Left, Right: TArrayMixed): Integer
begin
Result := MyCompareAMixed(Left, Right);
end
));
// Distinct
SetLength(ArrayMixed_tmp, Length(ArrayMixed));
vIdx := 0;
for i := Low(ArrayMixed) to High(ArrayMixed) do
begin
vExists := False;
for j := Low(ArrayMixed_tmp) to vIdx - 1 do
if (ArrayMixed_tmp[j].Field1 = ArrayMixed[i].Field1) and
(ArrayMixed_tmp[j].Field2 = ArrayMixed[i].Field2) and
(ArrayMixed_tmp[j].Field3 = ArrayMixed[i].Field3) and
(ArrayMixed_tmp[j].Field4 = ArrayMixed[i].Field4) and
(ArrayMixed_tmp[j].Field5 = ArrayMixed[i].Field5) and
(ArrayMixed_tmp[j].Field6 = ArrayMixed[i].Field6) then
begin
vExists := True;
Break;
end;
if not vExists then
begin
ArrayMixed_tmp[vIdx] := ArrayMixed[i];
Inc(vIdx);
end;
end;
SetLength(ArrayMixed_tmp, vIdx);
// now copy back to original array
SetLength(ArrayMixed, 0);
SetLength(ArrayMixed, Length(ArrayMixed_tmp));
for i := Low(ArrayMixed_tmp) to High(ArrayMixed_tmp) do
ArrayMixed[i] := ArrayMixed_tmp[i];
sleep(0);
end;
修改
因为在实际数据中字符串并不完全相同,所以当原始数组像这样填充时,它产生distinct数组的部分会更慢:
编辑#2 :(在编辑#1中复制了错误的代码)
for i := 0 to 999999 do
begin
ArrayMixed[i].Field1 := 1 + Random(5);
ArrayMixed[i].Field2 := 1 + Random(5);
ArrayMixed[i].Field3 := 1 + Random(5);
ArrayMixed[i].Field4 := 'String'+IntToStr(i mod 5);
ArrayMixed[i].Field5 := 'Another string'+IntToStr(i mod 5);
ArrayMixed[i].Field6 := 'New string'+IntToStr( i mod 5);
end;
编辑#3:发布用于排序的代码 - 只排序前3个字段!
TMyArray3 = array[1..3] of Integer;
function CompareIntegerArray3(const lhs, rhs: TMyArray3): Integer;
var
i: Integer;
begin
Assert(Length(lhs) = Length(rhs));
for i := low(lhs) to high(lhs) do
if lhs[i] < rhs[i] then
exit(-1)
else if lhs[i] > rhs[i] then
exit(1);
exit(0);
end;
function GetMyArrayAMixed(const Value: TArrayMixed): TMyArray3;
begin
Result[1] := Value.Field1;
Result[2] := Value.Field2;
Result[3] := Value.Field3;
end;
function MyCompareAMixed(const lhs, rhs: TArrayMixed): Integer;
begin
Result := CompareIntegerArray3(GetMyArrayAMixed(lhs), GetMyArrayAMixed(rhs));
end;
答案 0 :(得分:2)
type
TArrayMixed = record
Field1: integer;
Field2: integer;
Field3: integer;
Field4: string;
Field5: string;
Field6: string;
class operator Equal( const a, b: TArrayMixed ): Boolean;
class function Compare( const L, R: TArrayMixed ): integer; overload; static;
function Compare( const Other: TArrayMixed ): integer; overload;
function GetHashCode( ): integer;
end;
{ TArrayMixed }
class function TArrayMixed.Compare( const L, R: TArrayMixed ): integer;
begin
Result := L.Compare( R );
end;
function TArrayMixed.Compare( const Other: TArrayMixed ): integer;
begin
Result := Field1 - Other.Field1;
if Result = 0
then
begin
Result := Field2 - Other.Field2;
if Result = 0
then
begin
Result := Field3 - Other.Field3;
if Result = 0
then
begin
Result := CompareStr( Field4, Other.Field4 );
if Result = 0
then
begin
Result := CompareStr( Field5, Other.Field5 );
if Result = 0
then
begin
Result := CompareStr( Field6, Other.Field6 );
end;
end;
end;
end;
end;
end;
class operator TArrayMixed.Equal( const a, b: TArrayMixed ): Boolean;
begin
Result := true
{} and ( a.Field1 = b.Field1 )
{} and ( a.Field2 = b.Field2 )
{} and ( a.Field3 = b.Field3 )
{} and ( a.Field4 = b.Field4 )
{} and ( a.Field5 = b.Field5 )
{} and ( a.Field6 = b.Field6 );
end;
function TArrayMixed.GetHashCode: integer;
begin
{$IFOPT Q+}
{$Q-}
{$DEFINE SET_Q_ON}
{$ENDIF}
Result := 17;
Result := Result * 31 + Field1;
Result := Result * 31 + Field2;
Result := Result * 31 + Field3;
Result := Result * 31 + Field4.GetHashCode;
Result := Result * 31 + Field5.GetHashCode;
Result := Result * 31 + Field6.GetHashCode;
{$IFDEF SET_Q_ON}
{$Q+}
{$UNDEF SET_Q_ON}
{$ENDIF}
end;
TDictionary
作为HashSet来检查重复项procedure Test;
var
arr1, arr2: TArray<TArrayMixed>;
idx : integer;
lst : TDictionary<TArrayMixed, integer>;
begin
// fill the array
SetLength( arr1, 100000 );
for idx := low( arr1 ) to high( arr1 ) do
begin
arr1[ idx ].Field1 := 1 + Random( 5 );
arr1[ idx ].Field2 := 1 + Random( 5 );
arr1[ idx ].Field3 := 1 + Random( 5 );
arr1[ idx ].Field4 := 'String' + IntToStr( idx mod 5 );
arr1[ idx ].Field5 := 'Another string' + IntToStr( idx mod 5 );
arr1[ idx ].Field6 := 'New string' + IntToStr( idx mod 5 );
end;
// distinct
lst := TDictionary<TArrayMixed, integer>.Create( TEqualityComparer<TArrayMixed>.Construct(
function( const L, R: TArrayMixed ): Boolean
begin
Result := ( L = R );
end,
function( const i: TArrayMixed ): integer
begin
Result := i.GetHashCode( );
end ) );
try
for idx := low( arr1 ) to high( arr1 ) do
begin
lst.AddOrSetValue( arr1[ idx ], 0 );
end;
arr2 := lst.Keys.ToArray;
finally
lst.Free;
end;
end;
答案 1 :(得分:2)
我会做这样的事情。您基本上可以动态创建结果,并使用二进制搜索同时排序以删除重复项。
> python poemscript.py danger poem.txt
1 In time of danger, not before.
2 The danger passed and all things righted,
要使用此代码,您可以执行以下操作:
function RemoveDuplicates(aSourceArray: TArray<TArrayMixed>): TArray<TArrayMixed>;
var
i: Integer;
index: Integer;
sortList: TList<TArrayMixed>;
begin
sortList := TList<TArrayMixed>.Create;
try
for i := Low(aSourceArray) to High(aSourceArray) do
begin
if not sortList.BinarySearch(aSourceArray[i], index,
TDelegatedComparer<TArrayMixed>.Construct(
function(const L, R: TArrayMixed): integer
begin
Result := L.Field1 - R.Field1;
if Result <> 0 then Exit;
Result := L.Field2 - R.Field2;
if Result <> 0 then Exit;
Result := L.Field3 - R.Field3;
if Result <> 0 then Exit;
Result := CompareStr(L.Field4, R.Field4);
if Result <> 0 then Exit;
Result := CompareStr(L.Field5, R.Field5);
if Result <> 0 then Exit;
Result := CompareStr(L.Field6, R.Field6);
end)) then
begin
sortList.Insert(index, aSourceArray[i]);
end;
end;
Result := sortList.ToArray;
finally
sortList.Free;
end;
end;
答案 2 :(得分:1)
由于您已经对ArrayMixed
进行了排序,因此您无需将每个项目相互比较以查找重复项。重复项已经彼此相邻放置。因此,您只需迭代ArrayMixed
并将当前项目与ArrayMixed_tmp
中的最后一项进行比较。
因此,复制不同的项目可以更快地进行,看起来像这样:
vIdx := 0;
for i := Low(ArrayMixed) to High(ArrayMixed) do
begin
if (vIdx = 0) or // the first item can never be a duplicate
(ArrayMixed_tmp[vIdx].Field1 <> ArrayMixed[i].Field1) or
(ArrayMixed_tmp[vIdx].Field2 <> ArrayMixed[i].Field2) or
(ArrayMixed_tmp[vIdx].Field3 <> ArrayMixed[i].Field3) or
(ArrayMixed_tmp[vIdx].Field4 <> ArrayMixed[i].Field4) or
(ArrayMixed_tmp[vIdx].Field5 <> ArrayMixed[i].Field5) or
(ArrayMixed_tmp[vIdx].Field6 <> ArrayMixed[i].Field6) then
begin
ArrayMixed_tmp[vIdx] := ArrayMixed[i];
Inc(vIdx);
end;
end;
答案 3 :(得分:1)
只检查先前索引旁边的重复项,因为数组已排序。这里也是重复使用的排序比较器。
function RemoveDuplicates(const anArray: array of TArrayMixed): TArray<TArrayMixed>;
var
j, vIdx: integer;
begin
// Sort
TArray.Sort<TArrayMixed > (anArray, TComparer<TArrayMixed >.Construct(function(const Left, Right: TArrayMixed): Integer
begin
Result := MyCompareAMixed(Left, Right);
end
));
// Distinct
SetLength(Result, Length(anArray));
vIdx := 0;
j := 0;
while (j <= High(anArray) do
begin
Result[vIdx] := anArray[j];
Inc(j);
While (j <= High(anArray)) and (MyCompareAMixed(Result[vIdx],anArray[j]) = 0) do
Inc(j);
Inc(vIdx);
end;
SetLength(Result, vIdx);
end;
<强>更新强>:
在对问题的更新中,声明数组仅部分排序。减少重复次数的迭代次数的一种方法是:
答案 4 :(得分:1)
您尚未发布 MyCompareAMixed()函数的代码,因此无法测试实际代码的性能,包括此未定义函数,包括当前的排序性能。
但是,由于您发布的重复检测方法不依赖于排序的数组,我只是从代码中删除了排序。
在没有任何进一步优化的情况下,所产生的重复数据删除过程在不到50毫秒的时间内完成,这不是“慢速”#34;在我的书中重复删除100,000个复杂的项目。即不是单个值,而是多个值记录的项目。
如果出于其他原因需要进行排序,那么您可以根据其他人给出的答案保留排序并优化重复数据删除过程,但我首先会质疑为什么您认为过程很慢以及是否真的超过50毫秒很慢,你的目标是什么?
有可能是添加开销的排序(正如我所说的,没有你的比较功能,我们无法量化它正在添加的开销)所以如果由于其他原因而不需要这种排序,并且如果sub 50-msecs对于重复数组这个数组是可以接受的,然后我将继续执行其他任务。