我正在编写并行代码来枚举大量的CSV文件,每个文件都包含历史股票数据(超过6500个符号),并计算每个股票是否达到历史最高点。
我已经实现了一个线程池和一个TThread后代类,把符号列表平均分配给各个线程,然后把这些线程分别分配给我的i7机器上各自独立(SEPARATE)的核心。每个线程在开始运行之前就已经拿到它所需的全部数据的副本(线程先以挂起状态创建,数据准备好之后再恢复运行),因此线程在处理过程中不需要任何锁定或数据封送。所有线程完成后,我再把每个线程的结果数据汇总到主程序中。
我目前已经使用 https://stackoverflow.com/questions/6072269/need-multi-threading-memory-manager/6076407#6076407 中提到的多个多线程内存管理器测试了我的代码。到目前为止,SapMM似乎是最有效的,而且不会导致访问冲突。
问题是:增加线程数并不能成比例地缩短计算所有高点所需的时间。使用2个各自绑定到独立核心的线程并不能把运行时间缩短到1/2,3个线程也达不到1/3,而4个线程则远远达不到1/4。
线程数 1, 2, 3, 4
预计加速后的时间(mm:ss) 6:37, 3:17(1/2), 2:12(1/3), 1:39(1/4)
实际时间(mm:ss) 6:37, 4:07, 3:05, 2:51
我已经到了需要一些额外见解的地步,才能让这个操作获得完整的加速。我需要深入了解多核加速为什么会落后,而不仅仅是围绕问题边缘的泛泛之谈。那么,是什么原因导致这段代码无法获得与线程数成比例的收益,我又需要做些什么才能实现这些收益?另外,是否有其他方法可以加快我正在进行的解析,例如不使用TMemoryStream?
我使用的代码如下。
我正在使用Delphi XE4 Enterprise。
在每个线程中,我遍历每个符号并且:
步骤(1)已经过测试,几乎不花时间(把全部6500个文件一次性加载到内存中所需的总时间不到1秒)。所有时间都花在步骤(2)所使用的例程上,该例程列在下面:
unit uTest;
implementation
uses
SysUtils, Math, Classes;
type
// One day's values as parsed from a single CSV row.
// WriteItemAsFieldValue fills one member per column: Date via ExStrToDate,
// every other member via StrToFloat.
TDayIndexData = record
Date: TDate;
Open, High, Low, Close, AdjClose,
Volume: extended;
end;
type
// Calendar unit of a look-back period.
TTimeUnit = (tuDay, tuWeek, tuMonth, tuYear);
// A look-back span; presumably "Length" counts of "TimeUnit" (interpreted by
// MoveDateBack, which is not visible here - confirm there).
TTimePeriod = record
Length: integer;
TimeUnit: TTimeUnit;
end;
//#NO CHANGE
const
// PeriodStr value that ShallowEquityNewHighInfoRetrieval compares against to
// decide that no begin date applies (every row is scanned).
AllDataPeriodStr = 'All Data';
type
// Look-back period argument of ShallowEquityNewHighInfoRetrieval.
// PeriodStr is compared with AllDataPeriodStr; when it differs, TimePeriod is
// passed to MoveDateBack to compute the earliest date of interest.
TRatePeriod = record
PeriodStr: string;
TimePeriod: TTimePeriod;
end;
type
// Meaning of a CSV column. The header row maps each column index to one of
// these values (see ResolveCurColFieldType).
TFieldType = (ftDate, ftOpen, ftHigh, ftLow, ftClose, ftVolume, ftAdjClose);
// Characters treated as separators between fields: every control character
// #0..#31 (which includes CR and LF), the comma, and #127.
const CSV_DELIM_CHARSET = [#0..#31, ',',#127];
type
// Result of ShallowEquityNewHighInfoRetrieval.
// Success: False on entry; set to True for a single-row file, otherwise set
//          from FoundAllNeededData after the scan.
// High:    largest "High" column value among the rows that were examined.
TShallowEquityNewHighInfoRetrievalResults = record
Success: boolean;
High: extended;
end;
// Scans a CSV stream of daily rows (header + one row per day) and returns the
// highest "High" value found within the requested period.
//
// AStream        - source stream; rewound to the beginning on entry and
//                  navigated with Seek/Read (never written).
// ARatePeriod    - period to cover; PeriodStr = AllDataPeriodStr scans all rows,
//                  otherwise TimePeriod is handed to MoveDateBack.
// AGetNormalData - not referenced anywhere in the visible body.
//
// Raises EInvalidOperation on unexpected end of data, an unrecognized or
// missing header column, or a header with no data after it.
function ShallowEquityNewHighInfoRetrieval(
AStream: TStream;
ARatePeriod: TRatePeriod;
AGetNormalData: boolean = False): TShallowEquityNewHighInfoRetrievalResults;
var
// Copy of AStream.Size taken by InitParsingState; read by EOF and NextChars.
vStreamSize: int64;
// True when the stream position is at or past the cached size (vStreamSize).
// The result is only meaningful once vStreamSize has been assigned.
function EOF: boolean;
begin
Result := AStream.Position >= vStreamSize;//AStream.Size;
end;
// Moves the stream position to the end of the stream.
procedure GotoEOF;
begin
AStream.Seek(0, soFromEnd);
end;
//#OPTIMIZE
//var
//vBuffer: FileString;
type
// Character and string types used for raw stream content.
FileChar = AnsiChar;
FileString = AnsiString;
const
// Size in bytes of one FileChar; used to convert character counts to byte offsets.
ResultCharSize = SizeOf(FileChar);
var
// Most recently read character; only ReadNextChar writes it.
MRReadChar: FileChar;
// Reads one character from the stream into MRReadChar.
// Raises EInvalidOperation when the stream is already at EOF.
procedure ReadNextChar;
begin
  if EOF then
    raise EInvalidOperation.Create('Unexpected end of file found');
  AStream.Read(MRReadChar, SizeOf(MRReadChar));
end;
var
  // True while the reader may be sitting on delimiter characters that still
  // have to be skipped; cleared after a skip so the skip is not repeated.
  vPossDelimChars: boolean;

// Advances past a run of delimiter characters, leaving the first
// non-delimiter character in MRReadChar. Does nothing when no delimiters are
// pending or the stream is at EOF (in the EOF case the flag is left untouched).
procedure SkipExistingDelimChars;
begin
  if vPossDelimChars and not EOF then
  begin
    repeat
      ReadNextChar;
    until EOF or not (MRReadChar in CSV_DELIM_CHARSET);
    // MRReadChar may still be a delimiter when the loop stopped on EOF,
    // but in that case the character is never used.
    vPossDelimChars := False;
  end;
end;
// True when the stream is positioned at its first byte.
// Not called anywhere in the visible code.
function SOF: boolean;
begin
Result := AStream.Position = 0;
end;
// Peeks at up to ACount characters starting at the current position and
// returns them without moving the stream position (reads, then seeks back).
// Fewer characters are returned when less than ACount remain before the
// cached end of the stream.
function NextChars(ACount: integer): FileString;
var
  vCharsLeft: Int64;
begin
  // ResultCharSize is 1 for AnsiChar, so the division and the multiplications
  // below are identities; they keep the routine correct for wider characters.
  vCharsLeft := (vStreamSize{AStream.Size} - AStream.Position) div ResultCharSize;
  SetLength(Result, Min(ACount, vCharsLeft));
  AStream.Read(Pointer(Result)^, Length(Result) * ResultCharSize);
  AStream.Seek(-Length(Result) * ResultCharSize, soFromCurrent);
end;
// Moves the stream position forward by ACount characters.
procedure GotoNextChars(ACount: integer);
begin
  // ResultCharSize = SizeOf(FileChar); for single-byte characters this is ACount bytes.
  AStream.Seek(ACount * ResultCharSize, soFromCurrent);
end;
// Moves the stream position backward by ACount characters.
procedure GotoPrevChars(ACount: integer);
begin
  // ResultCharSize = SizeOf(FileChar); for single-byte characters this is ACount bytes.
  AStream.Seek(-ACount * ResultCharSize, soFromCurrent);
end;
// Walks backwards from the current position to the line break that precedes it.
//
// Stops either ON a CRLF pair (position left at the #13) or, for a bare LF
// line ending, just AFTER the byte preceding the #10. Marks delimiters as
// pending so the caller can skip the line break with SkipExistingDelimChars.
//
// ForItem - when True, the first backward step jumps NMinRowChars characters
//           at once (no row can be shorter than that) instead of one.
//
// Assumes the start of the stream is never reached and that a trailing CRLF
// at the very end of the file is handled by the caller.
procedure GotoPreceedingEOLN(ForItem: boolean = False);
var
  vOrigPos: Int64;   // Int64 to match TStream.Position (was integer)
  vPeek: FileString; // the two characters at the current position
const
  NMinRowChars = 17;//Length('3-13-13,1,1,1,1,1')
begin
  vOrigPos := AStream.Position;
  vPossDelimChars := True;
  while True do
  begin
    // NextChars does not move the position, so one peek per step is enough;
    // the original peeked (and heap-allocated a string) up to three times.
    vPeek := NextChars(2);

    // Found a CRLF somewhere before the starting position: done.
    if (vPeek = #13#10) and (AStream.Position <> vOrigPos) then
      Break;

    // Bare LF line ending: second peeked character is #10 and we are at least
    // two characters before the starting position.
    if (Length(vPeek) = 2) and (vPeek[2] = #10) and
       (AStream.Position < vOrigPos - SizeOf(FileChar)) then
    begin
      GotoNextChars(1);
      Exit;
    end;

    if (AStream.Position = vOrigPos) and ForItem then
      GotoPrevChars(NMinRowChars)
    else
      GotoPrevChars(1);
  end;
end;
var
CurField: string;
CurCol: integer;
// Resets the parser state and advances to the first non-delimiter character,
// which is left in MRReadChar.
procedure InitParsingState;
begin
  // The size must be cached BEFORE anything is read: SkipExistingDelimChars
  // and ReadNextChar both call EOF, which compares against vStreamSize.
  // Previously it was assigned last, so the first EOF tests used an
  // uninitialized value.
  vStreamSize := AStream.Size;
  CurCol := -1;
  vPossDelimChars := True;
  SkipExistingDelimChars;
end;
// Repositions the reader so that MRReadChar holds the character at APos
// (or at APos - 1 when ASafeMode is True) and the stream sits just after it.
// Also clears the pending-delimiter flag and sets CurCol to the last column,
// so the next GetNextFieldValue wraps around to column 0.
procedure BacktrackTo(APos: integer; ASafeMode: boolean = False);
var
  vTarget: integer;
begin
  vTarget := APos;
  if ASafeMode then
    vTarget := Pred(vTarget);
  AStream.Seek(vTarget, soFromBeginning);
  ReadNextChar;
  vPossDelimChars := False;
  CurCol := Ord(High(TFieldType));
end;
// Appends the content of a double-quoted field to CurField.
// Expects MRReadChar to hold the opening quote on entry. A quote that directly
// follows a closing quote is kept as one literal quote character.
// ReadNextChar raises EInvalidOperation if the stream ends before the closing
// quote is found.
procedure ReadQuotedText;
var
vHadPrevQuoteChar: boolean;
begin
vHadPrevQuoteChar := False;
while MRReadChar = '"' do
begin
// On the second and later passes this quote immediately followed a closing
// quote, so it is part of the text.
if vHadPrevQuoteChar then
CurField := CurField + MRReadChar;
ReadNextChar;
// Copy characters up to the next quote.
while MRReadChar <> '"' do
begin
CurField := CurField + MRReadChar;
ReadNextChar;
end;
// The closing quote was the last character of the stream.
if EOF then
break;
// Read the character after the closing quote; if it is another quote the
// outer loop runs again.
ReadNextChar;
vHadPrevQuoteChar := True;
end;
end;
// Reads the next field into CurField and advances CurCol.
// Expects MRReadChar to hold the first character of the field. On return the
// delimiters after the field have been skipped, so MRReadChar holds the first
// character of the following field (unless the stream ended).
// Does nothing at all when the stream is already at EOF.
procedure GetNextFieldValue;
begin
if EOF then Exit;
// Column index cycles through 0 .. Ord(High(TFieldType)).
CurCol := (CurCol+1) mod Succ(Ord(High(TFieldType)));
CurField := '';
if MRReadChar = '"' then
ReadQuotedText else
begin
// Unquoted field: append the current character, then read the next one,
// until a delimiter is read or the stream ends.
repeat
CurField := CurField + MRReadChar;
if not EOF then
ReadNextChar;
until EOF or (MRReadChar in CSV_DELIM_CHARSET);
// When the loop stopped because the stream ended, the character read last
// has not been appended yet; keep it unless it is a delimiter.
if EOF then
if not (MRReadChar in CSV_DELIM_CHARSET) then
CurField := CurField + MRReadChar;
end;
vPossDelimChars := True;
SkipExistingDelimChars;
end;
var
ColFieldTypes: array [Ord(Low(TFieldType))..Ord(High(TFieldType))] of TFieldType;
// Maps the header text in CurField (case-insensitive) to a TFieldType and
// stores it in ColFieldTypes[CurCol]. Any other name that merely contains
// 'close' is taken to be the adjusted-close column.
// Raises EInvalidOperation for an unrecognized column name.
procedure ResolveCurColFieldType;
var
  vName: string;
  vType: TFieldType;
begin
  vName := LowerCase(CurField);
  if vName = 'date' then
    vType := ftDate
  else if vName = 'open' then
    vType := ftOpen
  else if vName = 'high' then
    vType := ftHigh
  else if vName = 'low' then
    vType := ftLow
  else if vName = 'close' then
    vType := ftClose
  else if vName = 'volume' then
    vType := ftVolume
  else if Pos('close', vName) > 0 then
    vType := ftAdjClose
  else
    raise EInvalidOperation.Create('Unrecognized file format: unrecognized column name found.');
  ColFieldTypes[CurCol] := vType;
end;
// Converts CurField according to the type of the current column and stores it
// in the matching member of AData. The date column goes through ExStrToDate,
// every other column through StrToFloat.
procedure WriteItemAsFieldValue(var AData: TDayIndexData);
var
  vKind: TFieldType;
  vNumber: extended;
begin
  vKind := ColFieldTypes[CurCol];
  if vKind = ftDate then
  begin
    AData.Date := ExStrToDate(CurField);
    Exit;
  end;
  // All remaining column types are numeric.
  vNumber := StrToFloat(CurField);
  case vKind of
    ftOpen:     AData.Open := vNumber;
    ftHigh:     AData.High := vNumber;
    ftLow:      AData.Low := vNumber;
    ftClose:    AData.Close := vNumber;
    ftVolume:   AData.Volume := vNumber;
    ftAdjClose: AData.AdjClose := vNumber;
  end;
end;
// Checks that every TFieldType value was assigned to some header column.
// Raises EInvalidOperation when at least one field type is missing.
procedure VerifyFields;
var
  iField: TFieldType;
  iColumn: integer;
  IsUsedFlags: array [Low(TFieldType)..High(TFieldType)] of boolean;
begin
  //* Set all to false
  for iField := Low(TFieldType) to High(TFieldType) do
    IsUsedFlags[iField] := False;
  //* set found to true
  for iColumn := Low(ColFieldTypes) to High(ColFieldTypes) do
    IsUsedFlags[ColFieldTypes[iColumn]] := True;
  //* throw error on first one not found
  // (the unreachable "break" that used to follow the raise has been removed)
  for iField := Low(TFieldType) to High(TFieldType) do
    if not IsUsedFlags[iField] then
      raise EInvalidOperation.Create('Bad file format: one or more column names are missing!');
end;
// Reads one header field per TFieldType value, records the type of each
// column, and verifies that all field types are present.
// Raises EInvalidOperation when the stream ends right after the header.
procedure LoadHeader;
var
  vColumn: integer;
begin
  for vColumn := Ord(Low(TFieldType)) to Ord(High(TFieldType)) do
  begin
    GetNextFieldValue;
    ResolveCurColFieldType;
  end;
  VerifyFields;
  if EOF then
    raise EInvalidOperation.Create('Cannot complete shallow Equity New High Info Retrieval: Not enough Data');
end;
// Reads one data row - one field per TFieldType value - starting at the
// current position and stores the converted values in ADayData.
procedure LoadRowInto(var ADayData: TDayIndexData);
var
  vColumn: integer;
begin
  for vColumn := Ord(Low(TFieldType)) to Ord(High(TFieldType)) do
  begin
    GetNextFieldValue;
    WriteItemAsFieldValue(ADayData);
  end;
end;
var
// True when the bottom row of the file is dated earlier than the top row
// (decided in DetermineDataOrder).
OrderReversed: boolean;
// vTopDay / vBottomDay: first and last data rows as they appear in the file.
// vFirstDay / vLastDay: assigned from those two in DetermineDataOrder; only
//                       vLastDay is read afterwards in the visible code.
// vEarlierDay: the row most recently loaded by the scan loop.
vTopDay,
vBottomDay,
vFirstDay,
vEarlierDay,
vLastDay: TDayIndexData;
// Earliest date of interest; only assigned when a period other than
// AllDataPeriodStr was requested.
vBeginDate: TDate;
// Stream positions captured by the main body: just before the last row and
// just after the header. vAfterFirstDayPos is not used in the visible code.
vBeforeLastDayPos,
vFirstDayPos,
vAfterFirstDayPos: integer;
// True while the scan loop still has rows to visit.
//
// The stream position is used (rather than a row counter) because the first
// row of the file is not always loaded, due to optimization.
//  - Normal order: the scan walks backwards, so rows remain while the position
//    is still after the header (vFirstDayPos).
//  - Reversed order: the scan walks forwards, so rows remain while at least
//    two characters are left before the end of the stream.
//
// The original reversed-order test was "(pos < size-2) or (pos < size-1)";
// the first term is implied by the second, so only the second is kept. The
// cached vStreamSize is used, like EOF does, instead of re-querying
// AStream.Size on every loop test.
function HasUnprocessedDays: boolean;
begin
  if OrderReversed then
    Result := AStream.Position < vStreamSize - SizeOf(FileChar)
  else
    Result := AStream.Position > vFirstDayPos;
end;
// True while the most recently loaded row is still inside the requested
// period: always for the "all data" period, otherwise while its date is not
// earlier than vBeginDate.
function NotYetCoveredTimePeriod: boolean;
begin
  if ARatePeriod.PeriodStr = AllDataPeriodStr then
    Result := True
  else
    Result := vEarlierDay.Date >= vBeginDate;
end;
// True when the scan covered the whole requested period: always for the
// "all data" period, otherwise when the last loaded row is dated on or before
// vBeginDate.
function FoundAllNeededData: boolean;
begin
  if ARatePeriod.PeriodStr = AllDataPeriodStr then
    Result := True
  else
    Result := vEarlierDay.Date <= vBeginDate;
end;
// Positions the reader on the first character of the last data row.
procedure GotoLastDay;
var
  vTrailing: Int64;
begin
  // Start from the very end and step back to the preceding line break.
  GotoEOF;
  GotoPreceedingEOLN;
  // If that line break is the file's own trailing CRLF (2 characters left) or
  // trailing LF (1 character left), step back once more to the break before
  // the last row.
  vTrailing := AStream.Size - AStream.Position;
  if (vTrailing = SizeOf(FileChar) * 2) or (vTrailing = SizeOf(FileChar)) then
    GotoPreceedingEOLN;
  SkipExistingDelimChars;
end;
// Decides whether the file lists days oldest-first or newest-first and sets
// OrderReversed, vLastDay (and vFirstDay) accordingly.
//
// If the bottom row is less than 2 days old it is assumed to be the latest
// day and the top row is not read at all. Otherwise the top row is loaded and
// the two dates are compared.
procedure DetermineDataOrder;
begin
  //#ASSUMPTION: assume end day at BOTTOM of file if latest data less than 2 days ago
  //Problem when NDays = 2 ?
  if Trunc(Now) - Trunc(vBottomDay.Date) < 2 then
  begin
    OrderReversed := False;
    vLastDay := vBottomDay;
    Exit;
  end;

  // Load the top row so its date can be compared with the bottom row's.
  BacktrackTo(vFirstDayPos, True);
  LoadRowInto(vTopDay);
  OrderReversed := vBottomDay.Date < vTopDay.Date;

  if OrderReversed then
  begin
    // Newest day is at the top; the reader stays after the top row so the
    // scan can continue forwards.
    vFirstDay := vBottomDay;
    vLastDay := vTopDay;
  end else
  begin
    // Newest day is at the bottom; return there so the scan can walk backwards.
    BacktrackTo(vBeforeLastDayPos, True);
    vFirstDay := vTopDay;
    vLastDay := vBottomDay;
  end;
end;
// Loads the row that precedes the current position into vEarlierDay and
// leaves the stream at the line break in front of that row, ready for the
// next backward step.
procedure LoadPrevRow;
var
  vRowBreakPos: integer;
begin
  GotoPreceedingEOLN(True);
  // Remember where the line break is before the row is consumed.
  vRowBreakPos := AStream.Position;
  SkipExistingDelimChars;
  LoadRowInto(vEarlierDay);
  AStream.Seek(vRowBreakPos, soFromBeginning);
end;
// Main body: read the header, load the last row, work out the row order, then
// walk through the rows of the requested period keeping the largest High.
begin
//* Initialize
Result.Success := False;
AStream.Seek(0, soFromBeginning);
InitParsingState;
//** Load CSV Header
LoadHeader;
// Position just after the header; compared against later to detect a
// single-row file and the end of a backward scan.
vFirstDayPos := AStream.Position;
//** Get Last Day
GotoLastDay;
vBeforeLastDayPos := AStream.Position;
LoadRowInto(vBottomDay);
//** IF Only 1 Data Day:
if vFirstDayPos = vBeforeLastDayPos then
begin
//return results
Result.Success := True;
Result.High := vBottomDay.High;
Exit;
end;
//** Go back to Last Day in File
BacktrackTo(vBeforeLastDayPos);
//** Determine what order the data is in
DetermineDataOrder;
//** Determine Date to scan back to if opted for
// vBeginDate stays unassigned for the "all data" period; the period checks
// do not read it in that case.
if ARatePeriod.PeriodStr <> AllDataPeriodStr then
vBeginDate := MoveDateBack(vLastDay.Date, ARatePeriod.TimePeriod);
//* Initialize Loop Variables
Result.High := vLastDay.High;
vEarlierDay := vLastDay;
while HasUnProcessedDays and NotYetCoveredTimePeriod do
begin
//** Goto Previous Day's Row
// Reversed files are read forwards row by row; normal files are walked
// backwards one row at a time.
if OrderReversed then
LoadRowInto(vEarlierDay) else
LoadPrevRow;
//** Update High
// Re-check the period: the row just loaded may already be before vBeginDate.
if NotYetCoveredTimePeriod then
Result.High := Max(Result.High, vEarlierDay.High);
end;
Result.Success := FoundAllNeededData;
end;
end.
下面是一个示例CSV。请注意,有时在文件中以相反顺序找到CSV行项目(最新日期优先)。
Date,Open,High,Low,Close,Volume,Adj Close
11/3/2014,12,12.06,11.75,11.98,19700,11.98
11/4/2014,12,12,10.62,11.55,39200,11.55
11/5/2014,11.6,11.85,11.6,11.85,3100,11.85
11/6/2014,11.85,11.85,11.85,11.85,0,11.85
11/7/2014,11.5,11.5,10.35,11,35900,11
11/10/2014,11.12,11.12,11.12,11.12,200,11.12
11/11/2014,11.5,11.5,11.5,11.5,200,11.5
11/12/2014,11.75,11.85,11.15,11.45,3500,11.45
11/13/2014,11.45,11.45,11.45,11.45,0,11.45
11/14/2014,11.45,11.45,11.45,11.45,0,11.45
11/17/2014,11.07,11.28,11.07,11.28,1600,11.28
11/18/2014,11.07,11.74,11.06,11.74,8100,11.74
11/19/2014,11.1,11.5,11,11.5,11600,11.5
11/20/2014,11.1,11.5,11.1,11.5,3100,11.5
11/21/2014,11.49,11.5,11.23,11.25,15100,11.25
11/24/2014,11.25,11.35,11.25,11.25,900,11.25
11/25/2014,11.48,11.5,11.25,11.5,355300,11.5
11/26/2014,11.75,11.75,11.5,11.5,261300,11.5
11/28/2014,11.75,11.8,11.75,11.8,16300,11.8
12/1/2014,11.25,11.8,11.02,11.5,23800,11.5
12/2/2014,11.6,11.6,11.47,11.5,57600,11.5
12/3/2014,11.57,11.75,11.41,11.69,240700,11.69
12/4/2014,11.74,11.75,11.49,11.65,41100,11.65
12/5/2014,11.65,11.85,11.56,11.8,267200,11.8
12/8/2014,11.8,11.85,11.68,11.8,168700,11.8
答案 0 :(得分:0)
首先,尝试将英特尔线程构建模块(Intel TBB)作为内存管理器。它至少可以扩展到16个核心(我在《Why multithreaded memory allocate/deallocate intensive application does not scale with number of threads?》中遇到过类似的问题)。
通常,即使使用英特尔TBB,也要避免在线程的主执行循环中动态分配/释放内存。这些操作的扩展性总是很差。
输入数据可以在不同的硬盘之间分配,如果它们连接到不同的控制器,则更好。如果您事先知道大小,即在线程循环中处理之前,输入数据可以是内存映射。
尽可能优化单线程处理(通过性能分析),然后找出不随线程数扩展的部分(把线程数作为参数)。这些部分必须重写。I/O读取操作可以进行缓存,和/或以n倍簇大小的块为单位读取数据。
这些相当一般性的建议,来自我在Windows 7上处理TB级输入数据(最多12个超线程核心、最多128 GB内存)时积累的经验。