我有一个大文本文件(大约100MB),每行用CR字符分隔,而不是CRLF。
我尝试使用TStringList.LoadFromFile()或ReadLn(F,..)逐行读取此文本文件,但这两种方法都要求这些行由CRLF分隔。
您是否有任何有效且快速的方法来阅读此类文本文件?
感谢。
PS:我使用的是Delphi 7。
答案 0 :(得分:8)
这应该这样做。
将文本文件读入内存流。
然后用内容填充字符串列表。
textList.Text
接受CR
,LF
和CRLF
的任意组合形成一条线。
function MemoryStreamToString( M : TMemoryStream) : string;
begin
SetString( Result,PChar(M.Memory),M.Size div SizeOf(Char)); // Works in all Delphi versions
end;
var
memStream : TMemoryStream;
textList : TStringList;
begin
textList := TStringList.Create;
try
memStream:= TMemoryStream.Create;
try
memStream.LoadFromFile('mytextfile.txt');
textList.Text := MemoryStreamToString( memStream); // any combination of CR,LF,CRLF interprets as a line
finally
memStream.Free;
end;
// do something with textList
finally
textList.Free;
end;
end;
答案 1 :(得分:4)
我一直想要解决这个问题,所以我写了一个,它是JvCsvDataSet的一部分。我的问题是:
如果你想这样做,你可以写下这些:
procedure TForm1.Button1Click(Sender: TObject);
var
ts:TTextStream;
s:String;
begin
ts := TTextStream.Create('c:\temp\test.txt', fm_OpenReadShared);
try
while not ts.Eof do begin
s := ts.ReadLine;
doSomethingWith(s);
end;
finally
ts.Free;
end;
end;
好。那看起来很容易吗?它是。它甚至有一个文件模式标志(注意那里的读共享选项?)。现在您只需要Teh Codez For TTextStream,它们就在这里:
unit textStreamUnit;
{$M+}
{$R-}
{
textStreamUnit
This code is based on some of the content of the JvCsvDataSet written by Warren Postma, and others,
licensed under MOZILLA Public License.
}
interface
uses
Windows,
Classes,
SysUtils;
const
cQuote = #34;
cLf = #10;
cCR = #13;
{ File stream mode flags used in TTextStream }
{ Significant 16 bits are reserved for standard file stream mode bits. }
{ Standard system values like fmOpenReadWrite are in SysUtils. }
fm_APPEND_FLAG = $20000;
fm_REWRITE_FLAG = $10000;
{ combined Friendly mode flag values }
fm_Append = fmOpenReadWrite or fm_APPEND_FLAG;
fm_OpenReadShared = fmOpenRead or fmShareDenyWrite;
fm_OpenRewrite = fmOpenReadWrite or fm_REWRITE_FLAG;
fm_Truncate = fmCreate or fm_REWRITE_FLAG;
fm_Rewrite = fmCreate or fm_REWRITE_FLAG;
TextStreamReadChunkSize = 8192; // 8k chunk reads.
resourcestring
RsECannotReadFile = 'Cannot read file %';
type
ETextStreamException = class(Exception);
{$ifndef UNICODE}
RawByteString=AnsiString;
{$endif}
TTextStream = class(TObject)
private
FStream: TFileStream; // Tried TJclFileStream also but it was too slow! Do NOT use JCL streams here. -wpostma.
FFilename: string;
FStreamBuffer: PAnsiChar;
FStreamIndex: Integer;
FStreamSize: Integer;
FLastReadFlag: Boolean;
procedure _StreamReadBufInit;
public
function ReadLine: RawByteString; { read a string, one per line, wow. Text files. Cool eh?}
procedure Append;
procedure Rewrite;
procedure Write(const s: RawByteString); {write a string. wow, eh? }
procedure WriteLine(const s: RawByteString); {write string followed by Cr+Lf }
procedure WriteChar(c: AnsiChar);
procedure WriteCrLf;
//procedure Write(const s: string);
function Eof: Boolean; {is at end of file? }
{ MODE is typically a fm_xxx constant thatimplies a default set of stream mode bits plus some extended bit flags that are specific to this stream type.}
constructor Create(const FileName: string; Mode: DWORD = fm_OpenReadShared; Rights: Cardinal = 0); reintroduce; virtual;
destructor Destroy; override;
function Size: Int64; //override; // sanity
{ read-only properties at runtime}
property Filename: string read FFilename;
property Stream: TFileStream read FStream; { Get at the underlying stream object}
end;
implementation
// 2 gigabyte file limit workaround:
function GetFileSizeEx(h: HFILE; FileSize: PULargeInteger): BOOL; stdcall; external Kernel32;
procedure TTextStream.Append;
begin
Stream.Seek(0, soFromEnd);
end;
constructor TTextStream.Create(const FileName: string; Mode: DWORD; Rights: Cardinal);
var
IsAppend: Boolean;
IsRewrite: Boolean;
begin
inherited Create;
FFilename := FileName;
FLastReadFlag := False;
IsAppend := (Mode and fm_APPEND_FLAG) <> 0;
IsRewrite := (Mode and fm_REWRITE_FLAG) <> 0;
FStream := TFileStream.Create(Filename, {16 lower bits only}Word(Mode), Rights);
//Stream := FStream; { this makes everything in the base class actually work if we inherited from Easy Stream}
if IsAppend then
Self.Append // seek to the end.
else
Stream.Position := 0;
if IsRewrite then
Rewrite;
_StreamReadBufInit;
end;
destructor TTextStream.Destroy;
begin
if Assigned(FStream) then
FStream.Position := 0; // avoid nukage
FreeAndNil(FStream);
FreeMem(FStreamBuffer); // Buffered reads for speed.
inherited Destroy;
end;
function TTextStream.Eof: Boolean;
begin
if not Assigned(FStream) then
Result := False
//Result := True
else
Result := FLastReadFlag and (FStreamIndex >= FStreamSize);
//Result := FStream.Position >= FStream.Size;
end;
{ TTextStream.ReadLine:
This reads a line of text, normally terminated by carriage return and/or linefeed
but it is a bit special, and adapted for CSV usage because CR/LF characters
inside quotes are read as a single line.
This is a VERY PERFORMANCE CRITICAL function. We loop tightly inside here.
So there should be as few procedure-calls inside the repeat loop as possible.
}
function TTextStream.ReadLine: RawByteString;
var
Buf: array of AnsiChar;
n: Integer;
QuoteFlag: Boolean;
LStreamBuffer: PAnsiChar;
LStreamSize: Integer;
LStreamIndex: Integer;
procedure FillStreamBuffer;
begin
FStreamSize := Stream.Read(LStreamBuffer[0], TextStreamReadChunkSize);
LStreamSize := FStreamSize;
if LStreamSize = 0 then
begin
if FStream.Position >= FStream.Size then
FLastReadFlag := True
else
raise ETextStreamException.CreateResFmt(@RsECannotReadFile, [FFilename]);
end
else
if LStreamSize < TextStreamReadChunkSize then
FLastReadFlag := True;
FStreamIndex := 0;
LStreamIndex := 0;
end;
begin
{ Ignore linefeeds, read until carriage return, strip carriage return, and return it }
SetLength(Buf, 150);
n := 0;
QuoteFlag := False;
LStreamBuffer := FStreamBuffer;
LStreamSize := FStreamSize;
LStreamIndex := FStreamIndex;
while True do
begin
if n >= Length(Buf) then
SetLength(Buf, n + 100);
if LStreamIndex >= LStreamSize then
FillStreamBuffer;
if LStreamIndex >= LStreamSize then
Break;
Buf[n] := LStreamBuffer[LStreamIndex];
Inc(LStreamIndex);
case Buf[n] of
cQuote: {34} // quote
QuoteFlag := not QuoteFlag;
cLf: {10} // linefeed
if not QuoteFlag then
Break;
cCR: {13} // carriage return
begin
if not QuoteFlag then
begin
{ If it is a CRLF we must skip the LF. Otherwise the next call to ReadLine
would return an empty line. }
if LStreamIndex >= LStreamSize then
FillStreamBuffer;
if LStreamBuffer[LStreamIndex] = cLf then
Inc(LStreamIndex);
Break;
end;
end
end;
Inc(n);
end;
FStreamIndex := LStreamIndex;
SetString(Result, PAnsiChar(@Buf[0]), n);
end;
procedure TTextStream.Rewrite;
begin
if Assigned(FStream) then
FStream.Size := 0;// truncate!
end;
function TTextStream.Size: Int64; { Get file size }
begin
if Assigned(FStream) then
GetFileSizeEx(FStream.Handle, PULargeInteger(@Result)) {int64 Result}
else
Result := 0;
end;
{ Look at this. A stream that can handle a string parameter. What will they think of next? }
procedure TTextStream.Write(const s: RawByteString);
begin
Stream.Write(s[1], Length(s)); {The author of TStreams would like you not to be able to just write Stream.Write(s). Weird. }
end;
procedure TTextStream.WriteChar(c: AnsiChar);
begin
Stream.Write(c, SizeOf(AnsiChar));
end;
procedure TTextStream.WriteCrLf;
begin
WriteChar(#13);
WriteChar(#10);
end;
procedure TTextStream.WriteLine(const s: RawByteString);
begin
Write(s);
WriteCrLf;
end;
procedure TTextStream._StreamReadBufInit;
begin
if not Assigned(FStreamBuffer) then
begin
//FStreamBuffer := AllocMem(TextStreamReadChunkSize);
GetMem(FStreamBuffer, TextStreamReadChunkSize);
end;
end;
end.
答案 2 :(得分:0)
如果我没弄错,你需要在从文件中读取文本之前设置stringlist的LineBreak属性。
....
const
CR = #13;
LF = #10;
LFCR = #10#13;
begin
MyStringList.LineBreak:= CR;
MyStringList.LoadFromFile(.....
请参阅:http://docwiki.embarcadero.com/VCL/XE2/en/Classes.TStrings.LineBreak
不是100%肯定Delphi 7支持这个(刚检查过,D2007这样做,我怀疑D7也会这样)。
答案 3 :(得分:0)
看看这是否有帮助:
https://stackoverflow.com/a/2957614/1046041
初看起来,似乎你可以将代码中的EOL字符更改为其他内容然后#13#10。
它还逐行解析(您可以将其用作缓冲区),而不是将整个文件加载到内存中(这可能是100MB +文件的问题)。