我们经常用另一个“好”字符替换文件中不需要的字符。
界面是:
procedure cleanfileASCII2(vfilename: string; vgood: integer; voutfilename: string);
用我们可能调用的空格替换所有非desirables, cleanfileASCII2(original.txt,32,cleaning.txt)
问题在于这需要相当长的时间。在那儿 比展示更好的方法吗?
procedure cleanfileASCII2(vfilename: string; vgood: integer; voutfilename:
string);
var
F1, F2: file of char;
Ch: Char;
tempfilename: string;
i,n,dex: integer;
begin
//original
AssignFile(F1, vfilename);
Reset(F1);
//outputfile
AssignFile(F2,voutfilename);
Rewrite(F2);
while not Eof(F1) do
begin
Read(F1, Ch);
//
n:=ord(ch);
if ((n<32)or(n>127))and (not(n in [10,13])) then
begin // bad char
if vgood<> -1 then
begin
ch:=chr(vgood);
Write(F2, Ch);
end
end
else //good char
Write(F2, Ch);
end;
CloseFile(F2);
CloseFile(F1);
end;
答案 0 :(得分:6)
问题与你如何处理缓冲区有关。内存传输是任何操作中最昂贵的部分。在这种情况下,您将逐字节查看文件。通过更改为块读或缓冲读取,您将实现速度的大幅提升。请注意,正确的缓冲区大小取决于您所在的位置。对于网络文件,由于TCP / IP数据包大小的限制,您会发现极大的缓冲区可能效率较低。即使这对于gigE的大包也变得有点模糊,但是,一如既往,最好的结果就是对它进行基准测试。
为了方便起见,我从标准读取转换为文件流。你可以用blockread轻松做同样的事情。在这种情况下,我拿了一个15MB的文件并在你的例行程序中运行它。在本地文件上执行操作需要131,478ms。使用1024缓冲区,需要258ms。
procedure cleanfileASCII3(vfilename: string; vgood: integer; voutfilename:string);
const bufsize=1023;
var
inFS, outFS:TFileStream;
buffer: array[0..bufsize] of byte;
readSize:integer;
tempfilename: string;
i: integer;
begin
if not FileExists(vFileName) then exit;
inFS:=TFileStream.Create(vFileName,fmOpenRead);
inFS.Position:=0;
outFS:=TFileStream.Create(vOutFileName,fmCreate);
while not (inFS.Position>=inFS.Size) do
begin
readSize:=inFS.Read(buffer,sizeof(buffer));
for I := 0 to readSize-1 do
begin
n:=buffer[i];
if ((n<32)or(n>127)) and (not(n in [10,13])) and (vgood<>-1) then
buffer[i]:=vgood;
end;
outFS.Write(buffer,readSize);
end;
inFS.Free;
outFS.Free;
end;
答案 1 :(得分:2)
一些改进:
这是一个未经测试的攻击(现在没有编译器在我面前):
procedure cleanfileASCII2(vfilename: string; vgood: integer; voutfilename: string);
var
f1, f2: File;
table: array[Char] of Char;
index, inBuffer: Integer;
buffer: array[0..2047] of Char;
c: Char;
begin
for c := #0 to #31 do
table[c] := ' ';
for c := #32 to #127 do
table[c] := c;
for c := #128 to #255 do
table[c] := ' ';
table[#10] := #10; // exception to spaces <32
table[#13] := #13; // exception to spaces <32
AssignFile(F1, vfilename);
Reset(F1, 1);
AssignFile(F2,voutfilename);
Rewrite(F2, 1);
while not Eof(F1) do
begin
BlockRead(f1, buffer, SizeOf(buffer), inBuffer);
for index := 0 to inBuffer - 1 do
buffer[index] := table[buffer[index]];
BlockWrite(f2, buffer, inBuffer);
end;
Close(f2);
Close(f1);
end;
答案 2 :(得分:1)
您可以缓冲输入和输出,这样您就可以将一大块字符(甚至整个文件,如果它不是太大)读入数组,然后处理数组,然后将整个数组写入输出文件。
在大多数情况下,磁盘IO是瓶颈,如果你可以做更少的大读取而不是很多小读取,它会更快。
答案 3 :(得分:1)
缓冲是正确的方法。我修改了你的代码以查看差异:
procedure cleanfileASCII2(vfilename: string; vgood: integer; voutfilename:
string);
var
F1, F2: file;
NumRead, NumWritten: Integer;
Buf: array[1..2048] of Char;
Ch: Char;
i, n: integer;
begin
AssignFile(F1, vfilename);
Reset(F1, 1); // Record size = 1
AssignFile(F2, voutfilename);
Rewrite(F2, 1); // Record size = 1
repeat
BlockRead(F1, Buf, SizeOf(Buf), NumRead);
for i := 1 to NumRead do
begin
Ch := Buf[i];
//
n := ord(ch);
if ((n<32)or(n>127))and (not(n in [10,13])) then
begin // bad char
if vgood <> -1 then
begin
ch := chr(vgood);
Buf[i] := Ch;
end
//else //good char
//Write(F2, Ch);
end;
end;
BlockWrite(F2, Buf, NumRead, NumWritten);
until (NumRead = 0) or (NumWritten <> NumRead);
CloseFile(F1);
CloseFile(F2);
end;
答案 4 :(得分:0)
我是这样做的,确保文件I / O在处理之前一次完成。代码可以用于更新unicode,但它可以处理讨厌的文本字符,例如null,并为您提供TStrings功能。 BRI
procedure TextStringToStringsAA( AStrings : TStrings; const AStr: Ansistring);
// A better routine than the stream 'SetTextStr'.
// Nulls (#0) which might be in the file e.g. from corruption in log files
// do not terminate the reading process.
var
P, Start, VeryEnd: PansiChar;
S: ansistring;
begin
AStrings.BeginUpdate;
try
AStrings.Clear;
P := Pansichar( AStr );
VeryEnd := P + Length( AStr );
if P <> nil then
while P < VeryEnd do
begin
Start := P;
while (P < VeryEnd) and not CharInSet(P^, [#10, #13]) do
Inc(P);
SetString(S, Start, P - Start);
AStrings.Add(string(S));
if P^ = #13 then Inc(P);
if P^ = #10 then Inc(P);
end;
finally
AStrings.EndUpdate;
end;
end;
procedure TextStreamToStrings( AStream : TStream; AStrings : TStrings );
// An alternative to AStream.LoadFromStream
// Nulls (#0) which might be in the file e.g. from corruption in log files
// do not terminate the reading process.
var
Size : Integer;
S : Ansistring;
begin
AStrings.BeginUpdate;
try
// Make a big string with all of the text
Size := AStream.Size - AStream.Position;
SetString( S, nil, Size );
AStream.Read(Pointer(S)^, Size);
// Parse it
TextStringToStringsAA( AStrings, S );
finally
AStrings.EndUpdate;
end;
end;
procedure LoadStringsFromFile( AStrings : TStrings; const AFileName : string );
// Loads this strings from a text file
// Nulls (#0) which might be in the file e.g. from corruption in log files
// do not terminate the reading process.
var
ST : TFileStream;
begin
ST := TFileStream.Create( AFileName, fmOpenRead + fmShareDenyNone);
// No attempt is made to prevent other applications from reading from or writing to the file.
try
ST.Position := 0;
AStrings.BeginUpdate;
try
TextStreamToStrings( ST, AStrings );
finally
AStrings.EndUpdate;
end;
finally
ST.Free;
end;
end;
答案 5 :(得分:0)
不知道在哪里进行优化。
你应该使用Sampling Profiler(delphitools.info)来了解瓶颈在哪里。它很容易使用。
在循环之前预先计算vgood chr转换。
此外,您不需要一些转换:Ord()和Chr()。始终使用'Ch'变量。
if not (ch in [#10, #13, #32..#127]) then
答案 6 :(得分:0)
可能最简单的方法是:
投票此帖子+1如果有帮助请