我有一个 UTF-8 编码的 XML 文件,其中包含类似 &#xC3;&#x192;&#xC2;&#xA7; 这样的字符引用代码(本应表示带软音符 cedilla 的 ç 等字符)。
我编写了下面的代码段,用来删除这些代码或将其替换为可接受的值。
1.有更好的方法吗?
2.当我在一些大型XML文件(> 50MB)上运行这段代码时,可能会出现内存不足(OOM)错误。如果没有更好的方法,我该如何优化它以避免OOM错误?
<!---
  Reads ./xs.xml (resolved relative to the current template) entirely into
  the variable "myfile", rewrites specific numeric character reference
  sequences to plain ASCII substitutes, strips any remaining references,
  and writes the result to ./xs_new.xml.
  NOTE(review): the whole file is held in memory as one string, and every
  ReReplace call below produces a new full-size string.
--->
<cffile
action="read"
file="#ExpandPath('./xs.xml')#"
variable="myfile"/>
<!---
  Each statement below replaces one specific run of numeric character
  references with an ASCII stand-in. "##" is the escaped form of a literal
  "#" inside a CFML string.
  NOTE(review): these sequences look like mis-decoded (double-encoded) UTF-8
  punctuation and accented letters; confirm against the data source.
  Order matters: the variants that include a leading/trailing space must run
  before the shorter pattern for the same sequence, otherwise the shorter
  pattern would consume the match first.
--->
<cfset myfile =ReReplace(myfile,'&##xC2;&##x2013;','.','all')/>
<cfset myfile =ReReplace(myfile,'&##xC2;&##x2019;','''','all')/>
<cfset myfile =ReReplace(myfile,'&##xC2;&##x201D;','"','all')/>
<cfset myfile =ReReplace(myfile,'&##xC3;&##x192;&##xC2;&##xA7;','c','all')/>
<cfset myfile =ReReplace(myfile,'&##xC3;&##xA7;','c','all')/>
<cfset myfile =ReReplace(myfile,'&##xC3;&##xA9;','e','all')/>
<cfset myfile =ReReplace(myfile,'&##xC3;&##x201A;&##xC2;&##x2022;','(*)','all')/>
<!--- The trailing \? matches a literal question mark following the sequence. --->
<cfset myfile =ReReplace(myfile,'&##xC3;&##x192;&##xC2;&##x201A;\?','(*)','all')/>
<cfset myfile =ReReplace(myfile,'&##xC3;&##x201A;&##xC2;&##xB7;','-','all')/>
<cfset myfile =ReReplace(myfile,'&##xC3;&##x201A;&##xC2;&##x2018;','''','all')/>
<cfset myfile =ReReplace(myfile,' &##xC3;&##x201A;&##xC2;&##x201C;',' "','all')/>
<cfset myfile =ReReplace(myfile,'&##xE2;&##x20AC;&##x201C;','-','all')/>
<cfset myfile =ReReplace(myfile,'&##xE2;&##x20AC;&##x2122;','''','all')/>
<!--- Space-prefixed occurrence becomes an opening quote; any other occurrence becomes a dash. --->
<cfset myfile =ReReplace(myfile,' &##xE2;&##x20AC;&##x153;',' "','all')/>
<cfset myfile =ReReplace(myfile,'&##xE2;&##x20AC;&##x153;','-','all')/>
<!--- Space-suffixed occurrence becomes a closing quote; any other occurrence becomes a dash. --->
<cfset myfile =ReReplace(myfile,'&##xE2;&##x20AC;&##xFFFD; ','" ','all')/>
<cfset myfile =ReReplace(myfile,'&##xE2;&##x20AC;&##xFFFD;','-','all')/>
<cfset myfile =ReReplace(myfile,'&##xE2;&##x201E;&##xA2;','(TM)','all')/>
<cfset myfile =ReReplace(myfile,'&##xE2;&##x20AC;&##xA2;','(*)','all')/>
<cfset myfile =ReReplace(myfile,'&##xEF;&##x201A;&##xA7;','(*)','all')/>
<!---
  Catch-all: removes every remaining "&#" ... ";" sequence that the specific
  replacements above did not handle. Must stay last.
--->
<cfset myfile =ReReplace(myfile,'(&##[^;]*;)','','all')/>
<!--- Write the cleaned text to a new file; the input file is not modified. --->
<cffile action="write"
file="#ExpandPath('./xs_new.xml')#"
output="#myfile#"/>
感谢
答案 0 :(得分:1)
使用ColdFusion的文件函数一次处理一行,而不是将整个内容读入内存:
<cfscript>
/*
 * Streams ./xs.xml to ./xs_new.xml one line at a time so the whole document
 * never has to be held in memory. On each line, specific runs of numeric
 * character references are replaced with ASCII stand-ins and any remaining
 * "&#...;" reference is removed.
 *
 * Notes:
 *  - "##" is the escaped form of a literal "#" inside a CFML string.
 *  - Replacement order matters: the variants that include a leading or
 *    trailing space must run before the shorter pattern for the same
 *    sequence, and the catch-all removal must run last.
 *  - Both files are opened as UTF-8 explicitly, because the input is stated
 *    to be UTF-8 and the platform default charset may differ.
 */
myfile = FileOpen(ExpandPath('./xs.xml'), "read", "utf-8");
try {
    myNewFile = FileOpen(ExpandPath('./xs_new.xml'), "write", "utf-8");
    try {
        while (NOT FileIsEOF(myfile)) {
            // FileReadLine returns the line without its terminator.
            line = FileReadLine(myfile);

            line = ReReplace(line,'&##xC2;&##x2013;','.','all');
            line = ReReplace(line,'&##xC2;&##x2019;','''','all');
            line = ReReplace(line,'&##xC2;&##x201D;','"','all');
            line = ReReplace(line,'&##xC3;&##x192;&##xC2;&##xA7;','c','all');
            line = ReReplace(line,'&##xC3;&##xA7;','c','all');
            line = ReReplace(line,'&##xC3;&##xA9;','e','all');
            line = ReReplace(line,'&##xC3;&##x201A;&##xC2;&##x2022;','(*)','all');
            // The trailing \? matches a literal question mark after the sequence.
            line = ReReplace(line,'&##xC3;&##x192;&##xC2;&##x201A;\?','(*)','all');
            line = ReReplace(line,'&##xC3;&##x201A;&##xC2;&##xB7;','-','all');
            line = ReReplace(line,'&##xC3;&##x201A;&##xC2;&##x2018;','''','all');
            line = ReReplace(line,' &##xC3;&##x201A;&##xC2;&##x201C;',' "','all');
            line = ReReplace(line,'&##xE2;&##x20AC;&##x201C;','-','all');
            line = ReReplace(line,'&##xE2;&##x20AC;&##x2122;','''','all');
            // Space-prefixed occurrence -> opening quote; any other -> dash.
            line = ReReplace(line,' &##xE2;&##x20AC;&##x153;',' "','all');
            line = ReReplace(line,'&##xE2;&##x20AC;&##x153;','-','all');
            // Space-suffixed occurrence -> closing quote; any other -> dash.
            line = ReReplace(line,'&##xE2;&##x20AC;&##xFFFD; ','" ','all');
            line = ReReplace(line,'&##xE2;&##x20AC;&##xFFFD;','-','all');
            line = ReReplace(line,'&##xE2;&##x201E;&##xA2;','(TM)','all');
            line = ReReplace(line,'&##xE2;&##x20AC;&##xA2;','(*)','all');
            line = ReReplace(line,'&##xEF;&##x201A;&##xA7;','(*)','all');
            // Catch-all: drop every remaining "&#...;" reference. Must stay last.
            line = ReReplace(line,'(&##[^;]*;)','','all');

            // Write to the output handle; FileWriteLine re-adds the line
            // terminator that FileReadLine removed.
            FileWriteLine(myNewFile, line);
        }
    } finally {
        // Close the output even if a read/replace/write step throws.
        FileClose(myNewFile);
    }
} finally {
    // Close the input even if opening or writing the output failed.
    FileClose(myfile);
}
</cfscript>