我有一种情况,我收到一个字节数组,可能是一种word文档(有三种可能性):
目前我正在处理情况1和2。如果按1)解析失败,我暂时假定该文档是HTML。遗憾的是,文档中包含一些被转义的HTML标记(作为文本内容),我需要把它们传递给其他人开发的处理代码(CMS导入等)。
下面是我目前使用Apache POI对1)的实现;解析失败时会改用字符串操作,以满足我对 Poetry 和 Article 标签的处理需求。
如果使用正则表达式更有效地执行此操作,请提供建议。
private void handleWordDocument(Byte[] bytes) {
//Parse DOC file
WordExtractor extractor = null;
try
{
InputStream fis = new ByteArrayInputStream(bytes);
HWPFDocument document = new HWPFDocument(fis);
extractor = new WordExtractor(document);
StringBuilder poetryIntroAndContent = new StringBuilder();
String text = extractor.getText(); //Escaped HTML looks liks HTML as text
attributions.setAttributionsFile(text.getBytes("UTF-8"));
}
catch (Exception exep)
{
LOG.info("Parsing word file failed using Apache POI for word97:" + exep.toString()+". Will attempt to parse file with HTML reader after unescaping html tags.");
try {
String content = new String(attributions.getAttributionsFile(),"UTF-8");
//content = content.replaceAll("<[/]?Poetry(.*?)>","<Poetry$0>").replaceAll("Poetry<","").replaceAll("");
String[] segments = content.split("<");
StringBuilder stringBuilder = new StringBuilder();
String extraDebug = "";
for(String segment: segments){
int endTag = -1;
// //TODO: MBAK: Remove this
// if(System.currentTimeMillis()/1000 < 1481328000){ //December 10, 2016
// extraDebug = "-Testing Translation" + System.currentTimeMillis();
// }
if(segment.length() < 5){ stringBuilder.append(segment+extraDebug); continue;}
String segmentSubstring = segment.substring(0,6);
if(!(
segmentSubstring.toLowerCase().equals("poetry") ||
segmentSubstring.toLowerCase().equals("/poetr") ||
segmentSubstring.toLowerCase().equals("articl") ||
segmentSubstring.toLowerCase().equals("/artic")
)
){
stringBuilder.append(segment); continue;
}
String[] segments2 = segment.split(">");
if(segments2.length < 2){ stringBuilder.append(segment); continue; }
segments2[0] = "<" + segments2[0].replaceAll(""","\"") + ">";
int counter=0;
for(String subsegment: segments2){
stringBuilder.append(subsegment); }
}
attributions.setAttributionsFile(stringBuilder.toString().getBytes("UTF-8"));
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
}
}