public class TikaParserImpl implements TikaParser
{
private AutoDetectParser parser;
public TikaParserImpl()
{
parser = new AutoDetectParser();
}
@Override
public String parse(InputStream in) throws IOException, SAXException, TikaException
{
TransformerHandler handler;
StringWriter writer = new StringWriter();
try
{
handler = getTransformerHandler(writer);
}
catch (TransformerConfigurationException e)
{
throw new RuntimeException(e);
}
Metadata metadata = new Metadata();
ParseContext context = new ParseContext();
parser.parse(in, handler, metadata, context);
String[] metanames = metadata.names();
for (int i = 0; i < metanames.length; i++) {
String[] metavalues = metadata.getValues(metanames[i]);
for (int j = 0; j < metavalues.length; j++) {
LOG.warn(metanames[i] + ": " + metavalues[j]);
}
}
String output = writer.toString();
return output;
}
/**
* Get a TransformerHandler.
*
* The handler is used to convert SAX events to a XHTML document.
*
* Since this object might not be multi-threaded, it is re-created at each
* parsing.
*
* @param writer
* @return TransformerHandler
* @throws TransformerConfigurationException
*/
private TransformerHandler getTransformerHandler(StringWriter writer) throws TransformerConfigurationException
{
// Even the factory might not be multi-threaded, so we don't take chance
SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
handler.getTransformer().setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
handler.setResult(new StreamResult(writer));
return handler;
}
}
我是tika的新手,我正在编写的这段代码并非我写的,所以我也是新手。这里的期望是XHTML是从这个东西解析生成的。对于某些文档(例如:Office的xls格式),一切都很好。对于其他人(例如:Office的xlsx格式),我没有在XHTML中获取元数据,但它仍然存在于元数据的实例中(如您所见,我记录了元数据类的内容)
不幸的是,我不能使用更新版本的Tika,因为这会破坏我所坚持的Nutch版本。