我正在尝试使用 Apache Tika 从可能同时包含图像和其他内容的 PDF 或 Word 文件中提取文本。如何只提取这些文件中的文本内容？使用 Tika 需要引入哪些依赖项？这是我编写的 Java 代码：
package secondp;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
/**
 * Small command-line demo that prints the plain text of a document using Apache Tika.
 *
 * <p>The document format is detected from the file content, so the same code path
 * serves PDF and Word files (and any other format for which a Tika parser is on the
 * classpath).
 */
public class trial {

    /** File that is parsed when no path is supplied on the command line. */
    private static final String DEFAULT_FILE = "test.pdf";

    /**
     * Extracts the text of one document and prints it to standard output.
     *
     * @param args optional; {@code args[0]} is the path of the document to parse.
     *             When absent, {@code test.pdf} in the working directory is used.
     */
    public static void main(final String[] args) {
        final String fileName = args.length > 0 ? args[0] : DEFAULT_FILE;
        try {
            System.out.println(trial.convert(fileName));
        } catch (final Exception e) {
            // Top-level boundary of a demo program: report the failure and exit normally.
            e.printStackTrace();
        }
    }

    /**
     * Parses the given file and returns its textual body content.
     *
     * @param fileName path of the document to read
     * @return the extracted text with leading and trailing whitespace removed
     * @throws IOException   if the file cannot be opened or read
     * @throws SAXException  if the content handler fails while receiving parse events
     * @throws TikaException if Tika cannot parse the document
     */
    public static String convert(final String fileName) throws IOException, SAXException, TikaException {
        try (final FileInputStream inputstream = new FileInputStream(new File(fileName))) {
            // -1 disables the handler's write limit; the no-arg constructor caps the
            // collected text at 100,000 characters and throws a SAXException beyond that.
            final BodyContentHandler handler = new BodyContentHandler(-1);
            // AutoDetectParser picks the concrete parser from the detected media type,
            // instead of forcing every input through the PDF parser.
            new AutoDetectParser().parse(inputstream, handler, new Metadata(), new ParseContext());
            return handler.toString().trim();
        }
    }
}