我正在尝试使用pdfBox从hdfs的pdf文件中提取文本。
但是它会引发错误:
"Exception in thread "main" org.apache.spark.SparkException: ...
java.io.FileNotFoundException: /nnAlias:8020/tmp/sample.pdf
(No such file or directory)"
我想念什么?我应该使用PortableDataStream而不是以下内容的字符串部分吗?
val files: RDD[(String, PortableDataStream)]
?
def pdfRead(fileNameFromRDD: (String, PortableDataStream), sparkSession: SparkSession) = {
val file: File = new File(fileNameFromRDD._1.drop(5))
val document = PDDocument.load(file); //It throws an error here.
if (!document.isEncrypted()) {
val stripper = new PDFTextStripper()
val text = stripper.getText(document)
println("Text:" + text)
}
document.close()
}
//This is where I call the above pdf to text converter method.
val files = sparkSession.sparkContext.binaryFiles("hdfs://nnAlias:8020/tmp/sample.pdf")
files.foreach(println)
files.foreach(f => println(f._1))
files.foreach(fileStream => pdfRead(fileStream, sparkSession))