Since I want to use Apache Tika to extract data from .doc files, I run the following code:
import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.{AutoDetectParser, ParseContext}
import org.apache.tika.sax.WriteOutContentHandler
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.input.PortableDataStream
import java.io._
object ParserData {

  def main(args: Array[String]): Unit = {
    val filesPath = "Path/Input/files"
    val conf = new SparkConf().setMaster("local[2]").setAppName("ParserData")
    val sc = new SparkContext(conf)
    // Read every file under filesPath as (path, PortableDataStream) pairs
    val fileData = sc.binaryFiles(filesPath)
    fileData.foreach(x => tikaFunc(x))
  }

  def tikaFunc(a: (String, PortableDataStream)): Unit = {
    // a._1 is a URI like "file:/..."; drop(5) strips the "file:" prefix
    val file = new File(a._1.drop(5))
    val myparser = new AutoDetectParser()
    val stream: InputStream = new FileInputStream(file)
    val handler = new WriteOutContentHandler(-1) // -1 disables the output length limit
    val metadata = new Metadata()
    val context = new ParseContext()

    myparser.parse(stream, handler, metadata, context)
    stream.close()

    val content = handler.toString()
    println(content)
    println("------------------------------------------------")

    // Count the number of input files
    val count = Option(new File("Path/Output/files").list).map(_.size).getOrElse(0)
    // Write the extracted content of this one file into every output file
    for (i <- 0 until count) {
      val writer = new PrintWriter(new FileOutputStream(s"Path/Output/files$i.txt"), true)
      writer.println(content)
      writer.close()
    }
  }
}
The output I need is the extracted data of each .doc file written to its own .txt file. Instead, my code gives me the output of only one of the .doc files in every generated .txt file. In the IDE the printed output is correct, but the generated files are not.
If anyone has any idea how I can get the required output, please help!
Thanks a lot!
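
For reference, a minimal sketch of one way to get one .txt per input .doc: instead of looping over the whole output directory on every call, derive a single output path from the input file's name inside tikaFunc. The output directory Path/Output and the naming scheme are assumptions for illustration, not part of the original code:

  def tikaFunc(a: (String, PortableDataStream)): Unit = {
    val file = new File(a._1.drop(5))
    val myparser = new AutoDetectParser()
    val stream: InputStream = new FileInputStream(file)
    val handler = new WriteOutContentHandler(-1)
    myparser.parse(stream, handler, new Metadata(), new ParseContext())
    stream.close()

    // One output file per input file, named after the input
    // (assumes Path/Output already exists)
    val outName = file.getName.replaceAll("\\.doc$", "") + ".txt"
    val writer = new PrintWriter(new FileOutputStream(s"Path/Output/$outName"), true)
    writer.println(handler.toString())
    writer.close()
  }

With this shape, each document's text is written exactly once to its own file, so later inputs no longer overwrite earlier results.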