Scala - 比较2个hdfs文件

时间:2018-06-14 15:55:42

标签: scala hadoop hdfs

我已经进步到下面,使用scala比较2个文件。我正在尝试打印不常见的行,但readHDFSFile上的foreach似乎无法正常工作。对此有任何帮助。

import java.io.{BufferedReader, FileInputStream, InputStreamReader}
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import scala.util.{Failure, Success, Try}

object DRCompareHDFSFiles {
  def main(args: Array[String]): Unit = {
    println("DBMigrate Main")
    val hdfs = FileSystem.get(new URI("hdfs://localhost:8020/"), new Configuration())
    val path1 = new Path(args(0))
    val path2 = new Path(args(1))
    readHDFSFile(hdfs, path1, path2)
  }

//  Accept a parameter which implements a close method
    def using[A <: { def close(): Unit }, B](resource: A)(f: A => B): B =
    try {
      f(resource)
    } finally {
      resource.close()
    }

  def readHDFSFile(hdfs: FileSystem, path1: Path, path2: Path): Option[Stream[(String,String)]] = {
      Try(
        using(hdfs.open(path1))(readFileStream))
    } match {
    case Success(result) => {
        Try(
          using(hdfs.open(path2))(readFileStream))
        } match {
        case Success(result1) => {
          val notEqualLines = for {
            line1 <- result
            line2 <- result1
            if (line1 != line2)
          } yield (line1,line2)
          Some(notEqualLines)
        }
        case Failure(ex) =>
          println(s"Could not read file $path2, detail ${ex.getClass.getName}:${ex.getMessage}")
          None
        }
    case Failure(ex) =>
      println(s"Could not read file $path1, detail ${ex.getClass.getName}:${ex.getMessage}")
      None
    }

  def readFileStream(fis: FSDataInputStream)= {
    val inFile = new BufferedReader(new InputStreamReader(fis))
    def readLines = Stream.cons(inFile.readLine(), Stream.continually(inFile.readLine()))
    readLines
  }
}

在readHDFSFile中获取共同行后,如何获取行。 foreach似乎不起作用 感谢

0 个答案:

没有答案