我在使用 Hadoop DistCp 时遇到了以下问题,非常感谢任何建议或帮助。
我正在尝试将数据从 Google Cloud Platform(gs:// 存储桶)复制到 Amazon S3。
1)当有多个文件需要从源复制到目标时(工作正常)
val sourcefile : String = "gs://XXXX_-abc_account2621/abc_account2621_click_20170616*.csv.gz" // Multiple files to copy (we have * in the file name)
Output: S3://S3bucketname/xxx/xxxx/clientid=account2621/date=2017-08-18/
Files in above path
abc_account2621_click_2017061612_20170617_005852_572560033.csv.gz
abc_account2621_click_2017061616_20170617_045654_572608350.csv.gz
abc_account2621_click_2017061622_20170617_103107_572684922.csv.gz
abc_account2621_click_2017061623_20170617_120235_572705834.csv.gz
2)当只有一个文件需要从源复制到目标时(出现问题)
val sourcefile : String = "gs://XXXX_-abc_account2621/abc_account2621_activity_20170618_20170619_034412_573362513.csv.gz"
Output:S3://S3bucketname/xxx/xxxx/clientid=account2621/
Files in above path
date=2017-08-18 (the directory is replaced by a single file that holds the copied file's content, and it has no file extension)
代码:
/**
 * Entry point: builds the S3 destination path from the command-line arguments,
 * prints the source and destination, and hands both to `executeDistCp`.
 *
 * Expected arguments, in order: environment, customer, typesoftables, clientid,
 * filedate. Arguments beyond the fifth are ignored.
 *
 * @param args command-line arguments; at least five are required
 * @throws IllegalArgumentException if fewer than five arguments are supplied
 */
def main(args: Array[String]): Unit = {
  // Fail with a readable usage message instead of the scala.MatchError that the
  // bare Array(...) destructuring raises when arguments are missing.
  require(
    args.length >= 5,
    s"Expected 5 arguments (environment, customer, typesoftables, clientid, filedate) but got ${args.length}"
  )
  // The first argument (environment) is accepted but not used in this method.
  val Array(_, customer, typesoftables, clientid, filedate) = args.take(5)
  val S3Path: String = s"$customer/$typesoftables/clientid=$clientid/date=$filedate/"
  val sourcefile: String = "gs://XXXX_-abc_account2621//abc_account2621_activity_20170618_20170619_034412_573362513.csv.gz"
  val destination: String = "s3n://S3bucketname/" + S3Path
  println(sourcefile)
  println(destination)
  // executeDistCp expects the source at index 0 and the destination at index 1.
  val filepaths: Array[String] = Array(sourcefile, destination)
  executeDistCp(filepaths)
}
/**
 * Configures Hadoop for Google Cloud Storage and S3 (s3n) access, resets the
 * destination directory, and runs DistCp with the given arguments.
 *
 * The destination (index 1) is deleted recursively and then re-created as an
 * empty directory before the copy starts. Re-creating it is the fix reported for
 * the single-source-file case, where the destination path otherwise ended up as
 * a file holding the copied content instead of a directory containing the file.
 *
 * @param filepaths arguments passed to DistCp; index 0 is the source and index 1
 *                  is used as the destination path
 */
def executeDistCp(filepaths: Array[String]): Unit = {
  val conf: Configuration = new Configuration()

  // Google Cloud Storage connector and service-account authentication.
  conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
  conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
  conf.set("google.cloud.auth.service.account.enable", "true")
  conf.set("fs.gs.project.id", "XXXX-XXXX")
  conf.set("google.cloud.auth.service.account.json.keyfile", "/tmp/XXXXX.json")

  // S3 (s3n) credentials.
  // NOTE(review): credentials are hard-coded here; consider supplying them from
  // outside the source file.
  conf.set("fs.s3n.awsAccessKeyId", "XXXXXXXXXXXX")
  conf.set("fs.s3n.awsSecretAccessKey", "XXXXXXXXXXXXXX")

  // Built from a list so the value is one comma-separated string; the original
  // literal was broken across two lines, which is not a valid Scala string.
  val applicationClasspath: String = Seq(
    "$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*",
    "$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*",
    "/usr/lib/hadoop-lzo/lib/*",
    "/usr/share/aws/emr/emrfs/conf",
    "/usr/share/aws/emr/emrfs/lib/*",
    "/usr/share/aws/emr/emrfs/auxlib/*",
    "/usr/share/aws/emr/lib/*",
    "/usr/share/aws/emr/ddb/lib/emr-ddb-hadoop.jar",
    "/usr/share/aws/emr/goodies/lib/emr-hadoop-goodies.jar",
    "/usr/share/aws/emr/cloudwatch-sink/lib/*",
    "/usr/share/aws/aws-java-sdk/*",
    "/tmp/gcs-connector-latest-hadoop2.jar"
  ).mkString(",")
  conf.set("mapreduce.application.classpath", applicationClasspath)
  // NOTE(review): this sets a Configuration property named HADOOP_CLASSPATH, not
  // the process environment variable - confirm it has the intended effect.
  conf.set("HADOOP_CLASSPATH", "$HADOOP_CLASSPATH:/tmp/gcs-connector-latest-hadoop2.jar")

  val outputDir: Path = new Path(filepaths(1))
  val outputFs = outputDir.getFileSystem(conf)
  // Remove any previous output, then re-create the directory so that a single
  // source file is copied into it rather than written in its place.
  outputFs.delete(outputDir, true)
  outputFs.mkdirs(outputDir)

  val distCp: DistCp = new DistCp(conf, null)
  val exitCode: Int = ToolRunner.run(distCp, filepaths)
  // Report a failed copy instead of silently dropping the exit status.
  if (exitCode != 0) {
    System.err.println(s"DistCp finished with non-zero exit code $exitCode")
  }
}
}
答案 0 :(得分:0)
添加以下代码(预先创建目标目录)后,上述问题已解决:
代码
// Create the destination directory (filepaths(1)) on its own file system.
val destinationDir: Path = new Path(filepaths(1))
val destinationFs = destinationDir.getFileSystem(conf)
destinationFs.mkdirs(destinationDir)