Good morning. I am writing a Spark application in Scala. It has to run a shared library (libfreebayes.so) on the nodes of a distributed cluster; libfreebayes.so wraps an external program written in C++ called freebayes. However, the following error occurs:
java.lang.UnsatisfiedLinkError: Native Library /usr/lib/libfreebayes.so already loaded in another classloader
The CreateFreebayesInput method has to run on a per-partition basis. Is loading libfreebayes.so for every partition the problem? (A sketch of what I mean by loading it only once per executor JVM is included after the FreeBayesJni class below.) The application works correctly in Spark local mode; how can I get it to work in yarn-cluster mode? I cannot sleep because of this problem. Please help me. :-<
import java.io.{File, FileReader, FileWriter, PrintWriter}
import org.apache.spark.rdd.RDD
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object sparkFreebayes {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("sparkFreebayes")
    val sc = new SparkContext(conf)
    val appId = sc.applicationId
    val appName = sc.appName
    val referencePath = "/mnt/data/hg38.fa"
    val input = sc.textFile(args(0))
    val executerNum = args(1).toInt
    val outputDir = args(2)
    val inputDir = "/mnt/partitionedSam/"

    // Split the SAM input into header lines (starting with "@") and alignment records.
    val header = input.filter(x => x.startsWith("@"))
    val body = input.filter(x => !x.startsWith("@"))

    // Key each alignment by its reference name (SAM column 3) and sort within partitions.
    val partitioned = body.map { x => (x.split("\t")(2), x) }
      .repartitionAndSortWithinPartitions(new tmpPartitioner(executerNum))
      .persist()
    val cHeader = header.collect.mkString("\n")
    val sorted = partitioned.map(x => x._2)

    CreateFreebayesInput(sorted)

    // Writes each partition to its own SAM file and runs freebayes on it through JNI.
    def CreateFreebayesInput(sortedRDD: RDD[String]) = {
      sortedRDD.mapPartitionsWithIndex { (idx, iter) =>
        val tmp = iter.toList
        val outputPath = outputDir + "/" + appId + "_Out_" + idx + ".vcf"
        val tmp2 = List(cHeader) ++ tmp
        val samString = tmp2.mkString("\n")
        val jni = new FreeBayesJni
        val file = new File(inputDir + "partitioned_" + idx + ".sam")
        val fw = new FileWriter(file)
        fw.write(samString)
        fw.close()
        if (file.exists() || file.length() != 0) {
          // The native library is loaded here, i.e. in every partition/task.
          System.loadLibrary("freebayes")
          val freebayesParameter = Array("-f", "/mnt/data/hg38.fa", file.getPath, "-file", outputPath)
          jni.freebayes_native(freebayesParameter.length, freebayesParameter)
          //runFreebayes(file.getPath, referencePath, outputPath)
        }
        tmp2.productIterator
      }.collect()
    }
  }
}
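The tmpPartitioner used above is not included in my post. Purely as a hypothetical sketch of what the repartitionAndSortWithinPartitions call assumes (my real class may differ), it is a custom Partitioner along these lines, mapping the reference-name key to one of the requested partitions:

// Hypothetical sketch only; the actual tmpPartitioner is not shown in this post.
class tmpPartitioner(numParts: Int) extends org.apache.spark.Partitioner {
  override def numPartitions: Int = numParts
  override def getPartition(key: Any): Int = {
    // Non-negative hash modulo; the real class may instead pin specific
    // chromosome names to fixed partitions.
    val h = key.hashCode % numParts
    if (h < 0) h + numParts else h
  }
}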
The FreeBayesJni class is as follows:
class FreeBayesJni {
  @native def freebayes_native(argc: Int, args: Array[String]): Int
}
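To make the question concrete: by "loading per partition" I mean the System.loadLibrary call inside mapPartitionsWithIndex above. The alternative I am asking about would look roughly like the hypothetical sketch below, where the library is loaded at most once per executor JVM (NativeLoader is not part of my current code, and I do not know whether this avoids the classloader error on YARN):

// Hypothetical sketch, not my current code: a singleton that loads the native
// library once per classloader/JVM. A Scala object initializer runs only once,
// so every task that touches NativeLoader shares the same load.
object NativeLoader {
  try {
    System.loadLibrary("freebayes")
  } catch {
    // Assumption: if another classloader in the same executor JVM has already
    // loaded libfreebayes.so, ignore the error instead of failing the task.
    case _: UnsatisfiedLinkError => ()
  }
  def ensureLoaded(): Unit = () // touching this forces the object initializer
}

Inside mapPartitionsWithIndex I would then call NativeLoader.ensureLoaded() instead of System.loadLibrary("freebayes"). Is that the right direction for yarn-cluster mode?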
My spark-submit command is:
spark-submit --class partitioning --master yarn-cluster ScalaMvnProject.jar FullOutput_sorted.sam 7 /mnt/OutVcf
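For completeness, here is a variant of the same command that points the driver and executors at the directory holding libfreebayes.so; the /usr/lib path is taken from the error message, and I am not sure whether this is relevant to the classloader problem:

spark-submit --class partitioning --master yarn-cluster \
  --conf spark.driver.extraLibraryPath=/usr/lib \
  --conf spark.executor.extraLibraryPath=/usr/lib \
  ScalaMvnProject.jar FullOutput_sorted.sam 7 /mnt/OutVcf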
Thank you.