I'm new to the Spark world and Job Server.
When I push the jar with spark-jobserver and execute it, I get an error on the spark-jobserver terminal. My code:
package spark.jobserver

import java.nio.ByteBuffer

import scala.collection.JavaConversions._
import scala.collection.immutable.Map
import scala.collection.mutable.ListBuffer
import scala.util.Try

import org.apache.cassandra.hadoop.ConfigHelper
import org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat
import org.apache.cassandra.hadoop.cql3.CqlConfigHelper
import org.apache.cassandra.hadoop.cql3.CqlOutputFormat
import org.apache.cassandra.utils.ByteBufferUtil
import org.apache.hadoop.mapreduce.Job

import com.typesafe.config.{Config, ConfigFactory}
import org.apache.spark._
import org.apache.spark.SparkContext._

object CassandraCQLTest extends SparkJob {

  /** Allows the job to be run standalone for local testing. */
  def main(args: Array[String]) {
    val sc = new SparkContext("local[4]", "CassandraCQLTest")
    sc.addJar("/extra_data/spark-cassandra-connector/spark-cassandra-connector-java/target/scala-2.10/spark-cassandra-connector-java-assembly-1.3.0-SNAPSHOT.jar")
    val config = ConfigFactory.parseString("")
    val results = runJob(sc, config)
    println("Result is " + results)
  }

  /** Job Server calls this before runJob; reject the job if input.string is missing. */
  override def validate(sc: SparkContext, config: Config): SparkJobValidation = {
    Try(config.getString("input.string"))
      .map(x => SparkJobValid)
      .getOrElse(SparkJobInvalid("No input.string config param"))
  }

  override def runJob(sc: SparkContext, config: Config): Any = {
    val cHost: String = "localhost"
    val cPort: String = "9160"
    val KeySpace = "retail"
    val InputColumnFamily = "ordercf"
    val OutputColumnFamily = "salecount"

    // The Hadoop Job is used only as a carrier for the Cassandra input/output configuration.
    val job = new Job()
    job.setInputFormatClass(classOf[CqlPagingInputFormat])
    ConfigHelper.setInputInitialAddress(job.getConfiguration(), cHost)
    ConfigHelper.setInputRpcPort(job.getConfiguration(), cPort)
    ConfigHelper.setInputColumnFamily(job.getConfiguration(), KeySpace, InputColumnFamily)
    ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner")
    CqlConfigHelper.setInputCQLPageRowSize(job.getConfiguration(), "3")
    /** CqlConfigHelper.setInputWhereClauses(job.getConfiguration(), "user_id='bob'") */

    /** An UPDATE writes one or more columns to a record in a Cassandra column family. */
    val query = "UPDATE " + KeySpace + "." + OutputColumnFamily + " SET sale_count = ? "
    CqlConfigHelper.setOutputCql(job.getConfiguration(), query)
    job.setOutputFormatClass(classOf[CqlOutputFormat])
    ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KeySpace, OutputColumnFamily)
    ConfigHelper.setOutputInitialAddress(job.getConfiguration(), cHost)
    ConfigHelper.setOutputRpcPort(job.getConfiguration(), cPort)
    ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner")

    // Read the input column family as an RDD of (key columns -> value columns).
    val casRdd = sc.newAPIHadoopRDD(job.getConfiguration(),
      classOf[CqlPagingInputFormat],
      classOf[java.util.Map[String, ByteBuffer]],
      classOf[java.util.Map[String, ByteBuffer]])

    // Extract (prod_id, quantity) pairs and sum the quantities per product.
    val productSaleRDD = casRdd.map {
      case (key, value) =>
        (ByteBufferUtil.string(value.get("prod_id")), ByteBufferUtil.toInt(value.get("quantity")))
    }
    val aggregatedRDD = productSaleRDD.reduceByKey(_ + _)
    aggregatedRDD.collect().foreach {
      case (productId, saleCount) => println(productId + ":" + saleCount)
    }

    // Convert each aggregate into the (key map, bound-variable list) shape CqlOutputFormat expects.
    val casoutputCF = aggregatedRDD.map {
      case (productId, saleCount) =>
        val outColFamKey = Map("prod_id" -> ByteBufferUtil.bytes(productId))
        val outKey: java.util.Map[String, ByteBuffer] = outColFamKey
        val outColFamVal = new ListBuffer[ByteBuffer]
        outColFamVal += ByteBufferUtil.bytes(saleCount)
        val outVal: java.util.List[ByteBuffer] = outColFamVal
        (outKey, outVal)
    }

    // Write the aggregates back to the output column family via the UPDATE query above.
    casoutputCF.saveAsNewAPIHadoopFile(
      KeySpace,
      classOf[java.util.Map[String, ByteBuffer]],
      classOf[java.util.List[ByteBuffer]],
      classOf[CqlOutputFormat],
      job.getConfiguration()
    )
    casRdd.count
  }
}
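For reference, this is how such a job is usually pushed to spark-jobserver over its REST API; a sketch assuming the default port 8090, with "cqltest" as a placeholder app name and a placeholder jar path. Note that the validate() method above rejects the job unless the config contains an input.string entry:

    # Upload the assembled job jar under the (placeholder) app name "cqltest"
    curl --data-binary @target/scala-2.10/cassandra-cql-test.jar localhost:8090/jars/cqltest

    # Run the job; pass input.string so validate() returns SparkJobValid
    curl -d "input.string = anything" \
      'localhost:8090/jobs?appName=cqltest&classPath=spark.jobserver.CassandraCQLTest'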
I have already added the $EXTRA_JAR variable pointing to my cassandra-spark-connector-assembly.
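Besides $EXTRA_JAR, spark-jobserver can also load dependency jars through the dependent-jar-uris context setting. A minimal sketch, assuming a Job Server version that supports this setting; the context name is a placeholder and the jar path is the one from the code above:

    # in the job server's HOCON config file
    contexts {
      cassandra-context {
        num-cpu-cores = 2
        memory-per-node = 512m
        # loaded when the context is created, similar to sc.addJar
        dependent-jar-uris = ["file:///extra_data/spark-cassandra-connector/spark-cassandra-connector-java/target/scala-2.10/spark-cassandra-connector-java-assembly-1.3.0-SNAPSHOT.jar"]
      }
    }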
Answer 0 (score: 0)
When you submit your program to Spark, you need to include all the dependent jar files as a comma-separated list. Suppose your project structure looks like this:
simpleapp
  - src/main/java
    - org.apache.spark.examples
      - SimpleApp.java
  - lib
    - dependent.jars (you can put all dependent jars inside the lib directory)
  - target
    - simpleapp.jar (after compiling your source)
Then you can use the command below:
spark-submit --jars $(echo lib/*.jar | tr ' ' ',' ) --class org.apache.spark.examples.SimpleApp --master local[2] target/simpleapp.jar
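Alternatively, you can avoid maintaining the --jars list by bundling everything into a single fat jar with sbt-assembly; a sketch, and the plugin and Spark versions shown are assumptions for a Scala 2.10 project:

    // project/plugins.sbt
    addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2")

    // build.sbt -- mark Spark itself as "provided" so it is not bundled
    libraryDependencies += "org.apache.spark" %% "spark-core" % "1.3.0" % "provided"

Running sbt assembly then produces one self-contained jar that you can hand to spark-submit (or to spark-jobserver) without any --jars argument.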
You can also check the jar distribution in the Spark web console: go to your application -> Environment and verify whether the jar files loaded by Spark are listed there.
Answer 1 (score: 0)
CqlPagingInputFormat lives in cassandra-all version 2.0.4 and is not found in other versions. At runtime, your application is picking up a Cassandra version greater than 2.0.4. You have to add this dependency to your pom:
    <dependency>
        <groupId>org.apache.cassandra</groupId>
        <artifactId>cassandra-all</artifactId>
        <version>2.0.4</version>
    </dependency>
to get this class.
But I can't guarantee that everything else will work correctly.
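If you build with sbt rather than Maven, the equivalent dependency line in build.sbt would be:

    libraryDependencies += "org.apache.cassandra" % "cassandra-all" % "2.0.4"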