如何从JavaStreamingContext生成JavaPairInputDStream?

时间:2017-12-18 23:32:41

标签: java apache-spark spark-streaming dstream java-pair-rdd

我正在学习Apache Spark Streaming,并尝试从JavaStreamingContext生成JavaPairInputDStream。以下是我的代码:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
.......    
.......

// Configure a Spark application named "SparkStreamTest" with master "local[*]", then
// wrap it in a streaming context that is given Durations.seconds(3) as its duration.
SparkConf sc = new SparkConf().setAppName("SparkStreamTest").setMaster("local[*]");;
JavaSparkContext jsc = new JavaSparkContext(sc);
JavaStreamingContext jssc = new JavaStreamingContext(jsc, Durations.seconds(3));

// First sample data set: (String key, String value) tuples. Key "K1" appears twice.
List<Tuple2<String, String>> data1 = new ArrayList<Tuple2<String, String>>();
data1.add(new Tuple2<String, String>("K1", "ABC"));
data1.add(new Tuple2<String, String>("K2", "DE"));
data1.add(new Tuple2<String, String>("K1", "F"));
data1.add(new Tuple2<String, String>("K3", "GHI"));

// Turn the first list into a pair RDD.
JavaPairRDD<String, String> pairs1 = jssc.sparkContext().parallelizePairs(data1);

// Second sample data set: (String key, Integer value) tuples.
List<Tuple2<String, Integer>> data2 = new ArrayList<Tuple2<String, Integer>>();
data2.add(new Tuple2<String, Integer>("K1", 123));
data2.add(new Tuple2<String, Integer>("K2", 456));
data2.add(new Tuple2<String, Integer>("K7", 0));

// NOTE(review): this parallelizes data1 again, so data2 is never used and pairs2 is
// built from the same list as pairs1. data2 holds Integer values, so it would not fit
// the JavaPairRDD<String, String> type declared here.
JavaPairRDD<String, String> pairs2 = jssc.sparkContext().parallelizePairs(data1);

// Queue containing both pair RDDs, intended as the input of the queue stream.
Queue<JavaPairRDD<String, String>> inputQueue = new LinkedList<>(Arrays.asList(pairs1, pairs2));

// This is the line the question reports as failing: per the reported error, queueStream
// is declared as queueStream(Queue<JavaRDD<T>>, boolean), which is not applicable to a
// Queue<JavaPairRDD<String, String>> argument.
JavaPairInputDStream<String, String> lines = jssc.queueStream(inputQueue, true);

但是我的应用程序的最后一行抛出了这个异常:

  

类型JavaStreamingContext中的方法queueStream(Queue<JavaRDD<T>>, boolean)不适用于参数(Queue<JavaPairRDD<String,String>>, boolean)

我不知道如何使用JavaStreamingContext生成JavaPairInputDStream。

1 个答案:

答案 0 :(得分:0)

如果您查看JavaStreamingContext类的queueStream方法,就会发现它接受java.util.Queue<JavaRDD<T>>作为队列参数。我修改了你的程序以获得Queue<JavaRDD<T>>队列。queueStream方法返回JavaInputDStream<T>类型,以下是将其转换为JavaPairDStream<String,String>的方法。

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import scala.Tuple2;

public class SparkStreamTest {
    public static void main(String[] args) throws Exception {
        SparkConf sc = new SparkConf().setAppName("SparkStreamTest").setMaster("local[*]");;
        JavaStreamingContext jssc = new JavaStreamingContext(sc, Durations.seconds(5));
        //first data list
        List<Tuple2<String, String>> data1 = new ArrayList<Tuple2<String, String>>();
        data1.add(new Tuple2<String, String>("K1", "ABC"));
        data1.add(new Tuple2<String, String>("K2", "DE"));
        data1.add(new Tuple2<String, String>("K1", "F"));
        data1.add(new Tuple2<String, String>("K3", "GHI"));
        //javaRDD1
        JavaRDD<Tuple2<String, String>> javaRDD1 = jssc.sparkContext().parallelize(data1);
        //second data list
        List<Tuple2<String, String>> data2 = new ArrayList<Tuple2<String, String>>();
        data2.add(new Tuple2<String, String>("K1", "123"));
        data2.add(new Tuple2<String, String>("K2", "256"));
        data2.add(new Tuple2<String, String>("K7", "0"));
        //javaRDD2
        JavaRDD<Tuple2<String, String>> javaRDD2 = jssc.sparkContext().parallelize(data2);
        //Queue
        Queue<JavaRDD<Tuple2<String, String>>> inputQueue = new LinkedList<JavaRDD<Tuple2<String, String>>>();
        inputQueue.add(javaRDD1);
        inputQueue.add(javaRDD2);
        //stream
        JavaInputDStream<Tuple2<String, String>> javaDStream = jssc.queueStream(inputQueue, true);
        JavaPairDStream<String,String> javaPairDStream = javaDStream.mapToPair(tuple ->
                new Tuple2(tuple._1().toLowerCase(),tuple._2()));
        //print
        javaPairDStream.print();
        //start
        jssc.start();
        jssc.awaitTermination();
    }
}

JavaPairDStream类是JavaPairInputDStream类的超类。希望这会有所帮助。

  - name: test1
    type: type
    path: path