同时在同一个SparkContext上工作的多个线程会产生意外结果

时间:2016-02-03 08:15:05

标签: java multithreading apache-spark spark-streaming

我开发了一个带有Kafka流媒体和放大器的火花应用程序。 sql基于此Spark example

有3个线程针对相同的SqlContext运行不同的sql:

public final class TestSpark implements Serializable{
    private static final Pattern SPACE = Pattern.compile(" ");
    private static final Logger logger = LoggerFactory.getLogger("TestSpark");
    private static ThreadFactory threadFactory = Executors.defaultThreadFactory();
    private static ExecutorService executorServices = new ThreadPoolExecutor(4, 8, 120, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(10000), threadFactory, new RejectedExecutionHandlerImpl());


  public TestSpark()
  {

  }

  public void run(String brokers, String topics, String master)
  {
    List<String> queryStrings = new ArrayList<String>();
    queryStrings.add("select id, type, key, side, shares, price, time, rank, flag, flag2, comment, source, status, price1, price2 from EventData where key in ('6267359')");
    queryStrings.add("select id, type, key, side, shares, price, time, rank, flag, flag2, comment, source, status, price1, price2 from EventData where key in ('6558484')");
    queryStrings.add("select id, type, key, side, shares, price, time, rank, flag, flag2, comment, source, status, price1, price2 from EventData where key in ('BP3R8Z3')");

    // Create context with 1 second batch interval
    SparkConf sparkConf = new SparkConf().setAppName("TestSpark").setMaster("local[*]");
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

    HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(topics.split(",")));
    HashMap<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("metadata.broker.list", brokers);

    // Create direct kafka stream with brokers and topics
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
        jssc,
        String.class,
        String.class,
        StringDecoder.class,
        StringDecoder.class,
        kafkaParams,
        topicsSet
    );

    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> tuple2) {
          return tuple2._2(); // We do not care about the key here
        }
      });

    lines.foreachRDD(new Function2<JavaRDD<String>, Time, Void>(){
        @Override
        public Void call(JavaRDD<String> rdd, Time time) throws Exception {
            if (!rdd.isEmpty())
            {
                SQLContext sqlContext = JavaSQLContextSingleton.getInstance(rdd.context()); 
                for (int i = 0; i < queryStrings.size(); i++)
                {
                    executorServices.execute(new ActionThread(sqlContext, rdd, queryStrings.get(i)));
                }               
            }
            return null;
        }

    });

    // Start the computation
    jssc.start();
    jssc.awaitTermination(); 
  }

  public static void main(String[] args) {
        if (args.length < 3) {
            System.err.println("Usage: DirectKafkaWordCount <brokers> <topics> <master>\n" +
                "  <brokers> is a list of one or more Kafka brokers\n" +
                "  <topics> is a list of one or more kafka topics to consume from\n" +
                "  <master> is spark cluster\n\n");
            System.exit(1);
          }


        TestSpark testSpark = new TestSpark();
        testSpark.run(args[0], args[1], args[2]);
  }
}

/** Lazily instantiated singleton instance of SQLContext */
class JavaSQLContextSingleton {
    static private transient SQLContext instance = null;
    static public SQLContext getInstance(SparkContext sparkContext) {
    if (instance == null) {
      instance = new SQLContext(sparkContext);
    }
    return instance;
  }
}

class ActionThread implements Serializable, Runnable{
    /**
     * 
     */
    private static final long serialVersionUID = 1L;
    private static final Logger logger = LoggerFactory.getLogger("ActionThread");
    SQLContext sqlContext;
    JavaRDD<String> rdd;
    String query;
    ActionThread(SQLContext sqlContext, JavaRDD<String> rdd, String query)
    {
        this.sqlContext = sqlContext;
        this.rdd = rdd;
        this.query = query;
    }
    @Override
    public void run() {
        sqlContext.read().json(rdd).registerTempTable("EventData");
        DataFrame filteredEvent = sqlContext.sql(query);
        List<String> eventList = filteredEvent.toJavaRDD().map(new Function<Row, String>() {
            @Override
            public String call(Row row) 
            { 
                return "ThreadID:" + Thread.currentThread().getId() + " " + row.toString();
            }
          }).collect();
        for (String event: eventList) {
            System.out.println(event);
        }
        filteredEvent.count();
    }
}

class RejectedExecutionHandlerImpl implements RejectedExecutionHandler {

    @Override
    public void rejectedExecution(Runnable r, ThreadPoolExecutor executor) {
        System.out.println(r.toString() + " is rejected");
    }
}

然而,它给了我意想不到的结果 - 我希望Row中的第一个数字是唯一的,但有重复!通过倾听同一主题,我确信它们是独一无二的。

ThreadID:109 [1,ORDER,6558484,SELL,1000000.0,7.26,2016-01-29T16:36:44.232+03:00,HIGH,Y,_NA_,LMT: Order in Hand,BOAML_AP,null,7.18,7.19]
ThreadID:107 [0,ORDER,6267359,SELL,1000000.0,7.26,2016-01-29T16:36:44.232+03:00,HIGH,Y,_NA_,LMT: Order in Hand,BOAML_AP,null,7.18,7.19]
ThreadID:75 [0,ORDER,6267359,SELL,1000000.0,7.26,2016-01-29T16:36:44.232+03:00,HIGH,Y,_NA_,LMT: Order in Hand,BOAML_AP,null,7.18,7.19]
ThreadID:75 [3,ORDER,6267359,SELL,1000000.0,7.26,2016-01-29T16:36:44.232+03:00,HIGH,Y,_NA_,LMT: Order in Hand,BOAML_AP,null,7.18,7.19]
ThreadID:107 [2,ORDER,BP3R8Z3,SELL,1000000.0,7.26,2016-01-29T16:36:44.232+03:00,HIGH,Y,_NA_,LMT: Order in Hand,BOAML_AP,null,7.18,7.19]
ThreadID:75 [3,ORDER,6267359,SELL,1000000.0,7.26,2016-01-29T16:36:44.232+03:00,HIGH,Y,_NA_,LMT: Order in Hand,BOAML_AP,null,7.18,7.19]

有没有人知道造成这种情况的原因或我应该这样做?

如果我在'if(!rdd.isEmpty())'下面添加一条印刷线,我得到以下内容:

Rdd:MapPartitionsRDD[7] at map at TestSpark.java:111
Rdd:MapPartitionsRDD[9] at map at TestSpark.java:111
Rdd:MapPartitionsRDD[11] at map at TestSpark.java:111
Rdd:MapPartitionsRDD[21] at map at TestSpark.java:111
Rdd:MapPartitionsRDD[23] at map at TestSpark.java:111
Rdd:MapPartitionsRDD[25] at map at TestSpark.java:111
Rdd:MapPartitionsRDD[27] at map at TestSpark.java:111
Rdd:MapPartitionsRDD[29] at map at TestSpark.java:111
Rdd:MapPartitionsRDD[31] at map at TestSpark.java:111
Rdd:MapPartitionsRDD[33] at map at TestSpark.java:111
Rdd:MapPartitionsRDD[35] at map at TestSpark.java:111
ThreadID:75 [0,ORDER,6267359,SELL,1000000.0,7.26,2016-01-29T16:36:44.232+03:00,HIGH,Y,_NA_,LMT: Order in Hand,BOAML_AP,null,7.18,7.19]
ThreadID:92 [0,ORDER,6267359,SELL,1000000.0,7.26,2016-01-29T16:36:44.232+03:00,HIGH,Y,_NA_,LMT: Order in Hand,BOAML_AP,null,7.18,7.19]
Rdd:MapPartitionsRDD[37] at map at TestSpark.java:111
Rdd:MapPartitionsRDD[59] at map at TestSpark.java:111
Rdd:MapPartitionsRDD[61] at map at TestSpark.java:111
Rdd:MapPartitionsRDD[63] at map at TestSpark.java:111
Rdd:MapPartitionsRDD[97] at map at TestSpark.java:111
Rdd:MapPartitionsRDD[99] at map at TestSpark.java:111
ThreadID:75 [1,ORDER,6558484,SELL,1000000.0,7.26,2016-01-29T16:36:44.232+03:00,HIGH,Y,_NA_,LMT: Order in Hand,BOAML_AP,null,7.18,7.19]
Rdd:MapPartitionsRDD[124] at map at TestSpark.java:111
Rdd:MapPartitionsRDD[163] at map at TestSpark.java:111
Rdd:MapPartitionsRDD[187] at map at TestSpark.java:111
ThreadID:93 [3,ORDER,6267359,SELL,1000000.0,7.26,2016-01-29T16:36:44.232+03:00,HIGH,Y,_NA_,LMT: Order in Hand,BOAML_AP,null,7.18,7.19]
Rdd:MapPartitionsRDD[220] at map at TestSpark.java:111

用System.out.println替换println(“Rdd.collect:”+ rdd.collect());我得到了以下所以看起来在线程之前事情都很好。也许我应该按顺序执行它,让集群并行执行过滤器。

    Rdd.collect:[{"id":0,"type":"ORDER","key":"6267359","key2":"BP3R8Z3","key3":null,"side":"SELL","shares":1000000.0,"price":7.26,"time":"2016-01-29T16:36:44.232+03:00","rank":"HIGH","flag":"Y","flag2":"_NA_","comment":"LMT: Order in Hand","others":"","others2":"","source":"BOAML_AP","status":null,"price1":7.18,"price2":7.19}, {"id":1,"type":"ORDER","key":"6558484","key2":"BP3R8Z3","key3":null,"side":"SELL","shares":1000000.0,"price":7.26,"time":"2016-01-29T16:36:44.232+03:00","rank":"HIGH","flag":"Y","flag2":"_NA_","comment":"LMT: Order in Hand","others":"","others2":"","source":"BOAML_AP","status":null,"price1":7.18,"price2":7.19}]
    Rdd.collect:[{"id":2,"type":"ORDER","key":"BP3R8Z3","key2":"BP3R8Z3","key3":null,"side":"SELL","shares":1000000.0,"price":7.26,"time":"2016-01-29T16:36:44.232+03:00","rank":"HIGH","flag":"Y","flag2":"_NA_","comment":"LMT: Order in Hand","others":"","others2":"","source":"BOAML_AP","status":null,"price1":7.18,"price2":7.19}]
    Rdd.collect:[{"id":3,"type":"ORDER","key":"6267359","key2":"BP3R8Z3","key3":null,"side":"SELL","shares":1000000.0,"price":7.26,"time":"2016-01-29T16:36:44.232+03:00","rank":"HIGH","flag":"Y","flag2":"_NA_","comment":"LMT: Order in Hand","others":"","others2":"","source":"BOAML_AP","status":null,"price1":7.18,"price2":7.19}]
    Rdd.collect:[{"id":4,"type":"ORDER","key":"6558484","key2":"BP3R8Z3","key3":null,"side":"SELL","shares":1000000.0,"price":7.26,"time":"2016-01-29T16:36:44.232+03:00","rank":"HIGH","flag":"Y","flag2":"_NA_","comment":"LMT: Order in Hand","others":"","others2":"","source":"BOAML_AP","status":null,"price1":7.18,"price2":7.19}]
    Rdd.collect:[{"id":5,"type":"ORDER","key":"BP3R8Z3","key2":"BP3R8Z3","key3":null,"side":"SELL","shares":1000000.0,"price":7.26,"time":"2016-01-29T16:36:44.232+03:00","rank":"HIGH","flag":"Y","flag2":"_NA_","comment":"LMT: Order in Hand","others":"","others2":"","source":"BOAML_AP","status":null,"price1":7.18,""price2":7.19}]
    Rdd.collect:[{"id":6,"type":"ORDER","key":"6267359","key2":"BP3R8Z3","key3":null,"side":"SELL","shares":1000000.0,"price":7.26,"time":"2016-01-29T16:36:44.232+03:00","rank":"HIGH","flag":"Y","flag2":"_NA_","comment":"LMT: Order in Hand","others":"","others2":"","source":"BOAML_AP","status":null,"price1":7.18,"price2":7.19}]
    Rdd.collect:[{"id":7,"type":"ORDER","key":"6558484","key2":"BP3R8Z3","key3":null,"side":"SELL","shares":1000000.0,"price":7.26,"time":"2016-01-29T16:36:44.232+03:00","rank":"HIGH","flag":"Y","flag2":"_NA_","comment":"LMT: Order in Hand","others":"","others2":"","source":"BOAML_AP","status":null,"price1":7.18,"price2":7.19}]
    Rdd.collect:[{"id":8,"type":"ORDER","key":"BP3R8Z3","key2":"BP3R8Z3","key3":null,"side":"SELL","shares":1000000.0,"price":7.26,"time":"2016-01-29T16:36:44.232+03:00","rank":"HIGH","flag":"Y","flag2":"_NA_","comment":"LMT: Order in Hand","others":"","others2":"","source":"BOAML_AP","status":null,"price1":7.18,"price2":7.19}]
    Rdd.collect:[{"id":9,"type":"ORDER","key":"6267359","key2":"BP3R8Z3","key3":null,"side":"SELL","shares":1000000.0,"price":7.26,"time":"2016-01-29T16:36:44.232+03:00","rank":"HIGH","flag":"Y","flag2":"_NA_","comment":"LMT: Order in Hand","others":"","others2":"","source":"BOAML_AP","status":null,"price1":7.18,""price2":7.19}]
    Rdd.collect:[{"id":10,"type":"ORDER","key":"6558484","key2":"BP3R8Z3","key3":null,"side":"SELL","shares":1000000.0,"price":7.26,"time":"2016-01-29T16:36:44.232+03:00","rank":"HIGH","flag":"Y","flag2":"_NA_","comment":"LMT: Order in Hand","others":"","others2":"","source":"BOAML_AP","status":null,"price1":7.18,"price2":7.19}]
    ThreadID:111 [1,ORDER,6558484,SELL,1000000.0,7.26,2016-01-29T16:36:44.232+03:00,HIGH,Y,_NA_,LMT: Order in Hand,BOAML_AP,null,7.18,7.19]
    ThreadID:75 [0,ORDER,6267359,SELL,1000000.0,7.26,2016-01-29T16:36:44.232+03:00,HIGH,Y,_NA_,LMT: Order in Hand,BOAML_AP,null,7.18,7.19]
    ThreadID:112 [0,ORDER,6267359,SELL,1000000.0,7.26,2016-01-29T16:36:44.232+03:00,HIGH,Y,_NA_,LMT: Order in Hand,BOAML_AP,null,7.18,7.19]
    Rdd.collect:[{"id":11,"type":"ORDER","key":"BP3R8Z3","key2":"BP3R8Z3","key3":null,"side":"SELL","shares":1000000.0,"price":7.26,"time":"2016-01-29T16:36:44.232+03:00","rank":"HIGH","flag":"Y","flag2":"_NA_","comment":"LMT: Order in Hand","others":"","others2":"","source":"BOAML_AP","status":null,"price1":7.18,"price2":7.19}}]
    Rdd.collect:[{"id":12,"type":"ORDER","key":"6267359","key2":"BP3R8Z3","key3":null,"side":"SELL","shares":1000000.0,"price":7.26,"time":"2016-01-29T16:36:44.232+03:00","rank":"HIGH","flag":"Y","flag2":"_NA_","comment":"LMT: Order in Hand","others":"","others2":"","source":"BOAML_AP","status":null,"price1":7.18,"price2":7.19}}]
    Rdd.collect:[{"id":13,"type":"ORDER","key":"6558484","key2":"BP3R8Z3","key3":null,"side":"SELL","shares":1000000.0,"price":7.26,"time":"2016-01-29T16:36:44.232+03:00","rank":"HIGH","flag":"Y","flag2":"_NA_","comment":"LMT: Order in Hand","others":"","others2":"","source":"BOAML_AP","status":null,"price1":7.18,"price2":7.19}}]
    Rdd.collect:[{"id":14,"type":"ORDER","key":"BP3R8Z3","key2":"BP3R8Z3","key3":null,"side":"SELL","shares":1000000.0,"price":7.26,"time":"2016-01-29T16:36:44.232+03:00","rank":"HIGH","flag":"Y","flag2":"_NA_","comment":"LMT: Order in Hand","others":"","others2":"","source":"BOAML_AP","status":null,"price1":7.18,"price2":7.19}}]
    ThreadID:111 [3,ORDER,6267359,SELL,1000000.0,7.26,2016-01-29T16:36:44.232+03:00,HIGH,Y,_NA_,LMT: Order in Hand,BOAML_AP,null,7.18,7.19]
    ThreadID:112 [2,ORDER,BP3R8Z3,SELL,1000000.0,7.26,2016-01-29T16:36:44.232+03:00,HIGH,Y,_NA_,LMT: Order in Hand,BOAML_AP,null,7.18,7.19]
    Rdd.collect:[{"id":15,"type":"ORDER","key":"6267359","key2":"BP3R8Z3","key3":null,"side":"SELL","shares":1000000.0,"price":7.26,"time":"2016-01-29T16:36:44.232+03:00","rank":"HIGH","flag":"Y","flag2":"_NA_","comment":"LMT: Order in Hand","others":"","others2":"","source":"BOAML_AP","status":null,"price1":7.18,"price2":7.19}}]

0 个答案:

没有答案