// Timestamp window for the filter stage, taken verbatim from the command line:
// args[0] is the lower bound and args[1] the upper bound.
final String lowBoundary = args[0];
final String highBoundary = args[1];

// Streaming context with a 2-second batch interval.
// NOTE(review): no app name or master is set on the conf here; presumably they are
// supplied by the launcher — confirm.
SparkConf conf = new SparkConf();
JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(2));

// NOTE(review): topicSet is still empty when it is passed to createDirectStream below —
// no topic name is ever added in the visible code. Add the topic name(s) here.
HashSet<String> topicSet = new HashSet<>();

// Kafka parameters; only the broker list is configured.
HashMap<String, String> kafkaMap = new HashMap<>();
kafkaMap.put("metadata.broker.list", "localhost:9092");

// Direct Kafka stream of (key, value) string pairs, both decoded with StringDecoder.
JavaPairInputDStream<String, String> stream = KafkaUtils.createDirectStream(
        ssc,
        String.class, String.class,
        StringDecoder.class, StringDecoder.class,
        kafkaMap, topicSet);
// Data is generated from TPC-H, so every row (i.e. every Kafka message) has the data's timestamp as its key
// and all the attributes, separated by |, as its value
JavaPairDStream<String, String[]> tuples = stream
.mapToPair(new PairFunction<Tuple2<String, String>, String, String[]>() {
public Tuple2<String, String[]> call(Tuple2<String, String> tuple) {
String[] split = tuple._2.split(Pattern.quote("|"));
return new Tuple2<String, String[]>(tuple._1, split);
// I'll omit the saving to HBase code as it's too long and useless
JavaDStream<Integer> filterAndCount = tuples.filter(new Function<Tuple2<String, String[]>, Boolean>() {
public Boolean call(Tuple2<String, String[]> tuple) {
if (Long.parseLong(tuple._2[0]) > Long.parseLong(lowBoundary)
&& Long.parseLong(tuple._2[0]) < Long.parseLong(highBoundary)) {
return true;
} else {
return false;
}).map(new Function<Tuple2<String, String[]>, Integer>() {
public Integer call(Tuple2<String, String[]> tuple) throws Exception {
return tuple._2[15].split(" ").length;
}).reduce(new Function2<Integer, Integer, Integer>() {
public Integer call(Integer a, Integer b) throws Exception {
return a + b;
// The lowest timestamp is passed to the batch job as its upper boundary and is
// also used as the key for the HBase table in which the result from this micro-batch is stored
JavaDStream<Long> lowestTimestamp = stream.map(new Function<Tuple2<String, String>, Long>() {
public Long call(Tuple2<String, String> tuple) throws Exception {
return Long.parseLong(tuple._1);
}).reduce(new Function2<Long, Long, Long>() {
public Long call(Long a, Long b) throws Exception {
if (a > b) {
return b;
} else {
return a;
// After calculation of the smallest timestamp I need to check if batch is over and if so start it with a scan limited to the
// minimum timestamp provided by this.