我正在研究一种逻辑。这是我的输入格式。
rootOrgID : subOrgID : tagShortID : Timestamp : ListenerShortID : Highest_Weight
3:2::2:1496745906:362:0.00
3:2:4:1496745907:1901:0.00
3:3:4:1496745907:362:0.00
3:4:4:1496745907:362:0.00
3:3:4:1496745908:362:0.00
3:4:4:1496745908:3878:0.00
3:2:4:1496745909:3878:0.90
3:4:4:1496745909:362:0.60
3:2:2:1496745910:3878:0.90
这是我预期的输出格式
rootOrgID : subOrgID : tagShortID : ToTimestamp : ToListenerShortID : FromTimestamp : FromListenerShortID
3:2:4:1496745907:1901:1496745906:362
3:2:2:1496745909:3878:1496745907:1901
3:2:2:1496745913:718:1496745909:3878
但实际上我得到了像这样的输出
|root_org_id|suborg_id|Tag_short_ID|to_timestamp|to_listner|from_timestamp|from_listner| bof|
| 4| 3| 3| 1496745907| 3878| 0| 0|null|
| 4| 3| 3| 1496745907| 3878| 1| 0|null|
| 4| 3| 3| 1496745907| 718|
| 4| 3| 3| 1496745908| 362| 1| 0|null|
| 4| 3| 3| 1496745912| 718| 0| 0|null|
我将from_timestamp和from_listner视为0。
这是我的代码。
override def evaluate(buffer: Row): Any = {
val in_array = buffer.getAs[WrappedArray[String]](0);
var out_array = Array[String]();
if (in_array.length > 0) {
for (iter <- 0 until in_array.length) {
val data: Array[String] = in_array(iter).split(";");
var to_timestamp = data(0).toInt;
var to_listner = data(2).toInt
var root_org_id = data(3).toInt
var to_suborg = data(4).toInt
var from_suborg = 0
var from_listner = 0
var from_timestamp = 0;
val tagShortID = data(1).toInt
if ((to_timestamp - from_timestamp) > 10L) {
out_array = out_array :+ root_org_id + ";" + to_suborg + ";" + tagShortID + ";" + to_timestamp + ";" + to_listner + ";" + from_timestamp + ";" + from_listner
out_array = out_array :+ root_org_id + ";" + to_suborg + ";" + tagShortID + ";" + to_timestamp + ";" + to_listner + ";" + (from_timestamp + 1) + ";0";
from_listner = to_listner;
from_timestamp = to_timestamp;
from_suborg = to_suborg;
} else {
//Logging only when there is listener change for a tag
if (!to_listner.equals(from_listner)) {
if (to_suborg.equals(from_suborg)) {
out_array = out_array :+ root_org_id + ";" + to_suborg + ";" + tagShortID + ";" + to_timestamp + ";" + to_listner + ";" + from_timestamp + ";" + from_listner;
} else {
out_array = out_array :+ root_org_id + ";" + to_suborg + ";" + tagShortID + ";" + to_timestamp + ";0;" + from_timestamp + ";" + from_listner;
//Ro:Tag:Totime:ToListener:FromTime:FromListener
out_array = out_array :+ root_org_id + ";" + to_suborg + ";" + tagShortID + ";" + to_timestamp + ";" + to_listner + ";0;0";
}
from_listner = to_listner;
from_timestamp = to_timestamp;
from_suborg = to_suborg;
}
}
out_array = out_array :+ root_org_id + ";" + to_suborg + ";" + tagShortID + ";" + to_timestamp + ";0;" + from_timestamp + ";" + from_listner
}
}
out_array
}
我这样称呼这个UDF。
sqlContext.udf.register("cust_agg", new Tdata)
val cust_agg = new Tdata;
averageDF.printSchema()
val processedTagData = averageDF.select($"rootOrgID", $"subOrgID", $"tagShortID", $"Timestamp", $"ListenerShortID", $"RSSI_Weight_avg").distinct()
.orderBy($"Timestamp".asc)
.groupBy($"tagShortID")
.agg(cust_agg($"Timestamp", $"tagShortID", $"ListenerShortID", $"rootOrgID", $"subOrgID", $"RSSI_Weight_avg").as("outcolumn"))
.select(explode($"outcolumn").as("col"))
.select(expr("(split(col, ';'))[0]").cast("string").as("root_org_id"), expr("(split(col, ';'))[1]").cast("string")
.as("suborg_id"), expr("(split(col, ';'))[2]").cast("string").as("Tag_short_ID"), expr("(split(col, ';'))[3]")
.cast("integer").as("to_timestamp"), expr("(split(col, ';'))[4]").cast("string").as("to_listner"), expr("(split(col, ';'))[5]").cast("integer").as("from_timestamp"), expr("(split(col, ';'))[6]").cast("string").as("from_listner"),
expr("(split(col, ';'))[7]").cast("string").as("bof"))
.filter($"to_listner" =!= "0");
processedTagData.show(30)
如果我在from_timestamp和to_timestamp中没有10秒的差异,那么我需要在时间戳上加1并将lister设为0。 如果文件结束,那么接下来没有记录就意味着我们需要做同样的事情。 我不知道我在哪里做错了。任何帮助将不胜感激。