I have a use case where I need to fetch all records from Cassandra for a given time range, split them into 30 chunks, and then aggregate each chunk. For example, suppose I fetch 60 records over a 30-minute range: I need to split them into 30 chunks of 2 records per minute. If I fetch 600 records over a 1-hour range, the 30 chunks are 20 records per 2 minutes. If I fetch 600 records over a 1-week range, the 30 chunks are 20 records per 5.6 hours, and so on.
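To make the chunking concrete, here is a minimal sketch of the bucket arithmetic (plain Java; the names and values are illustrative, not from my actual code):

public class BucketSketch {
    public static void main(String[] args) {
        long startMillis = 0L;                               // start of the query window
        long endMillis = 60L * 60L * 1000L;                  // e.g. a 1-hour window
        long interval = (endMillis - startMillis) / 30;      // chunk width: 2 minutes here

        long recordTime = 5L * 60L * 1000L;                  // a record 5 minutes into the window
        long bucket = (recordTime - startMillis) / interval; // integer division -> 0..29
        System.out.println("bucket index = " + bucket);      // prints 2
    }
}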
To achieve this, I wrote Java code that produces the result in about 3 seconds for 100k records. I expected that implementing the same logic as a Cassandra UDF would give a performance benefit, but the UDF takes 6-7 seconds (twice the time of the Java code), which surprised me. Can someone please point out where I am going off track? My table structure, the Java code, and the UDF code are below.
Cassandra table schema
CREATE TABLE transactions_data (
app_name text,
api_name text,
app_id text,
start_time timestamp,
duration int,
end_time timestamp,
node_id text,
request_body text,
request_parameter_name1 text,
request_parameter_name2 text,
request_parameter_name3 text,
request_parameter_name4 text,
request_parameter_name5 text,
request_parameter_value1 text,
request_parameter_value2 text,
request_parameter_value3 text,
request_parameter_value4 text,
request_parameter_value5 text,
response_body text,
response_parameter_name1 text,
response_parameter_name2 text,
response_parameter_name3 text,
response_parameter_name4 text,
response_parameter_name5 text,
response_parameter_value1 text,
response_parameter_value2 text,
response_parameter_value3 text,
response_parameter_value4 text,
response_parameter_value5 text,
responsestatus text,
responsestatuscode text,
transaction_id text,
PRIMARY KEY ((app_name, api_name, app_id), start_time)
);
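Since the partition key is (app_name, api_name, app_id) and start_time is the clustering column, the fetch has to fix all three key columns and can then slice on start_time. Just for illustration, the same range query can also be expressed with bind markers instead of the string concatenation used in my code below (a fragment assuming the DataStax Java driver 3.x, meant to drop into the class that follows; session, sTime and eTime are the same objects used there):

PreparedStatement ps = session.prepare(
        "select start_time, duration from transactions_data"
        + " where app_name = ? and api_name = ? and app_id = ?"
        + " and start_time >= ? and start_time <= ?");
ResultSet rs = session.execute(ps.bind(
        "app_name-abc", "api_name-1", "app_id-xyz",
        sTime.toDate(), eTime.toDate()));   // sTime/eTime: Joda DateTime bounds of the window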
Java code
import java.io.IOException;
import java.util.HashMap;

import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.datastax.driver.core.Cluster;
import com.datastax.driver.core.ResultSet;
import com.datastax.driver.core.Session;
import com.datastax.driver.core.SocketOptions;

public class SamplingDataJava {

    private static Logger logger = LoggerFactory.getLogger(SamplingDataJava.class);

    private static String startTime = "2017-03-21 00:00:00.000";
    private static String endTime = "2017-04-25 00:00:00.000";

    private final String SELECT_STATEMENT = "select start_time,duration from transactions_data "
            + " where app_name='app_name-abc' and api_name='api_name-1' "
            + " and app_id='app_id-xyz' "
            + " AND start_time>='" + startTime + "' AND start_time<='" + endTime + "' ";

    private Cluster cluster;
    private Session session;
    private String host = "localhost";

    public SamplingDataJava() throws IOException {
        logger.info("Using CQL3 Writer");
        cluster = Cluster.builder().addContactPoints(host)
                .withSocketOptions(new SocketOptions().setConnectTimeoutMillis(2000000)).build();
        session = cluster.connect();
    }

    // Per-bucket aggregate: summed duration and number of records.
    private class Result {
        double duration;
        int count;

        Result(double duration, int count) {
            this.duration = duration;
            this.count = count;
        }

        @Override
        public String toString() {
            return "Result [duration=" + duration + ", count=" + count + "]";
        }
    }

    // Buckets every row into (start_time - initTime) / interval and aggregates per bucket.
    public void hashSampling(long interval, long initTime) throws IOException {
        HashMap<Long, Result> agg = new HashMap<>();
        ResultSet rs = session.execute(SELECT_STATEMENT);
        int i = 0;
        for (com.datastax.driver.core.Row row : rs) {
            i++;
            Long hashcode = Math.abs((row.getTimestamp("start_time").getTime() - initTime) / interval);
            Result hasResult = agg.get(hashcode);
            if (hasResult == null) {
                hasResult = new Result(row.getInt("duration"), 1);
            } else {
                hasResult.duration = hasResult.duration + row.getInt("duration");
                hasResult.count++;
            }
            agg.put(hashcode, hasResult);
        }
        System.out.println("total number of records " + i);
        Long code = 0L;
        while (code < 30) {
            System.out.println(" code " + agg.get(code));
            code++;
        }
    }

    public void close() {
        // Close the session before shutting down the cluster.
        session.close();
        cluster.close();
    }

    public static void main(String[] args) throws IOException {
        long beginTime = System.currentTimeMillis();
        SamplingDataJava cqp = new SamplingDataJava();
        long onlyQueryTime = System.currentTimeMillis();

        DateTimeFormatter readPattern = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS");
        DateTime sTime = readPattern.parseDateTime(startTime);
        DateTime eTime = readPattern.parseDateTime(endTime);
        long interval = (eTime.getMillis() - sTime.getMillis()) / 30;
        System.out.println("start end time :" + eTime.getMillis() + " " + sTime.getMillis());

        cqp.hashSampling(interval, sTime.getMillis());
        System.out.println("total time without open close " + (System.currentTimeMillis() - onlyQueryTime));

        cqp.close();
        System.out.println("total time " + (System.currentTimeMillis() - beginTime));
    }
}
UDF code
CREATE OR REPLACE FUNCTION txn_group_count_and_sum(
    txn map<bigint, frozen<tuple<int,int>>>,
    start_time bigint,
    duration int,
    sample_size bigint,
    begin_time bigint )
RETURNS NULL ON NULL INPUT
RETURNS map<bigint, frozen<tuple<int,int>>>
LANGUAGE java AS '
    // bucket index for this row
    Long hashcode = (start_time - begin_time) / sample_size;
    TupleValue tupleValue = txn.get(hashcode);
    if (tupleValue == null) {
        // tuple layout: (count, summed duration)
        com.datastax.driver.core.TupleType tupleType = com.datastax.driver.core.TupleType.of(
            com.datastax.driver.core.ProtocolVersion.NEWEST_SUPPORTED,
            com.datastax.driver.core.CodecRegistry.DEFAULT_INSTANCE,
            com.datastax.driver.core.DataType.cint(),
            com.datastax.driver.core.DataType.cint());
        tupleValue = tupleType.newValue(1, duration);
    } else {
        tupleValue.setInt(0, tupleValue.getInt(0) + 1);
        tupleValue.setInt(1, tupleValue.getInt(1) + duration);
    }
    txn.put(hashcode, tupleValue);
    return txn;
';
CREATE OR REPLACE AGGREGATE group_count_and_sum(bigint, int, bigint, bigint)
SFUNC txn_group_count_and_sum
STYPE map<bigint, frozen<tuple<int,int>>>
INITCOND {};
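The aggregate returns one row holding the final state map, keyed by bucket index, with a (count, summed duration) tuple per bucket. A minimal sketch of reading it back with the DataStax Java driver (assuming the CQL below is held in aggregateQuery; the names are illustrative):

Row row = session.execute(aggregateQuery).one();
Map<Long, TupleValue> buckets = row.getMap(0, Long.class, TupleValue.class);
for (Map.Entry<Long, TupleValue> e : buckets.entrySet()) {
    int count = e.getValue().getInt(0);   // records in this chunk
    int sum = e.getValue().getInt(1);     // summed duration in this chunk
    System.out.println("bucket " + e.getKey() + " count=" + count + " duration=" + sum);
}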
Query
select group_count_and_sum(toUnixTimestamp(start_time),duration,100800000,1490054400000) from transactions_data
where app_name='app_name-abc' and api_name='api_name-1'
and app_id='app_id-xyz'
AND start_time>='2017-03-21 00:00:00.000' AND start_time<='2017-04-25 00:00:00.000';
Note:
100800000 = (end_time - start_time)/30
1490054400000 = millisecond of 2017-03-21 00:00:00.000
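Both literals can be derived the same way main() does it above; a minimal sketch using Joda-Time (assumes the JVM time zone is UTC, otherwise the epoch value shifts):

import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

public class QueryConstants {
    public static void main(String[] args) {
        DateTimeFormatter fmt = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS");
        long begin = fmt.parseDateTime("2017-03-21 00:00:00.000").getMillis(); // 1490054400000 in UTC
        long end = fmt.parseDateTime("2017-04-25 00:00:00.000").getMillis();
        System.out.println("begin_time  = " + begin);
        System.out.println("sample_size = " + (end - begin) / 30);             // 100800000 (35 days / 30)
    }
}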