I have a use case where I need to fetch all records from Cassandra for a given time range, split them into 30 chunks, and then aggregate each chunk. For example, suppose I fetch 60 records over a 30-minute range: I need to split them into 30 chunks of 2 records per minute. If I fetch 600 records over a 1-hour range, the 30 chunks are 20 records per 2 minutes. If I fetch 600 records over a 1-week range, the 30 chunks are 20 records per 5.6 hours, and so on.
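To make the chunking concrete, here is a minimal sketch of the bucket arithmetic (plain Java; the names and values are illustrative, not from my actual code):

public class BucketSketch {
    public static void main(String[] args) {
        long startMillis = 0L;                               // start of the query window
        long endMillis = 60L * 60L * 1000L;                  // e.g. a 1-hour window
        long interval = (endMillis - startMillis) / 30;      // chunk width: 2 minutes here

        long recordTime = 5L * 60L * 1000L;                  // a record 5 minutes into the window
        long bucket = (recordTime - startMillis) / interval; // integer division -> 0..29
        System.out.println("bucket index = " + bucket);      // prints 2
    }
}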
To achieve this, I wrote Java code that produces the result in about 3 seconds for 100k records. I expected that implementing the same logic as a Cassandra UDF would give a performance benefit, but the UDF takes 6-7 seconds (twice the time of the Java code), which surprised me. Can someone please point out where I am going off track? My table structure, the Java code, and the UDF code are below.
Cassandra table schema
CREATE TABLE transactions_data (
app_name text,
api_name text,
app_id text,
start_time timestamp,
duration int,
end_time timestamp,
node_id text,
request_body text,
request_parameter_name1 text,
request_parameter_name2 text,
request_parameter_name3 text,
request_parameter_name4 text,
request_parameter_name5 text,
request_parameter_value1 text,
request_parameter_value2 text,
request_parameter_value3 text,
request_parameter_value4 text,
request_parameter_value5 text,
response_body text,
response_parameter_name1 text,
response_parameter_name2 text,
response_parameter_name3 text,
response_parameter_name4 text,
response_parameter_name5 text,
response_parameter_value1 text,
response_parameter_value2 text,
response_parameter_value3 text,
response_parameter_value4 text,
response_parameter_value5 text,
responsestatus text,
responsestatuscode text,
transaction_id text,
PRIMARY KEY ((app_name, api_name, app_id), start_time)
);
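Since the partition key is (app_name, api_name, app_id) and start_time is the clustering column, the fetch has to fix all three key columns and can then slice on start_time. Just for illustration, the same range query can also be expressed with bind markers instead of the string concatenation used in my code below (a fragment assuming the DataStax Java driver 3.x, meant to drop into the class that follows; session, sTime and eTime are the same objects used there):

PreparedStatement ps = session.prepare(
        "select start_time, duration from transactions_data"
        + " where app_name = ? and api_name = ? and app_id = ?"
        + " and start_time >= ? and start_time <= ?");
ResultSet rs = session.execute(ps.bind(
        "app_name-abc", "api_name-1", "app_id-xyz",
        sTime.toDate(), eTime.toDate()));   // sTime/eTime: Joda DateTime bounds of the window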
Java code
import java.io.IOException;
import java.util.HashMap;

import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.datastax.driver.core.Cluster;
import com.datastax.driver.core.ResultSet;
import com.datastax.driver.core.Session;
import com.datastax.driver.core.SocketOptions;

public class SamplingDataJava {

    private static Logger logger = LoggerFactory.getLogger(SamplingDataJava.class);

    private static String startTime = "2017-03-21 00:00:00.000";
    private static String endTime = "2017-04-25 00:00:00.000";

    private final String SELECT_STATEMENT = "select start_time,duration from transactions_data "
            + " where app_name='app_name-abc' and api_name='api_name-1' "
            + " and app_id='app_id-xyz' "
            + " AND start_time>='" + startTime + "' AND start_time<='" + endTime + "' ";

    private Cluster cluster;
    private Session session;
    private String host = "localhost";

    public SamplingDataJava() throws IOException {
        logger.info("Using CQL3 Writer");
        cluster = Cluster.builder().addContactPoints(host)
                .withSocketOptions(new SocketOptions().setConnectTimeoutMillis(2000000)).build();
        session = cluster.connect();
    }

    // Per-bucket aggregate: summed duration and number of records.
    private class Result {
        double duration;
        int count;

        Result(double duration, int count) {
            this.duration = duration;
            this.count = count;
        }

        @Override
        public String toString() {
            return "Result [duration=" + duration + ", count=" + count + "]";
        }
    }

    // Buckets every row into (start_time - initTime) / interval and aggregates per bucket.
    public void hashSampling(long interval, long initTime) throws IOException {
        HashMap<Long, Result> agg = new HashMap<>();
        ResultSet rs = session.execute(SELECT_STATEMENT);
        int i = 0;
        for (com.datastax.driver.core.Row row : rs) {
            i++;
            Long hashcode = Math.abs((row.getTimestamp("start_time").getTime() - initTime) / interval);
            Result hasResult = agg.get(hashcode);
            if (hasResult == null) {
                hasResult = new Result(row.getInt("duration"), 1);
            } else {
                hasResult.duration = hasResult.duration + row.getInt("duration");
                hasResult.count++;
            }
            agg.put(hashcode, hasResult);
        }
        System.out.println("total number of records " + i);
        Long code = 0L;
        while (code < 30) {
            System.out.println(" code " + agg.get(code));
            code++;
        }
    }

    public void close() {
        // Close the session before shutting down the cluster.
        session.close();
        cluster.close();
    }

    public static void main(String[] args) throws IOException {
        long beginTime = System.currentTimeMillis();
        SamplingDataJava cqp = new SamplingDataJava();
        long onlyQueryTime = System.currentTimeMillis();

        DateTimeFormatter readPattern = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS");
        DateTime sTime = readPattern.parseDateTime(startTime);
        DateTime eTime = readPattern.parseDateTime(endTime);
        long interval = (eTime.getMillis() - sTime.getMillis()) / 30;
        System.out.println("start end time :" + eTime.getMillis() + " " + sTime.getMillis());

        cqp.hashSampling(interval, sTime.getMillis());
        System.out.println("total time without open close " + (System.currentTimeMillis() - onlyQueryTime));

        cqp.close();
        System.out.println("total time " + (System.currentTimeMillis() - beginTime));
    }
}
UDF code
CREATE OR REPLACE FUNCTION txn_group_count_and_sum(
    txn map<bigint, frozen<tuple<int,int>>>,
    start_time bigint,
    duration int,
    sample_size bigint,
    begin_time bigint )
RETURNS NULL ON NULL INPUT
RETURNS map<bigint, frozen<tuple<int,int>>>
LANGUAGE java AS '
    // bucket index for this row
    Long hashcode = (start_time - begin_time) / sample_size;
    TupleValue tupleValue = txn.get(hashcode);
    if (tupleValue == null) {
        // tuple layout: (count, summed duration)
        com.datastax.driver.core.TupleType tupleType = com.datastax.driver.core.TupleType.of(
            com.datastax.driver.core.ProtocolVersion.NEWEST_SUPPORTED,
            com.datastax.driver.core.CodecRegistry.DEFAULT_INSTANCE,
            com.datastax.driver.core.DataType.cint(),
            com.datastax.driver.core.DataType.cint());
        tupleValue = tupleType.newValue(1, duration);
    } else {
        tupleValue.setInt(0, tupleValue.getInt(0) + 1);
        tupleValue.setInt(1, tupleValue.getInt(1) + duration);
    }
    txn.put(hashcode, tupleValue);
    return txn;
';
CREATE OR REPLACE AGGREGATE group_count_and_sum(bigint, int, bigint, bigint)
SFUNC txn_group_count_and_sum
STYPE map<bigint, frozen<tuple<int,int>>>
INITCOND {};
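The aggregate returns one row holding the final state map, keyed by bucket index, with a (count, summed duration) tuple per bucket. A minimal sketch of reading it back with the DataStax Java driver (assuming the CQL below is held in aggregateQuery; the names are illustrative):

Row row = session.execute(aggregateQuery).one();
Map<Long, TupleValue> buckets = row.getMap(0, Long.class, TupleValue.class);
for (Map.Entry<Long, TupleValue> e : buckets.entrySet()) {
    int count = e.getValue().getInt(0);   // records in this chunk
    int sum = e.getValue().getInt(1);     // summed duration in this chunk
    System.out.println("bucket " + e.getKey() + " count=" + count + " duration=" + sum);
}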
Query
select group_count_and_sum(toUnixTimestamp(start_time),duration,100800000,1490054400000) from transactions_data
where app_name='app_name-abc' and api_name='api_name-1'
and app_id='app_id-xyz'
AND start_time>='2017-03-21 00:00:00.000' AND start_time<='2017-04-25 00:00:00.000';
Note:
100800000 = (end_time - start_time)/30
1490054400000 = millisecond of 2017-03-21 00:00:00.000
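Both literals can be derived the same way main() does it above; a minimal sketch using Joda-Time (assumes the JVM time zone is UTC, otherwise the epoch value shifts):

import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

public class QueryConstants {
    public static void main(String[] args) {
        DateTimeFormatter fmt = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS");
        long begin = fmt.parseDateTime("2017-03-21 00:00:00.000").getMillis(); // 1490054400000 in UTC
        long end = fmt.parseDateTime("2017-04-25 00:00:00.000").getMillis();
        System.out.println("begin_time  = " + begin);
        System.out.println("sample_size = " + (end - begin) / 30);             // 100800000 (35 days / 30)
    }
}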