Find the minimum of a timestamp through a Spark groupBy DataFrame

Time: 2016-04-05 13:05:01

Tags: sql scala apache-spark apache-spark-sql

When I group a DataFrame on a column and then try to find the minimum for each group with groupbyDatafram.min('timestampCol'), it complains that this cannot be performed on a non-numeric column. So how can I properly filter for the minimum (earliest) date in the groupBy?

I am streaming the DataFrame from a PostgreSQL S3 instance, so the data is already configured.
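
For reference, a minimal reproduction of the failure; the column names are illustrative and the exception text is approximate:

import org.apache.spark.sql.SQLContext

val sqlContext: SQLContext = ???  // your existing SQLContext
import sqlContext.implicits._

val df = Seq((1L, "2016-04-05 15:10:00"))
  .toDF("id", "ts")
  .withColumn("ts", $"ts".cast("timestamp"))

// GroupedData.min only accepts numeric columns, so this fails
df.groupBy($"id").min("ts")
// throws an AnalysisException along the lines of:
// "ts" is not a numeric column. Aggregation function can only be applied on a numeric column.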

1 Answer:

Answer 0 (score: 11):

Just perform the aggregation directly, instead of using the min helper:

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions.min

val sqlContext: SQLContext = ???  // your existing SQLContext
import sqlContext.implicits._

// Example data: a string column cast to a proper timestamp
val df = Seq((1L, "2016-04-05 15:10:00"), (1L, "2014-01-01 15:10:00"))
  .toDF("id", "ts")
  .withColumn("ts", $"ts".cast("timestamp"))

// Aggregate with the min function rather than GroupedData.min
df.groupBy($"id").agg(min($"ts")).show

// +---+--------------------+
// | id|             min(ts)|
// +---+--------------------+
// |  1|2014-01-01 15:10:...|
// +---+--------------------+

Unlike GroupedData.min, which is restricted to numeric columns, the min function works on any orderable type, including timestamps.
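
Since the question is also tagged sql, the same aggregation can be expressed as a plain SQL query. A minimal sketch reusing the df from above; df_table is just an illustrative name, and registerTempTable applies to Spark 1.x (in Spark 2.x+ the equivalent is createOrReplaceTempView):

df.registerTempTable("df_table")
sqlContext.sql("SELECT id, min(ts) AS min_ts FROM df_table GROUP BY id").show

And if "filter" means keeping the original rows that carry each group's earliest timestamp, rather than just computing it, one option is to join the aggregated minima back onto the DataFrame. Again a sketch based on the df above; min_id and min_ts are illustrative names:

// Per-group minimum timestamps, with the key renamed to avoid
// ambiguity when joining back onto the source DataFrame
val earliest = df.groupBy($"id").agg(min($"ts").alias("min_ts"))
  .withColumnRenamed("id", "min_id")

// Keep only the rows whose timestamp equals their group's minimum
val filtered = df
  .join(earliest, $"id" === $"min_id" && $"ts" === $"min_ts")
  .select($"id", $"ts")

filtered.show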