当我按某一列对数据框进行分组,并尝试用 groupbyDatafram.min('timestampCol') 查找每个分组的最小值时,
似乎无法对非数字列执行此操作。那么,我该如何在 groupby 上正确筛选出最小(最早)的日期?
我正在从postgresql S3实例传输数据帧,因此已经配置了数据。
答案 0 :(得分:11)
只需直接执行聚合,而不是使用 min
帮助程序:
var config = require('./config'),
cluster = require('./components/cluster'),
http = require('http'),
...
...
https = require('https');
// Callback handed to cluster.start: brings up whichever listeners
// (plain HTTP and/or HTTPS) are switched on in the configuration.
cluster.start(function () {
  // Creates the plain HTTP server for `app` and logs once it is listening.
  function launchHttp() {
    var plainServer = http.createServer(app);
    var onHttpReady = function () {
      winston.info('App listening at http://%s:%s', config.get('app:http:host'), config.get('app:http:port'));
    };
    plainServer.listen(config.get('app:http:port'), config.get('app:http:host'), onHttpReady);
  }

  // Creates the HTTPS server for `app`; key and certificate files are read
  // synchronously from the `certificates` directory next to this file.
  function launchHttps() {
    var keyPem = fs.readFileSync(path.join(__dirname, 'certificates', config.get('app:https:certificate:key')));
    var certPem = fs.readFileSync(path.join(__dirname, 'certificates', config.get('app:https:certificate:cert')));
    var tlsServer = https.createServer({ key: keyPem, cert: certPem }, app);
    var onHttpsReady = function () {
      winston.info('App listening at https://%s:%s', config.get('app:https:host'), config.get('app:https:port'));
    };
    tlsServer.listen(config.get('app:https:port'), config.get('app:https:host'), onHttpsReady);
  }

  // HTTP is evaluated and started first, then HTTPS, each behind its own flag.
  if (config.get('app:http:enabled')) {
    launchHttp();
  }
  if (config.get('app:https:enabled')) {
    launchHttps();
  }
});
与min
不同,它适用于任何import org.apache.spark.sql.functions.min
// Example: find the earliest timestamp per group by aggregating with
// functions.min directly.
// Placeholder only: replace ??? with an existing SQLContext before running.
val sqlContext: SQLContext = ???
// Needed for the toDF and $"column" syntax used below.
import sqlContext.implicits._
// Two sample rows sharing id 1, with the timestamps supplied as strings.
val df = Seq((1L, "2016-04-05 15:10:00"), (1L, "2014-01-01 15:10:00"))
.toDF("id", "ts")
// Cast the string column to a timestamp so the aggregation works on a timestamp column.
.withColumn("ts", $"ts".cast("timestamp"))
// Group by id and take the minimum ts; the output of show is reproduced below.
df.groupBy($"id").agg(min($"ts")).show
// +---+--------------------+
// | id| min(ts)|
// +---+--------------------+
// | 1|2014-01-01 15:10:...|
// +---+--------------------+
类型。