我正在使用 h2o 版本 3.10.4.8。
# Attach h2o and initialise H2O with nthreads = -1 and a 6 GB memory limit.
library(h2o)
h2o.init(nthreads = -1, max_mem_size = "6g")

# Import the iris CSV (with header) into H2O under the key "iris.hex".
# h2o.importFile() is called directly instead of through `%>%`: the pipe
# operator comes from magrittr/dplyr, which this script never attaches, so
# the piped form fails with "could not find function" on a clean session.
data.url <- "https://raw.githubusercontent.com/DarrenCook/h2o/bk/datasets/"
iris.hex <- h2o.importFile(
  path = paste0(data.url, "iris_wheader.csv"),
  destination_frame = "iris.hex"
)

# `y` is the name of the "class" column; `x` holds every other column name.
y <- "class"
x <- setdiff(names(iris.hex), y)
现在,我想统计 iris.hex 中 class 列每个类别对应的行数,因此尝试使用 h2o.group_by。这是我在阅读文档后尝试的内容:
# Attempted call that fails: `by` is given as a list and the aggregate is
# passed as the bare function `h2o.nrow`. It produces the error quoted below.
h2o.group_by(iris.hex, by = list("class"), h2o.nrow)
这会导致以下错误:
Error in is.H2OFrame(x) : object 'group.cols' not found
文档没有提供示例用法,因此我不确定我是否正确调用此函数。
答案 0 :(得分:0)
针对您的具体问题,您可以这样做:
library(h2o)
# Initialise H2O with the strict version check switched off.
# FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
h2o.init(strict_version_check = FALSE)

# Import the iris data set (with header row) from the public H2O test bucket.
iris_wheader <- "http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv"
iris.hex <- h2o.importFile(iris_wheader)

# Count the rows in each group of "class". The aggregate is written as a call
# on a column name, nrow("class"), and `by` is a plain character string.
iris_count <- h2o.group_by(
  data = iris.hex,
  by = "class",
  nrow("class"),
  gb.control = list(na.methods = "rm")
)
你可以看到原始框架和结果:
head(iris.hex)
sepal_len sepal_wid petal_len petal_wid class
1 5.1 3.5 1.4 0.2 Iris-setosa
2 4.9 3.0 1.4 0.2 Iris-setosa
3 4.7 3.2 1.3 0.2 Iris-setosa
4 4.6 3.1 1.5 0.2 Iris-setosa
5 5.0 3.6 1.4 0.2 Iris-setosa
6 5.4 3.9 1.7 0.4 Iris-setosa
iris_count
class nrow
1 Iris-setosa 50
2 Iris-versicolor 50
3 Iris-virginica 50
相关文档已添加到后续版本中;在此之前,这里有一些示例:
# Attach the h2o package and initialise H2O with default settings.
> library(h2o)
> h2o.init()
# Import the airlines data set and display a summary.
# The CSV is read from its URL into a frame stored under the key "airlines.hex".
> airlinesURL <- "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv"
> airlines.hex <- h2o.importFile(path = airlinesURL, destination_frame = "airlines.hex")
> summary(airlines.hex)
# Find number of flights by airport: group the rows by "Origin" and count the
# rows in each group; gb.control sets na.methods = "rm".
> originFlights <- h2o.group_by(data = airlines.hex, by = "Origin", nrow("Origin"), gb.control=list(na.methods="rm"))
# Convert the grouped result to a local R data.frame and print it.
> originFlights.R <- as.data.frame(originFlights)
> originFlights.R
Origin nrow_Origin
1 ABE 59
2 ABQ 876
3 ACY 31
...
# Find number of flights per month: group the rows by "Month" and count the
# rows in each group; gb.control sets na.methods = "rm".
> flightsByMonth <- h2o.group_by(data = airlines.hex, by = "Month", nrow("Month"), gb.control=list(na.methods="rm"))
# Convert the grouped result to a local R data.frame and print it.
> flightsByMonth.R <- as.data.frame(flightsByMonth)
> flightsByMonth.R
Month nrow_Month
1 1 41979
2 10 1999
# Find the number of flights in a given month based on the origin: group by
# both "Origin" and "Month" and count the rows in each group.
> cols <- c("Origin", "Month")
# The h2o.group_by() call is now closed with its final ")"; without it the
# expression is incomplete and cannot be evaluated.
> flightsByOriginMonth <- h2o.group_by(data = airlines.hex, by = cols, nrow("NumberOfFlights"), gb.control = list(na.methods = "rm"))
# Convert the grouped result to a local R data.frame and print it.
> flightsByOriginMonth.R <- as.data.frame(flightsByOriginMonth)
> flightsByOriginMonth.R
Origin Month nrow_NumberOfFlights
1 ABE 1 59
2 ABQ 1 846
3 ABQ 10 30
4 ACY 1 31
5 ALB 1 75
...
# Find months with the highest cancellation ratio
# Look up the position of the "Cancelled" column (22 in the recorded output);
# the commands that follow refer to the column by name, not by this index.
> which(colnames(airlines.hex)=="Cancelled")
[1] 22
# Sum the "Cancelled" column within each month.
> cancellationsByMonth <- h2o.group_by(data = airlines.hex, by = "Month", sum("Cancelled"), gb.control=list(na.methods="rm"))
# Divide the per-month cancellation sums by the per-month flight counts held in
# flightsByMonth (computed earlier in this session).
# NOTE(review): assumes both grouped frames list the months in the same row order - confirm.
> cancellation_rate <- cancellationsByMonth$sum_Cancelled/flightsByMonth$nrow_Month
# Put the Month column next to the ratio. The ratio column keeps the name
# sum_Cancelled, as the printed table shows, even though it now holds a rate.
> rates_table <- h2o.cbind(flightsByMonth$Month,cancellation_rate)
> rates_table.R <- as.data.frame(rates_table)
> rates_table.R
Month sum_Cancelled
1 1 0.025417471
2 10 0.009504752
# Use group_by with multiple columns. Summarize the destination, arrival delays, and departure delays for an origin
> cols <- c("Dest", "IsArrDelayed", "IsDepDelayed")
# Sum all three columns in a single call on a frame restricted to "Origin" plus
# `cols`. This reassigns originFlights, replacing the per-airport counts
# computed earlier in the session.
> originFlights <- h2o.group_by(data = airlines.hex[c("Origin",cols)], by = "Origin", sum(cols),gb.control = list(na.methods = "ignore", col.names = NULL))
# Note: the call above emits a warning because col.names is NULL.
# Alternative: run one group_by per column, bind the results column-wise, and
# keep columns 1, 2, 4 and 6. Judging by the printed header, these are Origin
# plus the three sum columns, with the repeated Origin columns dropped.
> res <- h2o.cbind(lapply(cols, function(x){h2o.group_by(airlines.hex,by="Origin",sum(x))}))[,c(1,2,4,6)]
> res
Origin sum_Dest sum_IsArrDelayed sum_IsDepDelayed
1 ABE 5884 40 30
2 ABQ 84505 545 370
3 ACY 3131 9 7
4 ALB 3646 49 50
5 AMA 317 4 6
6 ANC 100 0 1