如何在R中使用h2o.group_by?

时间:2017-06-01 19:29:55

标签: r h2o

我正在使用版本3.10.4.8。

library(h2o)

# Initialise H2O with the thread and memory settings used in the question.
h2o.init(nthreads = -1, max_mem_size = "6g")

data.url <- "https://raw.githubusercontent.com/DarrenCook/h2o/bk/datasets/"

# Import the iris CSV into H2O under the key "iris.hex".
# h2o.importFile() is called directly rather than through `%>%`: the pipe is
# not available when only h2o is attached (it needs magrittr or dplyr).
iris.hex <- h2o.importFile(
  path = paste0(data.url, "iris_wheader.csv"),
  destination_frame = "iris.hex"
)

# `y` names the class column; `x` holds every other column name of the frame.
y <- "class"
x <- setdiff(names(iris.hex), y)

现在,我想按 class 统计 iris.hex 中每个类别的行数,因此尝试使用 h2o.group_by。这是我在阅读文档后尝试的内容:

# Attempted call that produces the error shown below. The aggregate is passed
# as a bare function (h2o.nrow) rather than as a call on a quoted column name.
h2o.group_by(iris.hex, by = list("class"), h2o.nrow)

这会导致以下错误:

Error in is.H2OFrame(x) : object 'group.cols' not found

文档没有提供示例用法,因此我不确定我是否正确调用此函数。

1 个答案:

答案 0 :(得分:0)

针对您的具体问题,您可以这样做:

library(h2o)

# Initialise H2O with the strict version check switched off.
# FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
h2o.init(strict_version_check = FALSE)

iris_wheader <- "http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv"

iris.hex <- h2o.importFile(iris_wheader)

# Count the rows for each value of "class". The aggregate is written as a call
# on a quoted column name, nrow("class"), not as a bare function.
iris_count <- h2o.group_by(
  data = iris.hex,
  by = "class",
  nrow("class"),
  gb.control = list(na.methods = "rm")
)

你可以看到原始框架和结果:

head(iris.hex)

sepal_len sepal_wid petal_len petal_wid       class
1       5.1       3.5       1.4       0.2 Iris-setosa
2       4.9       3.0       1.4       0.2 Iris-setosa
3       4.7       3.2       1.3       0.2 Iris-setosa
4       4.6       3.1       1.5       0.2 Iris-setosa
5       5.0       3.6       1.4       0.2 Iris-setosa
6       5.4       3.9       1.7       0.4 Iris-setosa

iris_count

      class         nrow
1     Iris-setosa   50
2 Iris-versicolor   50
3  Iris-virginica   50

相关文档已添加到将来的版本中,这里先给出一些示例:

# Load the h2o package and initialise the H2O connection with default settings.
> library(h2o)
> h2o.init()

# Import the airlines data set and display a summary.
# The imported frame is stored under the key "airlines.hex".
> airlinesURL <- "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv"
> airlines.hex <- h2o.importFile(path = airlinesURL, destination_frame = "airlines.hex")
> summary(airlines.hex)

# Find number of flights by airport:
# group on Origin and count the rows in each group.
> originFlights <- h2o.group_by(data = airlines.hex, by = "Origin", nrow("Origin"), gb.control=list(na.methods="rm"))
# Convert the grouped H2O frame to a local R data.frame and print it.
> originFlights.R <- as.data.frame(originFlights)
> originFlights.R
Origin nrow_Origin
1      ABE          59
2      ABQ         876
3      ACY          31
...

# Find number of flights per month:
# group on Month and count the rows in each group.
> flightsByMonth <- h2o.group_by(data = airlines.hex, by = "Month", nrow("Month"), gb.control=list(na.methods="rm"))
# Convert the grouped H2O frame to a local R data.frame and print it.
> flightsByMonth.R <- as.data.frame(flightsByMonth)
> flightsByMonth.R
Month nrow_Month
1     1      41979
2    10       1999

# Find the number of flights in a given month based on the origin
# (group on both Origin and Month, then count the rows in each group).
> cols <- c("Origin", "Month")
> flightsByOriginMonth <- h2o.group_by(data = airlines.hex, by = cols, nrow("NumberOfFlights"), gb.control = list(na.methods = "rm"))
# Convert the grouped H2O frame to a local R data.frame and print it.
> flightsByOriginMonth.R <- as.data.frame(flightsByOriginMonth)
> flightsByOriginMonth.R
Origin Month nrow_NumberOfFlights
1      ABE     1                   59
2      ABQ     1                  846
3      ABQ    10                   30
4      ACY     1                   31
5      ALB     1                   75
...

# Find months with the highest cancellation ratio
# Locate the position of the Cancelled column.
> which(colnames(airlines.hex)=="Cancelled")
[1] 22
# Sum Cancelled within each month.
> cancellationsByMonth <- h2o.group_by(data = airlines.hex, by = "Month", sum("Cancelled"), gb.control=list(na.methods="rm"))
# Divide the per-month cancellations by the per-month flight counts held in
# flightsByMonth. NOTE(review): assumes both grouped frames list the months in
# the same row order; confirm.
> cancellation_rate <- cancellationsByMonth$sum_Cancelled/flightsByMonth$nrow_Month
# Pair each month with its rate. The rate column keeps the name sum_Cancelled,
# as the printed header shows, although it now holds a ratio.
> rates_table <- h2o.cbind(flightsByMonth$Month,cancellation_rate)
> rates_table.R <- as.data.frame(rates_table)
> rates_table.R
Month sum_Cancelled
1     1   0.025417471
2    10   0.009504752

# Use group_by with multiple columns. Summarize the destination, arrival delays, and departure delays for an origin
> cols <- c("Dest", "IsArrDelayed", "IsDepDelayed")
# Aggregate all of `cols` in a single call, on a frame restricted to Origin
# plus those columns.
> originFlights <- h2o.group_by(data = airlines.hex[c("Origin",cols)], by = "Origin", sum(cols),gb.control = list(na.methods = "ignore", col.names = NULL))
# Note: the call above emits a warning because col.names is NULL.
# Alternative: run one group_by per column and bind the results side by side.
# Each result carries its own Origin column, so columns 1, 2, 4 and 6 keep
# Origin once, followed by the three sums.
> res <- h2o.cbind(lapply(cols, function(x){h2o.group_by(airlines.hex,by="Origin",sum(x))}))[,c(1,2,4,6)]
> res
Origin sum_Dest sum_IsArrDelayed sum_IsDepDelayed
1    ABE     5884               40               30
2    ABQ    84505              545              370
3    ACY     3131                9                7
4    ALB     3646               49               50
5    AMA      317                4                6
6    ANC      100                0                1