我有一个数据框,其中一列由字符串组成,例如 '0 1 0 0 0 1 1 0'。
这些字符串的长度都相同,但包含的 1 和 0 的数量不同。我如何在 R 中对它们求和?

答案 0 :(得分:6)
如果它们是字符串,您可以使用 scan 创建数字向量,然后只需用 sum 求和:
# Parse the space-separated digits into a numeric vector with scan(), then
# add them up; the expected console output is shown below the call.
sum(scan(text = "0 1 0 0 0 1 1 0", quiet = TRUE))
# [1] 3
如果是数据框中的一列,您仍然可以使用 scan。方法可能类似于:
# Read every string in the column as one flat numeric stream, reshape it so
# that each original string fills one matrix row (8 values per row), and sum
# each row. quiet = TRUE keeps scan() from printing its "Read N items" line.
rowSums(matrix(scan(text = as.character(df$V1), quiet = TRUE), ncol = 8, byrow = TRUE))
将“df$V1”替换为您的数据实际所在的列,并将“ncol = 8”替换为您实际应有的列数。
vapply + strsplit 也是不错的选择。这与 @karen 使用 sapply 的方法类似,但不需要循环遍历两次,只需遍历一次:
# Split each string on a literal space (third argument: fixed = TRUE), then
# sum the integer-converted pieces of each string. integer(1L) declares that
# every element must yield exactly one integer.
vapply(strsplit(as.character(x), " ", TRUE), function(y)
sum(as.integer(y)), integer(1L))
但更快的选择是使用“data.table”中的 fread 或“iotools”中的 dstrsplit 来读取数据,然后使用 rowSums():
library(data.table)
# Join the rows with newlines and hand them over via text= so that fread()
# always treats the string as literal data. Passed positionally, a joined
# string without a newline (single-row x) is inspected and may be taken for a
# file name or a shell command instead.
rowSums(fread(text = paste(x, collapse = "\n")))
library(iotools)
# Split each string into 10 integer columns, then sum across each row.
rowSums(dstrsplit(x, sep = " ", col_types = rep("integer", 10)))
以下是一些基准测试。首先,要测试的样本数据和函数:
# Fix the RNG seed so the generated sample data is reproducible.
set.seed(1)
# Build 10,000 rows; each row is a string of 10 randomly drawn 0/1 values
# joined by single spaces, stored in column V1.
mydf <- data.frame(V1 = replicate(10000, paste(sample(c(0, 1), 10, TRUE), collapse = " ")))
library(data.table) # For fun_fread()
library(iotools) # For fun_iotools()
# Sum the digits of each space-separated string using scan().
#
# x: vector of strings (coerced with as.character()), each holding the same
#    number of space-separated numbers.
# Returns a numeric vector with one sum per element of x.
fun_scan <- function(x) {
  # The number of fields per row is taken from the first element only.
  first_row <- scan(text = as.character(x[1]), quiet = TRUE)
  # Read all elements as one flat numeric stream ...
  all_values <- scan(text = as.character(x), quiet = TRUE)
  # ... and lay it out row by row so each input string becomes one matrix row.
  by_row <- matrix(all_values, ncol = length(first_row), byrow = TRUE)
  rowSums(by_row)
}
# Sum the digits of each space-separated string using strsplit() + vapply().
#
# x: vector of strings (coerced with as.character()).
# Returns an integer vector with one sum per element of x.
fun_vapply <- function(x) {
  # Split on a literal space (fixed = TRUE, no regex interpretation).
  tokens <- strsplit(as.character(x), split = " ", fixed = TRUE)
  sum_as_int <- function(y) sum(as.integer(y))
  # FUN.VALUE enforces exactly one integer per input string.
  vapply(tokens, sum_as_int, FUN.VALUE = integer(1L))
}
# Sum the digits of each space-separated string using nested sapply() calls.
# This is the two-pass variant kept for benchmarking against the others.
#
# x: vector of strings (coerced with as.character()).
# Returns the per-string sums; sapply() names each result after its string.
fun_sapply <- function(x) {
  count_ones <- function(col) {
    pieces <- strsplit(col, " ")
    # Second pass: convert the split pieces to integers before summing.
    digits <- sapply(pieces, as.integer)
    sum(digits)
  }
  sapply(as.character(x), count_ones)
}
# Sum the digits of each space-separated string by parsing all rows in a
# single data.table::fread() call.
#
# x: vector of strings; elements are joined with newlines so each one becomes
#    one row of the parsed table.
# Returns a numeric vector with one row sum per element of x.
fun_fread <- function(x) {
  # Pass the data through text= so fread() always treats it as literal input.
  # Given positionally, a joined string without a newline (one-element x) is
  # inspected and may be taken for a file name or a shell command.
  rowSums(fread(text = paste(x, collapse = "\n")))
}
# Sum the digits of each space-separated string using iotools::dstrsplit().
#
# x: vector of strings (coerced with as.character()).
# Returns the row sums of the parsed columns, one per element of x.
fun_iotools <- function(x) {
  # The number of fields per row is taken from the first element only.
  n_fields <- length(scan(text = as.character(x[1]), quiet = TRUE))
  field_types <- rep("integer", n_fields)
  parsed <- dstrsplit(as.character(x), sep = " ", col_types = field_types)
  rowSums(parsed)
}
检查一切是否相等:
# Compare each implementation's result on the sample column; the recorded
# console output follows each call.
all.equal(fun_scan(mydf$V1), fun_vapply(mydf$V1))
# [1] TRUE
# unname() strips the names from the sapply() result before comparing.
all.equal(unname(fun_sapply(mydf$V1)), fun_vapply(mydf$V1))
# [1] TRUE
all.equal(fun_fread(mydf$V1), fun_vapply(mydf$V1))
# [1] TRUE
all.equal(fun_fread(mydf$V1), fun_iotools(mydf$V1))
# [1] TRUE
现在,基准:
library(microbenchmark)
# Time all five implementations on the same column and keep the timings in
# `res`; the table below is the recorded summary of that run.
res <- microbenchmark(fun_scan(mydf$V1), fun_vapply(mydf$V1),
fun_sapply(mydf$V1), fun_fread(mydf$V1), fun_iotools(mydf$V1))
# Unit: milliseconds
# expr min lq mean median uq max neval
# fun_scan(mydf$V1) 52.071714 54.473772 57.257971 55.869307 58.200922 74.248057 100
# fun_vapply(mydf$V1) 20.477943 22.079925 24.796633 23.613694 26.205056 46.857130 100
# fun_sapply(mydf$V1) 267.655633 281.656375 308.750804 296.128511 319.318740 449.806432 100
# fun_fread(mydf$V1) 8.112898 8.391891 9.154349 8.643196 9.434840 16.967090 100
# fun_iotools(mydf$V1) 4.436515 4.608810 4.967436 4.751517 5.114486 8.690872 100
# Plot the timing distributions.
# NOTE(review): no package attached in this listing is shown to provide
# autoplot(); presumably ggplot2 must be loaded first -- confirm.
autoplot(res)
答案 1 :(得分:3)
我会这样做:
# Example input: a single string of space-separated 0/1 digits.
strTest <- '0 1 0 0 0 1 1 0'
# Split on spaces, convert the pieces to integers, and add them up.
sum(sapply(strsplit(strTest, ' '), as.integer))
考虑到它是一个名为test_col的列,那么:
# Apply the same split / convert / sum steps to every element of test_col.
# NOTE(review): strsplit() needs character input, so this presumably assumes
# test_col is a character vector rather than a factor -- confirm.
sapply(test_col, function(col) { sum(sapply(strsplit(col, ' '), as.integer))})