给定一个包含列的数据框:
我想得到length2列相对于length1列的百分比。所以像df $ length2 / df $ lenght1 * 100。 请参阅以下最小示例:
> df=data.frame(length1=c("10","12","14"))
> df$length2=list("2,3,4","4,5,3","3,2,6")
> df
length1 length2
1 10 2,3,4
2 12 4,5,3
3 14 3,2,6
> dfresult=df
> dfresult$resultInPercent=list("20,30,40","33,41,25","21,14,42")
> dfresult
length1 length2 resultInPercent
1 10 2,3,4 20,30,40
2 12 4,5,3 33,41,25
3 14 3,2,6 21,14,42
我无法让它工作,我的方法是:
dfresult=apply(df, 1, function(x)
{
lapply(lapply(lapply(x$length2,strsplit,split=","),as.numeric),function(y)
{
round(as.numeric(y)/as.numeric(x$length1)*100)
}
)
}
)
lapply中的错误(lapply(x $ length2,strsplit,split =“,”),as.numeric) :( list)对象无法强制输入'double'
我放弃了,感觉我的所作所为是复杂的。
答案 0 :(得分:7)
另一个想法:
library(dplyr)
library(tidyr)
df %>%
separate_rows(length2) %>%
mutate_all(funs(as.numeric(as.character(.)))) %>%
group_by(length1) %>%
summarise(l2 = list(length2),
l3 = list(round(100 * length2 / length1)))
给出了:
## A tibble: 3 x 3
# length1 l2 l3
# <dbl> <list> <list>
#1 10 <dbl [3]> <dbl [3]>
#2 12 <dbl [3]> <dbl [3]>
#3 14 <dbl [3]> <dbl [3]>
这会将结果存储在list
中,以便于进一步操作:
#Observations: 3
#Variables: 3
#$ length1 <dbl> 10, 12, 14
#$ l2 <list> [<2, 3, 4>, <4, 5, 3>, <3, 2, 6>]
#$ l3 <list> [<20, 30, 40>, <33, 42, 25>, <21, 14, 43>]
答案 1 :(得分:5)
这是使用data.table
library(data.table)
temp <- round(setDT(df)[, tstrsplit(length2, ",", fixed = TRUE, type.convert = TRUE)] /
as.numeric(levels(df$length1))[df$length1] * 100)
df[, resultInPercent := do.call(paste, c(temp, sep = ","))]
df
# length1 length2 resultInPercent
# 1: 10 2,3,4 20,30,40
# 2: 12 4,5,3 33,42,25
# 3: 14 3,2,6 21,14,43
一些基准
library(data.table)
library(microbenchmark)
library(dplyr)
library(tidyr)
set.seed(123)
bigdf <- data.frame(length1 = sample(1e4),
length2 = I(replicate(1e4, "2,3,4", simplify = FALSE)))
bigdf2 <- copy(bigdf)
Steve <- function(df){ # changed `list` to `toStirng` so all outputs match
df %>%
separate_rows(length2) %>%
mutate_all(funs(as.numeric(as.character(.)))) %>%
group_by(length1) %>%
summarise(res = toString(round(100 * length2 / length1)))
}
David <- function(df) {
temp <- round(setDT(df)[, tstrsplit(length2, ",", fixed = TRUE, type.convert = TRUE)] /
as.numeric(levels(df$length1))[df$length1] * 100)
df[, resultInPercent := do.call(paste, c(temp, sep = ","))]
df
}
akrun <- function(df) {
df["resultInPercent "] <-
mapply(function(x,y) toString(round(x/y)),
lapply(strsplit(as.character(df$length2), ","), as.numeric),
as.numeric(as.character(df$length1))/100)
df
}
microbenchmark(Steve(bigdf), David(bigdf2), akrun(bigdf))
# expr min lq mean median uq max neval cld
# Steve(bigdf) 475.62891 488.96441 501.77668 497.47626 507.9581 571.5748 100 c
# David(bigdf2) 17.78974 18.16284 18.77208 18.36107 18.6625 29.8744 100 a
# akrun(bigdf) 145.98749 149.93839 154.36653 151.82216 154.4384 218.4145 100 b
答案 2 :(得分:2)
由于列为factor
类,我们将&#39; length2&#39;在通过分隔符character
转换为,
类后,将list
中的元素转换为numeric
,使用mapply
来划分list
的元素使用&#39; length1&#39;,vector
的相应round
元素输出并转换为单个字符串(toString
是paste(., collapse=", ")
的包装器<) / p>
mapply(function(x,y) toString(round(x/y)),
lapply(strsplit(as.character(df$length2), ","), as.numeric),
as.numeric(as.character(df$length1))/100)
#[1] "20, 30, 40" "33, 42, 25" "21, 14, 43"