我想对 R 的 data.table 对象应用一个函数,该函数比较两列中的值并返回结果。以下是数据表 X 的示例:
# Example data: eight positions with their FIRST and SECOND values.
# All three columns must have the same length; the eighth FIRST/SECOND pair
# ("A", "B") is the one shown in row 8 of the printed table.
X <- as.data.table(list(
  POSITION = c(1, 4, 5, 9, 24, 36, 42, 56),
  FIRST = c("A", "BB", "AA", "B", "AAA", "B", "A,B", "A"),
  SECOND = c("B", "AA", "B", "AAA", "BBB", "AB,ABB", "B,A", "B")
))
POSITION FIRST SECOND
1: 1 A B
2: 4 BB AA
3: 5 AA B
4: 9 B AAA
5: 24 AAA BBB
6: 36 B AB,ABB
7: 42 A,B B,A
8: 56 A B
我想对“FIRST”和“SECOND”列中的数据进行以下逻辑比较,以创建“结果”列:
SAME = length of FIRST and SECOND are both one character
BLOCK = Character length of FIRST and SECOND are the same,
but greater than one, and not mixed (i.e. no comma)
LESS = SECOND has fewer characters, but neither is mixed
MORE = SECOND has more characters, but neither is mixed
MIXED = either FIRST or SECOND contains a comma
因此,期望的结果如下:
POSITION FIRST SECOND RESULTS
1 A B SAME
4 BB AA BLOCK
5 A B,A MIXED
9 AA B LESS
24 B AAA MORE
28 BBB A,B MIXED
36 AAA BBB BLOCK
42 B AB,ABB MIXED
56 A,B B,A MIXED
所以以下工作,但是对于有400万行的文件来说速度很慢!
# Classify every row by comparing FIRST and SECOND.
# The character counts and the comma flag are computed once and reused.
# SAME requires each column to be exactly one character long; testing the sum
# of the two lengths against 2 would also match "" paired with a
# two-character string.
X[, RESULT := {
  n_first <- nchar(FIRST)
  n_second <- nchar(SECOND)
  is_mixed <- grepl(",", FIRST) | grepl(",", SECOND)
  ifelse(n_first == 1 & n_second == 1, "SAME",
    ifelse(is_mixed, "MIXED",
      ifelse(n_first > n_second, "LESS",
        ifelse(n_first < n_second, "MORE", "BLOCK"))))
}]
但它确实给出了你期望的结果:
POSITION FIRST SECOND RESULT
1: 1 A B SAME
2: 4 BB AA BLOCK
3: 5 AA B LESS
4: 9 B AAA MORE
5: 24 AAA BBB BLOCK
6: 36 B AB,ABB MIXED
7: 42 A,B B,A MIXED
8: 56 A B SAME
我实际上还有更多条件需要测试,其中一些比单纯的字符计数更复杂。与其写一长串嵌套的 ifelse 语句,是否可以应用一个以这两列作为输入的函数?例如:
# Classify a single FIRST/SECOND pair.
#
# x: a one-row data.table (or a list) with character elements FIRST and
#    SECOND, each of length one.
# Returns one of "SAME", "MIXED", "LESS", "MORE" or "BLOCK".
checkType <- function(x) {
  n_first <- nchar(x$FIRST)
  n_second <- nchar(x$SECOND)
  # Scalar `||` / `&&` are used because `if` needs a single TRUE/FALSE.
  is_mixed <- grepl(",", x$FIRST) || grepl(",", x$SECOND)

  if (n_first == 1 && n_second == 1) {
    # Both values are exactly one character long.
    "SAME"
  } else if (is_mixed) {
    "MIXED"
  } else if (n_first > n_second) {
    "LESS"
  } else if (n_first < n_second) {
    "MORE"
  } else if (n_second > 1) {
    # Equal lengths greater than one, no comma.
    "BLOCK"
  } else {
    # Only reached when both strings are empty; kept as the catch-all label.
    "MIXED"
  }
}
> checkType(X[1,])
[1] "SAME"
# Row-by-row classification. seq_len() yields an empty sequence for a
# zero-row table, whereas 1:nrow(X) would iterate over c(1, 0).
for (i in seq_len(nrow(X))) {
  X[i, RESULT := checkType(X[i, ])]
}
因此,虽然上面的写法可行,但显然不是使用 data.table 的最佳方式。我尝试了 lapply 和 apply,但都不起作用:
# Failing attempt: lapply(.SD, ...) hands checkType one column at a time, each
# as an atomic vector, so `x$FIRST` inside checkType raises the error below.
X[, RESULT3 := lapply(.SD, checkType)]
Error in x$FIRST : $ operator is invalid for atomic vectors
nchar(x$FIRST)
FUN(X[[1L]], ...)
lapply(.SD, checkType)
eval(expr, envir, enclos)
eval(jsub, SDenv, parent.frame())
`[.data.table`(X, , `:=`(RESULT3, lapply(.SD, checkType)))
X[, `:=`(RESULT3, lapply(.SD, checkType))]
使用 apply(.SD, 1, checkType) 也得到相同的结果。我想做的事情能通过应用函数来实现吗?
答案 0 :(得分:1)
请注意,您的代码生成的数据表(下面第一行,从上面的代码段粘贴而来)与其下方“期望的结果”框中显示的数据表并不相同。
然而,这可能实际上更快,并且肯定会更容易理解。它会产生一个我认为与你的规则一致的结果。
# Example data; all three columns have eight entries so that no recycling is
# needed when the table is built.
X <- as.data.table(list(
  POSITION = c(1, 4, 5, 9, 24, 36, 42, 56),
  FIRST = c("A", "BB", "AA", "B", "AAA", "B", "A,B", "A"),
  SECOND = c("B", "AA", "B", "AAA", "BBB", "AB,ABB", "B,A", "B")
))

# Helper columns, each computed once: comma flag and character counts.
X[, mixed := grepl(",", FIRST) | grepl(",", SECOND)]
X[, nf := nchar(FIRST)]
X[, ns := nchar(SECOND)]
X[, RESULT := ""]

# Key on the two length columns so J(1, 1) selects rows with nf == 1, ns == 1.
setkey(X, nf, ns)
X[J(1, 1), RESULT := "SAME"]

# Sub-assign by reference with `:=` instead of `X[cond]$RESULT <- value`.
# `nf == ns & nf > 1` already implies `ns > 1`.
X[!mixed & nf == ns & nf > 1, RESULT := "BLOCK"]
X[!mixed & nf > ns, RESULT := "LESS"]
X[!mixed & nf < ns, RESULT := "MORE"]
# MIXED is assigned last so it overrides any earlier label on comma rows.
X[(mixed), RESULT := "MIXED"]

# Restore the original row order.
setkey(X, POSITION)
您的类别并不相互排斥,因此我假设这些规则是按顺序应用的(例如,FIRST="," 且 SECOND="," 的情况该如何处理?)。
另外,我认为你对MORE和LESS的定义是一样的。
答案 1 :(得分:1)
所以 @Frank 和 @jlhoward 的答案都给出了期望的结果,并且比我最初的尝试快得多。不过,在这些答案的基础上,下面这种方法(createResult1)在 1,000,000 行的文件上还要快约 4 倍:
# Label each row of X by comparing its FIRST and SECOND columns.
#
# X: a data.table with character columns FIRST and SECOND.
# Adds a RESULT column to X by reference and returns X. The helper columns
# cf, cs, mf and ms are created for the comparison and removed again.
createResult1 <- function(X) {
  helper_cols <- c("cf", "cs", "mf", "ms")
  X[, (helper_cols) := list(
    nchar(FIRST),
    nchar(SECOND),
    grepl(',', FIRST),
    grepl(',', SECOND)
  )]

  # One sub-assignment per label; MIXED runs last and overrides the others.
  X[cf == 1 & cs == 1, RESULT := "SAME"]
  X[cf > cs, RESULT := "LESS"]
  X[cf < cs, RESULT := "MORE"]
  X[cf == cs & cs > 1, RESULT := "BLOCK"]
  X[mf | ms, RESULT := "MIXED"]

  X[, (helper_cols) := NULL]
  X
}
# Label each row of X with nested ifelse() calls (approach by @Frank).
#
# X: a data.table with character columns FIRST and SECOND.
# Adds a RESULT column to X by reference and returns X. The helper columns
# cf, cs, mf and ms are created for the comparison and removed again.
createResult2 <- function(X) {
  X[, `:=`(
    cf = nchar(FIRST),
    cs = nchar(SECOND),
    mf = grepl(',', FIRST),
    ms = grepl(',', SECOND)
  )]

  X[, RESULT := {
    # Label that applies when a row is neither SAME nor MIXED.
    by_length <- ifelse(cf > cs, "LESS",
      ifelse(cf < cs, "MORE", "BLOCK"))
    ifelse(cf == 1 & cs == 1, "SAME",
      ifelse(mf | ms, "MIXED", by_length))
  }]

  X[, c('cf', 'cs', 'mf', 'ms') := NULL]
  X
}
# Label each row of X via helper columns and keyed/logical subsetting.
#
# X: a data.table with columns POSITION, FIRST and SECOND.
# Returns X with a RESULT column, keyed (and therefore sorted) by POSITION;
# the helper columns mixed, nf and ns are removed before returning.
# NOTE(review): `X$col <- ...` and `X[cond]$RESULT <- ...` are ordinary
# replacement calls rather than `:=`; whether the caller's table is also
# modified is not evident from this block - confirm before relying on it.
createResult3 <- function(X) { #@jlhoward
# Helper columns: comma flag and character counts of FIRST / SECOND.
X$mixed <- grepl(',',X$FIRST) | grepl(',',X$SECOND)
X$nf <- nchar(X$FIRST)
X$ns <- nchar(X$SECOND)
# Start with an empty label for every row.
X$RESULT = ""
# Key on the two length columns so J(1,1) selects rows with nf == 1, ns == 1.
setkey(X,nf,ns)
X[J(1,1),RESULT:="SAME"]
X[!mixed & nf==ns & nf>1 & ns>1]$RESULT <- "BLOCK"
X[!mixed & nf > ns]$RESULT <- "LESS"
X[!mixed & nf < ns]$RESULT <- "MORE"
# Assigned last, so rows containing a comma end up MIXED regardless of length.
X[(mixed)]$RESULT <- "MIXED"
X[,c('nf','ns','mixed'):=NULL]
# Re-key by POSITION, which also reorders the rows by POSITION.
setkey(X,POSITION)
return(X)
}
创建与上面相同的数据表,但将其重复 1,000,000 次(即 8,000,000 行):
# Benchmark input: the eight example rows repeated 1,000,000 times.
# Every column vector has eight entries, so all columns end up equally long.
X <- as.data.table(list(
  POSITION = rep(c(1, 4, 5, 9, 24, 36, 42, 56), 1000000),
  FIRST = rep(c("A", "BB", "AA", "B", "AAA", "B", "A,B", "A"), 1000000),
  SECOND = rep(c("B", "AA", "B", "AAA", "BBB", "AB,ABB", "B,A", "B"), 1000000)
))
# Independent copies, because the createResult* functions modify by reference.
Y <- copy(X)
Z <- copy(X)
结果如下:
> system.time(X <- createResult1(X))
user system elapsed
4.06 0.05 4.12
> system.time(Y <- createResult2(Y))
user system elapsed
18.53 0.36 18.94
> system.time(Z <- createResult2(Z))
user system elapsed
18.63 0.29 18.97
> identical(X,Y)
[1] TRUE
> identical(X,Z)
[1] TRUE