我的df包含一系列具有相似名称的列,每三列分组一次,类似于:
# Example data: 4 patients (rows), 3 exams (AA, BB, CC), 3 measurements each.
# The values match the table printed below and the expected result further
# down; the console prompts are omitted so the code can be pasted directly.
df <- data.frame(
  AA1 = c(0, 1, 4, 5), AA2 = c(0, 1, 6, 5), AA3 = c(0, 1, 1, 1),
  BB1 = c(0, 1, 1, 1), BB2 = c(0, 1, 1, 1), BB3 = c(0, 1, 1, 1),
  CC1 = c(0, 8, 1, 9), CC2 = c(3, 1, 1, 1), CC3 = c(3, 1, 3, 4)
)
df
AA1 AA2 AA3 BB1 BB2 BB3 CC1 CC2 CC3
1 0 0 0 0 0 0 0 3 3
2 1 1 1 1 1 1 8 1 1
3 4 6 1 1 1 1 1 1 3
4 5 5 1 1 1 1 9 1 4
这基本上显示了 4 名患者在每种检查类型(AA、BB、CC)下的 3 个不同测量值(1、2、3)。实际上,我有一个庞大的数据集,包含 2,000 名患者、10 种不同的检查,每种检查有 3 个测量值。我想添加一个新的疾病分类列,规则如下:只要任意一项检查中至少有一个测量值(XX1、XX2、XX3,其中 XX = AA、BB 或 CC)的分数 > 4,则该患者患有该疾病。所以新的数据集看起来像这样:
>
AA1 AA2 AA3 BB1 BB2 BB3 CC1 CC2 CC3 疾病
1 0 0 0 0 0 0 0 3 3 0
2 1 1 1 1 1 1 8 1 1 1
3 4 6 1 1 1 1 1 1 3 1
4 5 5 1 1 1 1 9 1 4 1
答案 0 :(得分:1)
# Example data: three measurements (suffix 1-3) for each exam prefix
# (AA, BB, CC), one row per patient.
df <- data.frame(
  AA1 = c(0, 1, 4, 5), AA2 = c(0, 1, 3, 3), AA3 = c(0, 1, 1, 1),
  BB1 = c(0, 1, 1, 1), BB2 = c(0, 1, 1, 1), BB3 = c(0, 1, 1, 1),
  CC1 = c(0, 8, 1, 9), CC2 = c(3, 1, 1, 1), CC3 = c(3, 1, 3, 4)
)
# Delete the numbers from the column names, leaving only the exam prefix.
indx <- gsub("\\d+", "", colnames(df))
# Sequence of column positions, split by exam prefix.
lst <- split(seq_len(ncol(df)), indx)
# Subset the data by column position. drop = FALSE keeps a data frame even
# when a prefix owns a single column, which do.call() below requires.
lapply(lst, function(i) df[, i, drop = FALSE])
# Row-wise maximum within each exam, tested against the cut-off `>= 4`.
lapply(lst, function(i) do.call(pmax, df[, i, drop = FALSE]) >= 4)
# Same test collected into a logical matrix (rows = patients, columns = exams).
# vapply() fixes the result type and length instead of letting sapply() guess.
flags <- vapply(
  lst,
  function(i) do.call(pmax, df[, i, drop = FALSE]) >= 4,
  logical(nrow(df))
)
# Number of exams per patient that reach the cut-off.
rowSums(flags)
#[1] 0 1 1 2
!!rowSums(flags) # double negation: any non-zero count becomes TRUE
#[1] FALSE TRUE TRUE TRUE
(!!rowSums(flags)) + 0 # adding 0 turns TRUE/FALSE into 1/0
#[1] 0 1 1 1
一次性输入上述代码
# One statement: group the column positions by exam prefix (column name with
# digits removed), test each exam's row-wise maximum against >= 4, and store
# 1 for rows where at least one exam passes, 0 otherwise.
# drop = FALSE keeps a data frame for single-column exams; vapply() pins the
# per-exam result to a logical vector with one entry per row.
df$DISEASE <- (!!rowSums(vapply(
  split(seq_len(ncol(df)), gsub("\\d+", "", colnames(df))),
  function(i) do.call(pmax, df[, i, drop = FALSE]) >= 4,
  logical(nrow(df))
))) + 0
df
# AA1 AA2 AA3 BB1 BB2 BB3 CC1 CC2 CC3 DISEASE
# 1 0 0 0 0 0 0 0 3 3 0
# 2 1 1 1 1 1 1 8 1 1 1
# 3 4 3 1 1 1 1 1 1 3 1
# 4 5 3 1 1 1 1 9 1 4 1
或者
# Insert "_" between the letters and the number in each column name
# (e.g. AA1 -> AA_1) so that reshape() can split the names on sep = "_".
colnames(df) <- gsub("([A-Za-z]+)(\\d+)", "\\1_\\2", colnames(df))
# Row identifier, used as idvar by reshape().
df$id <- seq_len(nrow(df))
# Long format: the columns starting with `AA` end up in one column, and
# similarly for `BB` and `CC`.
df1 <- reshape(df, idvar = "id", varying = grep("[A-Z]", colnames(df)),
               direction = "long", sep = "_")
# Split the reshaped data by id and look for any value >= 4 in each piece.
# The first two columns of df1 are dropped (presumably id and time - confirm
# with names(df1)). Grouping uses df1$id, which has one entry per long row,
# rather than relying on the shorter df$id being recycled.
df$DISEASE <- vapply(split(df1[, -(1:2)], df1$id),
                     function(x) any(x >= 4), logical(1)) + 0
# Show the result without the helper id column.
df[, names(df) != "id"]
# AA_1 AA_2 AA_3 BB_1 BB_2 BB_3 CC_1 CC_2 CC_3 DISEASE
# 1 0 0 0 0 0 0 0 3 3 0
# 2 1 1 1 1 1 1 8 1 1 1
# 3 4 3 1 1 1 1 1 1 3 1
# 4 5 3 1 1 1 1 9 1 4 1
答案 1 :(得分:1)
# Data as given in the question's code: 4 patients (rows), exams AA/BB/CC
# with measurements 1-3 (columns).
df <- data.frame(
  AA1 = c(0, 1, 4, 5), AA2 = c(0, 1, 3, 3), AA3 = c(0, 1, 1, 1),
  BB1 = c(0, 1, 1, 1), BB2 = c(0, 1, 1, 1), BB3 = c(0, 1, 1, 1),
  CC1 = c(0, 8, 1, 9), CC2 = c(6, 1, 1, 1), CC3 = c(5, 1, 3, 4)
)
使用您的数据格式的解决方案:
# Logical matrix marking every measurement above 4; a row with at least one
# TRUE has a positive row sum.
above_cutoff <- df > 4
rowSums(above_cutoff) > 0
#[1] TRUE TRUE FALSE TRUE
这利用了这样一个事实:对逻辑值求和时,它们会被强制转换为 0 和 1。
但整洁的数据通常更可取:
# Keep the row names as an explicit id column so they survive melting.
df$id <- rownames(df)
library(reshape2)
# Long format: one row per (id, original column) pair.
DF <- melt(df, id.vars = "id")
# Exam type: the column name with its digits removed (e.g. "AA1" -> "AA").
DF$exam <- gsub("[[:digit:]]+", "", DF$variable)
# Measurement number: the column name with its letters removed ("AA1" -> 1).
DF$meas <- as.numeric(gsub("[[:alpha:]]+", "", DF$variable))
head(DF)
# id variable value exam meas
#1 1 AA1 0 AA 1
#2 2 AA1 1 AA 1
#3 3 AA1 4 AA 1
#4 4 AA1 5 AA 1
#5 1 AA2 0 AA 2
#6 2 AA2 1 AA 2
# Is the patient diseased? One row per id; TRUE when any of that id's values
# is above 4.
library(plyr)
ddply(.data = DF, .variables = "id", .fun = summarize,
      disease = any(value > 4))
# id disease
#1 1 TRUE
#2 2 TRUE
#3 3 FALSE
#4 4 TRUE
# Which exam was positive? One row per (id, exam) pair; TRUE when any value
# in that pair is above 4.
ddply(.data = DF, .variables = c("id", "exam"), .fun = summarize,
      disease = any(value > 4))
# id exam disease
#1 1 AA FALSE
#2 1 BB FALSE
#3 1 CC TRUE
#4 2 AA FALSE
#5 2 BB FALSE
#6 2 CC TRUE
#7 3 AA FALSE
#8 3 BB FALSE
#9 3 CC FALSE
#10 4 AA TRUE
#11 4 BB FALSE
#12 4 CC TRUE