Question

我的df包含一系列具有相似名称的列，每三列分组一次，类似于：

>df<-data.frame(c(0,1,4,5),c(0,1,3,3),c(0,1,1,1),c(0,1,1,1),c(0,1,1,1),c(0,1,1,1),c(0,8,1,9),c(6,1,1,1),c(5,1,3,4))

 >names(df)<-c("AA1","AA2","AA3","BB1","BB2","BB3","CC1","CC2","CC3")

> df

AA1 AA2 AA3 BB1 BB2 BB3 CC1 CC2 CC3

1 0 0 0 0 0 0 0 3 3

2 1 1 1 1 1 1 8 1 1

3 4 6 1 1 1 1 1 1 3

4 5 5 1 1 1 1 9 1 4

这基本上显示了4名患者的每种检查类型（AA，BB，CC）的3种不同测量值（1,2,3）。实际上，我有一个庞大的数据集，其中有3个测量值，可以对2,000名患者进行10次不同的检查我想添加一个新的疾病分类列如下：如果每次检查至少一次测量的分数（XX1，XX2，XX2，其中XX = AA或BB或CC）> 4，则患者患有该疾病。所以新的数据集看起来像这样：

AA1 AA2 AA3 BB1 BB2 BB3 CC1 CC2 CC3疾病

1 0 0 0 0 0 0 0 3 3 0

2 1 1 1 1 1 1 8 1 1 1

3 4 6 1 1 1 1 1 1 3 1

4 5 5 1 1 1 1 9 1 4 1

Answer 1

df <- structure(list(AA1 = c(0, 1, 4, 5), AA2 = c(0, 1, 3, 3), AA3 = c(0, 
1, 1, 1), BB1 = c(0, 1, 1, 1), BB2 = c(0, 1, 1, 1), BB3 = c(0, 
1, 1, 1), CC1 = c(0, 8, 1, 9), CC2 = c(3, 1, 1, 1), CC3 = c(3, 
1, 3, 4)), .Names = c("AA1", "AA2", "AA3", "BB1", "BB2", "BB3", 
"CC1", "CC2", "CC3"), row.names = c(NA, -4L), class = "data.frame")

indx <- gsub("\\d+","", colnames(df)) #deletes the numberrs from colnames
lst <- split(seq_len(ncol(df)), indx) # seq_len(ncol(df)) #gives the sequence of column and split it by indx

 lapply(lst, function(i) df[,i]) #subset the data by column number

 lapply(lst, function(i) do.call(`pmax`, df[,i])>=4) #gives maximum value for each row and we test if that values is `>=4`

 rowSums(sapply(lst, function(i) do.call(`pmax`, df[,i])>=4))
 #[1] 0 1 1 2
 !!rowSums(sapply(lst, function(i) do.call(`pmax`, df[,i])>=4)) #double negation 
 #[1] FALSE  TRUE  TRUE  TRUE
 (!!rowSums(sapply(lst, function(i) do.call(`pmax`, df[,i])>=4))) +0
 #[1] 0 1 1 1

一次性输入上述代码

df$DISEASE <- (!!rowSums(sapply(split(seq_len(ncol(df)),
                  gsub("\\d+","", colnames(df))), function(i)
                    ( do.call(`pmax`, df[,i])>=4))))+0


 df
#   AA1 AA2 AA3 BB1 BB2 BB3 CC1 CC2 CC3 DISEASE
# 1   0   0   0   0   0   0   0   3   3       0
# 2   1   1   1   1   1   1   8   1   1       1
# 3   4   3   1   1   1   1   1   1   3       1
# 4   5   3   1   1   1   1   9   1   4       1

或者

colnames(df) <- gsub("([A-Za-z]+)(\\d+)", "\\1_\\2", colnames(df)) #created `-` between alphabets and numbers for the colnames
df$id <- 1:nrow(df) #created an id column

df1 <- reshape(df, idvar="id", varying=grep("[A-Z]", colnames(df)), direction="long", sep="_") #used reshape to get the columns starting with `AA` in one column, similarly for `BB` and `CC`

#Split by the reshaped dataset by id and look for any values that are >=4 in each list element
df$DISEASE <- sapply(split(df1[,-(1:2)], df$id), function(x) any(x >=4)) +0 
df[,-10]
#   AA_1 AA_2 AA_3 BB_1 BB_2 BB_3 CC_1 CC_2 CC_3 DISEASE
# 1    0    0    0    0    0    0    0    3    3       0
# 2    1    1    1    1    1    1    8    1    1       1
# 3    4    3    1    1    1    1    1    1    3       1
# 4    5    3    1    1    1    1    9    1    4       1

Answer 2

df<-data.frame(c(0,1,4,5),c(0,1,3,3),c(0,1,1,1),c(0,1,1,1),c(0,1,1,1),c(0,1,1,1),c(0,8,1,9),c(6,1,1,1),c(5,1,3,4))

names(df)<-c("AA1","AA2","AA3","BB1","BB2","BB3","CC1","CC2","CC3")

使用您的数据格式的解决方案：

rowSums(df > 4) > 0
#[1]  TRUE  TRUE FALSE  TRUE

这使用了在计算它们的总和时逻辑值被强制转换为0和1的事实。

但整洁的数据通常更可取：

df$id <- rownames(df)
library(reshape2)
DF <- melt(df, id.var="id")
DF$exam <- gsub("[[:digit:]+]", "", DF$variable)
DF$meas <- as.numeric(gsub("[[:alpha:]+]", "", DF$variable))

head(DF)
#  id variable value exam meas
#1  1      AA1     0   AA    1
#2  2      AA1     1   AA    1
#3  3      AA1     4   AA    1
#4  4      AA1     5   AA    1
#5  1      AA2     0   AA    2
#6  2      AA2     1   AA    2


#Is patient diseased?
library(plyr)
ddply(DF, .(id), summarize, disease = any(value > 4))
#  id disease
#1  1    TRUE
#2  2    TRUE
#3  3   FALSE
#4  4    TRUE

#Which exam was positive?
ddply(DF, .(id, exam), summarize, disease = any(value > 4))
#   id exam disease
#1   1   AA   FALSE
#2   1   BB   FALSE
#3   1   CC    TRUE
#4   2   AA   FALSE
#5   2   BB   FALSE
#6   2   CC    TRUE
#7   3   AA   FALSE
#8   3   BB   FALSE
#9   3   CC   FALSE
#10  4   AA    TRUE
#11  4   BB   FALSE
#12  4   CC    TRUE

ifelse格式化多列R的组

2 个答案: