如何使用dplyr通过mutate新增一列:当任意一列包含1时该列的值为1,否则为0

时间:2019-04-06 01:25:25

标签: r dplyr data.table

# Sample data for the question, written as a structure() literal (it looks
# like dput() output). It has 5 rows (row.names = c(NA, -5L)) and 27 columns:
# ID, two Date columns (TVTProcedureStartDate, DCDate), CE_EventOccurred,
# CE_EventDate, and 22 integer 0/1 flag columns whose names end in
# "(In Hospital)" (column positions 6 through 27).
# The object carries class "grouped_df" plus grouping attributes
# (vars = "NCDRPatientID" with nested labels/indices/group_sizes), although
# no NCDRPatientID column appears among the columns listed here.
events <- structure(list(ID = c(3049951, 3085397, 3204081, 3262134, 
3467254), TVTProcedureStartDate = structure(c(16210, 16238, 16322, 
16420, 16546), class = "Date"), DCDate = structure(c(16213, 16250, 
16326, 16426, 16560), class = "Date"), CE_EventOccurred = c(0L, 
0L, 0L, 0L, 0L), CE_EventDate = c(0L, 0L, 0L, 0L, 0L), `Annular Dissection (In Hospital)` = c(0L, 
0L, 0L, 0L, 0L), `Aortic Dissection (In Hospital)` = c(0L, 0L, 
0L, 1L, 0L), `Atrial Fibrillation (In Hospital)` = c(0L, 1L, 
0L, 0L, 1L), `Bleeding at Access Site (In Hospital)` = c(0L, 
0L, 0L, 0L, 0L), `Cardiac Arrest (In Hospital)` = c(1L, 0L, 0L, 
0L, 0L), `Conduction/Native Pacer Disturbance Req ICD (In Hospital)` = c(0L, 
0L, 1L, 0L, 0L), `Conduction/Native Pacer Disturbance Req Pacer (In Hospital)` = c(0L, 
0L, 0L, 0L, 0L), `Endocarditis (In Hospital)` = c(0L, 0L, 0L, 
0L, 0L), `GI Bleed (In Hospital)` = c(0L, 0L, 0L, 0L, 0L), `Hematoma at Access Site (In Hospital)` = c(0L, 
0L, 0L, 0L, 0L), `Ischemic Stroke (In Hospital)` = c(0L, 0L, 
0L, 0L, 0L), `Major Vascular Complications (In Hospital)` = c(0L, 
0L, 0L, 0L, 0L), `Minor Vascular Complication (In Hospital)` = c(0L, 
0L, 0L, 0L, 0L), `Mitral Leaflet Injury - detected during surgery (In Hospital)` = c(0L, 
0L, 0L, 0L, 0L), `Mitral Subvalvular Injury -detected during surgery (In Hospital)` = c(0L, 
0L, 0L, 0L, 0L), `New Requirement for Dialysis (In Hospital)` = c(0L, 
0L, 0L, 0L, 0L), `Other Bleed (In Hospital)` = c(0L, 0L, 0L, 
0L, 0L), `Perforation with or w/o Tamponade (In Hospital)` = c(1L, 
0L, 0L, 0L, 0L), `Retroperitoneal Bleeding (In Hospital)` = c(0L, 
0L, 0L, 0L, 0L), `Single Leaflet Device Attachment (In Hospital)` = c(0L, 
0L, 0L, 0L, 0L), `Unplanned Other Cardiac Surgery or Intervention (In Hospital)` = c(0L, 
0L, 0L, 0L, 0L), `Unplanned Vascular Surgery or Intervention (In Hospital)` = c(0L, 
0L, 0L, 1L, 0L)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -5L), vars = "NCDRPatientID", labels = structure(list(
    NCDRPatientID = c(3049951, 3085397, 3204081, 3262134, 3467254
    )), class = "data.frame", row.names = c(NA, -5L), vars = "NCDRPatientID", labels = structure(list(
    NCDRPatientID = c(3049951, 3085397, 3204081, 3262134, 3467254, 
    3467324, 3510387, 3586037, 3661089, 3668621, 3679485, 3737916, 
    3738064, 3960141, 4006862, 4018241, 4019056, 4025174, 4027490, 
    4050900, 4051101, 4096816, 4097119, 4097146, 4097180, 4098426, 
    4106410, 4109968, 4147466, 4198427, 4198450, 4198458, 4204554, 
    4208053, 4213116, 4218802, 4218854, 4223378, 4223415, 4243959, 
    4316979, 4341660, 4348676, 4413567, 4419513, 4421948, 4422768, 
    4426483, 4430159, 4431211, 4433156, 4433406, 4433988)), class = "data.frame", row.names = c(NA, 
-53L), vars = "NCDRPatientID", labels = structure(list(NCDRPatientID = c(3049951, 
3085397, 3204081, 3262134, 3467254, 3467324, 3510387, 3586037, 
3661089, 3668621, 3679485, 3737916, 3738064, 3960141, 4006862, 
4018241, 4019056, 4025174, 4027490, 4050900, 4051101, 4096816, 
4097119, 4097146, 4097180, 4098426, 4106410, 4109968, 4147466, 
4198427, 4198450, 4198458, 4204554, 4208053, 4213116, 4218802, 
4218854, 4223378, 4223415, 4243959, 4316979, 4341660, 4348676, 
4413567, 4419513, 4421948, 4422768, 4426483, 4430159, 4431211, 
4433156, 4433406, 4433988)), class = "data.frame", row.names = c(NA, 
-53L), vars = "NCDRPatientID", drop = TRUE), indices = list(0L, 
    1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10:12, 13L, 14L, 15L, 
    16:17, 18L, 19:21, 22L, 23L, 24L, 25:26, 27L, 28L, 29:30, 
    31L, 32:33, 34L, 35:38, 39L, 40:41, 42L, 43L, 44L, 45L, 46L, 
    47L, 48:50, 51:53, 54L, 55L, 56L, 57L, 58L, 59:60, 61L, 62L, 
    63:64, 65:66, 67:68, 69L, 70L, 71:72, 73L), drop = TRUE, group_sizes = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 2L, 1L, 3L, 
1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 4L, 1L, 2L, 1L, 1L, 1L, 
1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 
1L, 1L, 2L, 1L), biggest_group_size = 4L), indices = list(0L, 
    1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 
    15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 
    27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 
    39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 
    51L, 52L), drop = TRUE, group_sizes = c(1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), biggest_group_size = 1L), indices = list(0L, 1L, 2L, 3L, 4L), drop = TRUE, group_sizes = c(1L, 
1L, 1L, 1L, 1L), biggest_group_size = 1L)

根据这些数据,我需要创建一个新列:如果以 (In Hospital) 结尾的任何列包含 1,则该列的值为 1,否则为 0。

我尝试了多种操作,但不起作用或显示错误

Error in mutate_impl(.data, dots) : Evaluation error: NA/NaN argument.
event %>% mutate(TR = rowSums(select_(.,6:n)))

Error in mutate_impl(.data, dots) : Column `TR` must be length 1 (the group size), not 53
event %>% mutate(TR = rowSums(.[6:ncol(.)]))

我还尝试了它的其他一些变体,想看看自己能否弄明白,但总是遇到类似的错误和问题。

我尝试的另一件事是以下内容,它似乎可以对行求和,但即使执行以下操作,它也会添加ID:

event %>% select(6:27) %>% rowSums()

但是它把每行的 ID 也加进了第 6 到 27 列中 1 和 0 的求和结果里。不知道为什么会这样。

我希望结果是包含相同数据的数据框,并新增一列:如果第 6 到 27 列中的任何一列包含 1,则该列的值为 1,否则为 0。

3 个答案:

答案 0 :(得分:2)

这不是dplyr方式,但也可以:

# Columns whose names end with the literal suffix "(In Hospital)".
in_hospital <- endsWith(colnames(events), "(In Hospital)")
# Integer flag per row: 1 when at least one of those columns holds a 1,
# otherwise 0. drop = FALSE keeps a data frame even when a single column
# matches, and na.rm = TRUE lets a row containing a 1 be flagged even if
# another flag in the same row is NA.
events$new_col <- as.integer(
  rowSums(events[, in_hospital, drop = FALSE], na.rm = TRUE) >= 1
)

答案 1 :(得分:2)

在开发解决方案之前,我先运行以下代码取消数据的分组。

library(dplyr)

# Strip the grouping from events and store the result under the same name.
events <- ungroup(events)

解决方案1:具有选定列的rowSums

此解决方案的想法是使用rowSums将所选列中的所有数字相加,确定总和是否大于0,然后将逻辑向量转换为整数向量(具有1或0 )。

有许多选择列的方法。我们可以根据列号进行选择。

# Select the flag columns by position (6 through 27), add them up per row,
# and store "total above zero" as an integer 1/0 indicator in Col.
events2 <- mutate(
  events,
  Col = as.integer(rowSums(select(events, 6:27)) > 0)
)
events2[["Col"]]
# [1] 1 1 1 1 1

我们可以使用ends_with

# Select the flag columns by their name suffix, add them up per row, and
# store "total above zero" as an integer 1/0 indicator in Col.
events2 <- mutate(
  events,
  Col = as.integer(rowSums(select(events, ends_with("(In Hospital)"))) > 0)
)
events2[["Col"]]
# [1] 1 1 1 1 1

我们可以使用matches。正则表达式\\(In Hospital\\)$匹配以(In Hospital)结尾的列名。

# Select the flag columns with a regular expression anchored at the end of
# the name, add them up per row, and store "total above zero" as an integer
# 1/0 indicator in Col.
events2 <- mutate(
  events,
  Col = as.integer(
    rowSums(select(events, matches("\\(In Hospital\\)$"))) > 0
  )
)
events2[["Col"]]
# [1] 1 1 1 1 1

我们可以使用contains,但是请注意,目标字符串不必在列名的末尾。

# Select every column whose name contains the literal text anywhere in it,
# add them up per row, and store "total above zero" as an integer 1/0
# indicator in Col.
events2 <- mutate(
  events,
  Col = as.integer(rowSums(select(events, contains("(In Hospital)"))) > 0)
)
events2[["Col"]]
# [1] 1 1 1 1 1

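解决方案2:使用 apply 和 max

由于目标列中的数字均为 1 或 0,因此我们可以将 apply 与 max 一起使用以获取每行的最大值:只要该行中有任何一个 1,最大值就是 1,否则为 0。上面显示的使用 select 函数选择列的所有方式在这里也适用。下面,我介绍其中一种方法。

events2 <- events %>% mutate(Col = apply(select(., 6:27), 1, max))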

答案 2 :(得分:2)

使用R的基础apply()的解决方案

# Positions of every column whose name contains "in hospital" (any case).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
cols <- grep("in hospital", colnames(events), ignore.case = TRUE)
# One value per row: 1 when any selected column equals 1, otherwise 0.
# drop = FALSE keeps a two-dimensional object for apply() even if only one
# column matches; as.numeric() converts the scalar any() result directly,
# so the vectorised ifelse() is not needed for a single value.
apply(events[, cols, drop = FALSE], 1, function(x) as.numeric(any(x == 1)))

# [1] 1 1 1 1 1