我正在尝试从数据库中过滤一个大表,以便它以自动方式为我工作。
目前的筛选条件是:按相同的名称/ID 分组;按名称中的特定字符串过滤;按另一个条件过滤;并滤除没有值的条目(R 中的 NA)。
目前只缺少以下两个功能。每条记录都有一个日期、一个 ID 字段和一个 Passage 字段(P0、P1……)。如何追加一个新列:先按 ID、再按 Passage 查找,取该 ID 的 P0 日期,并输出每条记录的日期与 P0 日期之间相差的天数?
ID1 ID2 Sample_ID Type Date Passage colonies
abc-0001-P0 abc-0001 abc-0001-T cells 3/22/16 P0 23
abc-0001-P1 abc-0001 abc-0001-T cells 3/29/16 P1 30
abc-0002-P0 abc-0002 abc-0002-T cells 4/03/16 P0 5
abc-0002-P1 abc-0002 abc-00021-T cells 5/05/16 P1 18
abc-0002-P2 abc-0002 abc-00021-T cells 5/19/16 P1 27
如果能得到一个新列,其中 abc-0001 和 abc-0002 的各条记录都以各自的 P0 日期为基准计算出天数,那就太好了。
如果还能添加一个函数,绘制天数与菌落数(colonies)的折线图就更好了——不过这不如计算天数重要。
这是我目前的代码:
library(readxl)
library(stringr)
library(dplyr)

# Keep only the cultured-cell rows of the "abc" samples that have a passage
# recorded, then export them. `a` is the table read from the database export.
b <- a %>%
  # Column is spelled `colonies` (lower case) in the table header.
  select(ID1, ID2, Sample_ID, Type, Date, Passage, colonies) %>%
  group_by(ID2) %>%
  filter(
    str_detect(ID2, "abc"),
    # "cells" is a value of the Type column; ID1 never contains it, so
    # matching on ID1 would drop every row.
    str_detect(Type, "cells"),
    !is.na(Passage)
  )

write.csv(b, file = "test.csv")
如果你可以帮助我会很棒。
最佳,
丹尼斯
补充更多示例。这是我的起始数据:
ID1 ID2 Sample_ID Type Date Passage colonies
abc-0001-T-P0 abc-0001 abc-0001-T cells 3/22/16 P0 23
abc-0001-T abc-0001 abc-0001-T frozen 3/22/16
abc-0001-N abc-0001 abc-0001-N frozen 3/22/16
abc-0001-P1 abc-0001 abc-0001-T cells 3/29/16 P1 30
abc-0002-T-P0 abc-0002 abc-0002-T frozen 4/03/16
abc-0002-T-SFT abc-0002 abc-0002-T frozen 4/03/16
abc-0002-N-SFT abc-0002 abc-0002-N cells 4/03/16 P0 5
abc-0002-P1 abc-0002 abc-00021-T cells 5/05/16 P1 18
abc-0002-P2 abc-0002 abc-00021-T cells 5/19/16 P1 27
我想得到这样的结果:
ID1 ID2 Sample_ID Type Date Passage colonies days
abc-0001-P0 abc-0001 abc-0001-T cells 3/22/16 P0 23 0
abc-0001-P1 abc-0001 abc-0001-T cells 3/29/16 P1 30 7
abc-0002-P0 abc-0002 abc-0002-T cells 4/03/16 P0 5 0
abc-0002-P1 abc-0002 abc-00021-T cells 5/05/16 P1 18 32
abc-0002-P2 abc-0002 abc-00021-T cells 5/19/16 P1 27 46
我想根据这张表自动生成单条折线图:x 轴为天数(days),y 轴为菌落数(colonies),每张图以 Sample_ID 命名。
如果可能的话会很棒。谢谢!
答案 0 :(得分:0)
这样应该可以实现,
# Days elapsed since each ID2 group's P0 sample, computed from the Date column.
# (Subtracting colony counts, as before, does not give a time difference.)
tbl %>%
  filter(!is.na(Passage)) %>%
  group_by(ID2) %>%
  mutate(
    # Date is a factor of m/d/yy strings; go through character to parse it.
    date_parsed = as.Date(as.character(Date), format = "%m/%d/%y"),
    # Reference is the group's P0 row, wherever it sits; NA if there is none.
    days = as.numeric(date_parsed - date_parsed[Passage == "P0"][1]),
    # Rebuild the label as "<abc-NNNN>-<Passage>".
    ID1 = paste0(str_extract(ID1, "^abc.\\d+"), "-", Passage)
  ) %>%
  select(-date_parsed)
#> # A tibble: 5 x 8
#> # Groups:   ID2 [2]
#>   ID1         ID2      Sample_ID   Type   Date    Passage colonies  days
#>   <chr>       <fctr>   <fctr>      <fctr> <fctr>  <fctr>     <int> <dbl>
#> 1 abc-0001-P0 abc-0001 abc-0001-T  cells  3/22/16 P0            23     0
#> 2 abc-0001-P1 abc-0001 abc-0001-T  cells  3/29/16 P1            30     7
#> 3 abc-0002-P0 abc-0002 abc-0002-N  cells  4/03/16 P0             5     0
#> 4 abc-0002-P1 abc-0002 abc-00021-T cells  5/05/16 P1            18    32
#> 5 abc-0002-P1 abc-0002 abc-00021-T cells  5/19/16 P1            27    46
这是数据,以防其他人想要试一试;
# Reproducible example data (dput() output): a 9-row tibble with the columns
# ID1, ID2, Sample_ID, Type, Date, Passage and colonies.
# All columns except `colonies` (integer) are factors; Date holds m/d/yy
# strings, so it must be parsed before doing date arithmetic.
# The four "frozen" rows have NA in both Passage and colonies.
tbl <- structure(list(ID1 = structure(c(4L, 3L, 1L, 2L, 8L, 9L, 5L,
6L, 7L), .Label = c("abc-0001-N", "abc-0001-P1", "abc-0001-T",
"abc-0001-T-P0", "abc-0002-N-SFT", "abc-0002-P1", "abc-0002-P2",
"abc-0002-T-P0", "abc-0002-T-SFT"), class = "factor"), ID2 = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("abc-0001", "abc-0002"
), class = "factor"), Sample_ID = structure(c(2L, 2L, 1L, 2L,
4L, 4L, 3L, 5L, 5L), .Label = c("abc-0001-N", "abc-0001-T", "abc-0002-N",
"abc-0002-T", "abc-00021-T"), class = "factor"), Type = structure(c(1L,
2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L), .Label = c("cells", "frozen"
), class = "factor"), Date = structure(c(1L, 1L, 1L, 2L, 3L,
3L, 3L, 4L, 5L), .Label = c("3/22/16", "3/29/16", "4/03/16",
"5/05/16", "5/19/16"), class = "factor"), Passage = structure(c(1L,
NA, NA, 2L, NA, NA, 1L, 2L, 2L), .Label = c("P0", "P1"), class = "factor"),
colonies = c(23L, NA, NA, 30L, NA, NA, 5L, 18L, 27L)), .Names = c("ID1",
"ID2", "Sample_ID", "Type", "Date", "Passage", "colonies"), row.names = c(NA,
-9L), class = c("tbl_df", "tbl", "data.frame"))
这是关于实际生产数据的代码,
# Production data: days elapsed since each ID2 group's P0 sample.
a %>%
  filter(!is.na(Passage)) %>%
  group_by(ID2) %>%
  mutate(
    # Elapsed time comes from the sample date, not from the colony counts.
    sample_date = as.Date(`Date_Sample_Created?`, format = "%Y-%m-%d"),
    # Reference is the group's P0 row; NA if the group has no P0.
    days = as.numeric(sample_date - sample_date[Passage == "P0"][1]),
    # Build the label from Derived_Sample_ID: the ID column holds plain
    # numbers, so the "^abc" pattern never matches there and yields NA.
    ID = paste0(str_extract(Derived_Sample_ID, "^abc.\\d+"), "-", Passage)
  ) %>%
  select(-sample_date)
#> # A tibble: 9 x 20
#> # Groups:   ID2 [2]
#>   ID          Derived_Sample_ID ID2      Sample_ID  `Derived_Sample_Type?`
#>   <chr>       <chr>             <chr>    <chr>      <chr>
#> 1 abc-0001-P0 abc-0001-T-p0     abc-0001 abc-0001-T cells
#> 2 abc-0002-P0 abc-0002-T-p0     abc-0002 abc-0002-T cells
#> 3 abc-0002-P1 abc-0002-T-p1     abc-0002 abc-0002-T cells
#> ...
或更好,
# Same computation without a temporary column: days since the group's P0
# sample, taken from the sample dates rather than the colony counts.
a %>%
  filter(!is.na(Passage)) %>%
  group_by(ID2) %>%
  mutate(
    days = as.numeric(
      as.Date(`Date_Sample_Created?`, format = "%Y-%m-%d") -
        as.Date(`Date_Sample_Created?`[Passage == "P0"][1], format = "%Y-%m-%d")
    ),
    # Derived_Sample_ID carries the "abc-NNNN" prefix; the numeric ID column
    # does not, so extracting from ID would give NA.
    ID = paste0(str_extract(Derived_Sample_ID, "^abc.\\d+"), "-", Passage)
  )
答案 1 :(得分:0)
无法将其作为评论发布,因为内容太长。我所做的唯一改动是把日期列转换为日期类型:
a$`Date_Sample_Created?` <- as.Date(a$`Date_Sample_Created?`, format="%Y-%m-%d")
希望这有帮助。
# dput() output of a 14-row, 19-column tibble; the expression is not assigned
# to a name here (presumably it is the production table `a` -- TODO confirm).
# ID, `colonies_#`/colonies and `Date_Sample_Created?` are character vectors;
# the dates are yyyy-mm-dd strings. Passage and colonies repeat the values of
# `Passage_#` and `colonies_#`. Rows with NA Passage also have NA colonies.
structure(list(ID = c("36", "37", "38", "45", "46", "47", "48",
"57", "59", "121", "131", "132", "134", "206"), Derived_Sample_ID = c("abc-0001-T-p0",
"abc-0001-T-SFT", "abc-0001-N-SFT", "abc-0002-T-p0", "abc-0002-T-SFT",
"abc-0002-N-SFT", "abc-0002-T-p1", "abc-0002-T-p2", "abc-0001-T-CPT",
"abc-0001-T-p1", "abc-0001-T-p2", "abc-0002-T-p3", "abc-0001-T-p3",
"abc-0002-T-P4"), ID2 = c("abc-0001", "abc-0001", "abc-0001",
"abc-0002", "abc-0002", "abc-0002", "abc-0002", "abc-0002", "abc-0001",
"abc-0001", "abc-0001", "abc-0002", "abc-0001", "abc-0002"),
Sample_ID = c("abc-0001-T", "abc-0001-T", "abc-0001-N", "abc-0002-T",
"abc-0002-T", "abc-0002-N", "abc-0002-T", "abc-0002-T", "abc-0001-T",
"abc-0001-T", "abc-0001-T", "abc-0002-T", "abc-0001-T", "abc-0002-T"
), `Derived_Sample_Type?` = c("cells", "Frozen tissue", "Frozen tissue",
"cells", "Frozen tissue", "Frozen tissue", "cells", "cells",
"Frozen tissue", "cells", "cells", "cells", "cells", "cells"
), `DNA_concentration?` = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), `DNA_concentration?__1` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), `Passage_#` = c("P0",
NA, NA, "P0", NA, NA, "P1", "P2", NA, "P1", "P2", "P3", "P3",
"P4"), `Culture_Plate_Type?` = c("24-well plate", NA, NA,
"24-well plate", NA, NA, "24-well plate", "24-well plate",
NA, "24-well plate", "24-well plate", "24-well plate", "24-well plate",
"6-well plate"), `colonies_#` = c("23", NA, NA, "12", NA,
NA, "10", "8", NA, "23", "24", "14", "6", "21"), `Split_Ratio?` = c("NA",
NA, NA, "NA", NA, NA, "1:1", "1:1", NA, "1:1", "1:1", "1:2",
NA, "NA"), `DNA_concentration?__2` = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), `DNA_concentration?__3` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), `Date_Sample_Created?` = c("2017-07-26",
"2017-07-26", "2017-07-26", "2017-07-28", "2017-07-28", "2017-07-28",
"2017-07-31", "2017-08-11", "2017-07-26", "2017-08-21", "2017-08-22",
"2017-08-22", "2017-09-01", "2017-09-06"), `Time_Sample_Created?` = c("6:30PM",
"5:00PM", "5:00PM", "1:00PM", "11:00AM", "11:00AM", "04:30 PM",
"04:00 PM", "5:00PM", "10:00AM", "05:00 PM", "05:00 PM",
"01:30 PM", "4:00 PM"), `SOP_used?` = c("Culture of cellss - #5",
"Culture of cellss - #5", "Culture of cellss - #5", "Culture of cellss - #5",
"Culture of Lung Tumor cellss - #6", "Culture of Lung Tumor cellss - #6",
"Culture of cellss - #5", "Culture of cellss - #5", "Culture of cellss - #5",
"Culture of cellss - #5", "Culture of cellss - #5", "Culture of cellss - #5",
"Culture of cellss - #5", "Culture of cellss - #5"), `Any_changes_to_SOP?` = c("Yes",
"No", "No", "Yes", "No", "No", "Yes", "Yes", "No", "Yes",
"Yes", "Yes", "Yes", "Yes"), Passage = c("P0", NA, NA, "P0",
NA, NA, "P1", "P2", NA, "P1", "P2", "P3", "P3", "P4"), colonies = c("23",
NA, NA, "12", NA, NA, "10", "8", NA, "23", "24", "14", "6",
"21")), .Names = c("ID", "Derived_Sample_ID", "ID2", "Sample_ID",
"Derived_Sample_Type?", "DNA_concentration?", "DNA_concentration?__1",
"Passage_#", "Culture_Plate_Type?", "colonies_#", "Split_Ratio?",
"DNA_concentration?__2", "DNA_concentration?__3", "Date_Sample_Created?",
"Time_Sample_Created?", "SOP_used?", "Any_changes_to_SOP?", "Passage",
"colonies"), row.names = c(NA, -14L), class = c("tbl_df", "tbl",
"data.frame"))
非常感谢!!!