我有这样的数据
# Example input (dput output): a data.frame with 6 rows and 7 columns
# (Division, Gene, IDs, IDs.Links, UniID, Refseq_IDs, Orthology).
# UniID is a factor whose values hold one or more identifiers joined by ":";
# this is the column that should be split into one identifier per row.
df <- structure(list(Division = structure(c(1L, 1L, 1L, 2L, 2L, 3L), .Label = c("Main data",
"Second data", "Third data"), class = "factor"), Gene = structure(1:6, .Label = c("ABI3BP",
"ADIPOQ", "AEBP1", "AGRN", "AMBN", "AMELX"), class = "factor"),
IDs = c(17265L, 13633L, 303L, 329L, 452L, 461L), IDs.Links = c(17265L,
13633L, 303L, 329L, 452L, 461L), UniID = structure(c(1L,
4L, 2L, 3L, 6L, 5L), .Label = c("B4DSV9:D3YTG3:E9PPR9:E9PRB5:H0Y897",
"C9JLQ8:H7C0W8:H7C1J5", "H0Y5U1:O00468", "Q15848", "Q99217",
"Q9NP70"), class = "factor"), Refseq_IDs = structure(c(4L,
3L, 1L, 6L, 5L, 2L), .Label = c("NP_001120.3", "NP_001133.1:NP_872621.1:NP_872622.1",
"NP_001171271.1:NP_004788.1", "NP_056244.2:XP_005247340.1",
"NP_057603.1", "NP_940978.2:XP_005244806.1:XP_006710696.1"
), class = "factor"), Orthology = structure(1:6, .Label = c("Mouse:Abi3bp|",
"Mouse:Adipoq|", "Mouse:Aebp1|", "Mouse:Agrn|", "Mouse:Ambn|",
"Mouse:Amelx|"), class = "factor")), class = "data.frame", row.names = c(NA,
-6L))
在名为 UniID 的列中,有许多用 `:` 分隔的字符串。我想把其中的每一个拆分到单独的新行,并重复其他列的值。
期望的输出如下所示:
# Desired output: one row per single UniID (13 rows), with every other column
# repeated unchanged from the originating row of the input.
# The Gene and Refseq_IDs values are exact repeats of the parent row: rows
# split from AEBP1 stay "AEBP1", and both AGRN rows keep the same Refseq_IDs.
df2 <- structure(list(Division = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 3L), .Label = c("Main data", "Second data",
"Third data"), class = "factor"), Gene = structure(c(1L, 1L,
1L, 1L, 1L, 2L, 3L, 3L, 3L, 4L, 4L, 5L, 6L), .Label = c("ABI3BP",
"ADIPOQ", "AEBP1", "AGRN", "AMBN", "AMELX"), class = "factor"),
IDs = c(17265L, 17265L, 17265L, 17265L, 17265L, 13633L, 303L,
303L, 303L, 329L, 329L, 452L, 461L), IDs.Links = c(17265L,
17265L, 17265L, 17265L, 17265L, 13633L, 303L, 303L, 303L,
329L, 329L, 452L, 461L), UniID = structure(c(1L, 3L, 4L,
5L, 7L, 11L, 2L, 8L, 9L, 6L, 10L, 13L, 12L), .Label = c("B4DSV9",
"C9JLQ8", "D3YTG3", "E9PPR9", "E9PRB5", "H0Y5U1", "H0Y897",
"H7C0W8", "H7C1J5", "O00468", "Q15848", "Q99217", "Q9NP70"
), class = "factor"), Refseq_IDs = structure(c(4L, 4L, 4L,
4L, 4L, 3L, 1L, 1L, 1L, 6L, 6L, 5L, 2L), .Label = c("NP_001120.3",
"NP_001133.1:NP_872621.1:NP_872622.1", "NP_001171271.1:NP_004788.1",
"NP_056244.2:XP_005247340.1", "NP_057603.1", "NP_940978.2:XP_005244806.1:XP_006710696.1"
), class = "factor"),
Orthology = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L,
4L, 4L, 5L, 6L), .Label = c("Mouse:Abi3bp|", "Mouse:Adipoq|",
"Mouse:Aebp1|", "Mouse:Agrn|", "Mouse:Ambn|", "Mouse:Amelx|"
), class = "factor")), class = "data.frame", row.names = c(NA,
-13L))
我找到了其他帖子,我尝试这样做,但是没有成功。
# Split every UniID value on ":"; `s` is a list with one character vector of
# identifiers per row of df.
s <- strsplit(as.character(df$UniID), ":")
# Build a new two-column data.frame: the flattened identifiers (named
# `director`) and the IDs value repeated once per identifier. No other column
# of df is passed to data.frame(), so none of them appear in the result.
mydf <- data.frame(
  director = unlist(s),
  IDs = rep(df$IDs, lengths(s))
)
但这只给了我 IDs 和拆分后的 UniID(列名为 director)两列,其他列都丢失了。
# NOTE: `.SD` and `by =` are data.table syntax. The error reported below comes
# from `[.data.frame`, i.e. df is still a plain data.frame at this point, so
# the `by` argument is rejected. Presumably df would first need to be converted
# (e.g. with data.table::as.data.table(df)) for this form of call to dispatch.
mydf<- df[, lapply(.SD, function(x) unlist(tstrsplit(x, ":", fixed=TRUE))), by = IDs][!is.na(UniID)]
Error in `[.data.frame`(df, , lapply(.SD, function(x) unlist(tstrsplit(x, :
unused argument (by = IDs)
这一个
# NOTE: `by = .(IDs, UniID)` is data.table syntax. The error reported below is
# raised by `[.data.frame`, so df is a plain data.frame here and `by` is an
# unused argument. The leading `+` on the second line appears to be the console
# continuation prompt from a pasted session rather than part of the expression.
mydf<- df[, strsplit(as.character(UniID), ":", fixed=TRUE),
+ by = .(IDs, UniID)][,.(UniID = V1, IDs)]
Error in `[.data.frame`(df, , strsplit(as.character(UniID), ":", fixed = TRUE), :
unused argument (by = .(IDs, UniID))
答案 0 :(得分:1)
一种使用 dplyr(配合 tidyr 的 unnest)的可能方案:
library(dplyr)
library(tidyr)

# Replace UniID with a list-column holding the ":"-separated identifiers of
# each row, then expand that list-column so every identifier gets its own row.
# All other columns are repeated for each expanded row.
# NOTE: depending on the tidyr version, UniID may keep its original position
# instead of being moved to the last column.
df %>%
  mutate(UniID = strsplit(as.character(UniID), ":")) %>%
  # Name the column explicitly: calling unnest() without `cols` is deprecated
  # since tidyr 1.0.0 and emits a warning.
  unnest(cols = UniID)
Division Gene IDs IDs.Links Refseq_IDs Orthology UniID
1 Main data ABI3BP 17265 17265 NP_056244.2:XP_005247340.1 Mouse:Abi3bp| B4DSV9
2 Main data ABI3BP 17265 17265 NP_056244.2:XP_005247340.1 Mouse:Abi3bp| D3YTG3
3 Main data ABI3BP 17265 17265 NP_056244.2:XP_005247340.1 Mouse:Abi3bp| E9PPR9
4 Main data ABI3BP 17265 17265 NP_056244.2:XP_005247340.1 Mouse:Abi3bp| E9PRB5
5 Main data ABI3BP 17265 17265 NP_056244.2:XP_005247340.1 Mouse:Abi3bp| H0Y897
6 Main data ADIPOQ 13633 13633 NP_001171271.1:NP_004788.1 Mouse:Adipoq| Q15848
7 Main data AEBP1 303 303 NP_001120.3 Mouse:Aebp1| C9JLQ8
8 Main data AEBP1 303 303 NP_001120.3 Mouse:Aebp1| H7C0W8
9 Main data AEBP1 303 303 NP_001120.3 Mouse:Aebp1| H7C1J5
10 Second data AGRN 329 329 NP_940978.2:XP_005244806.1:XP_006710696.1 Mouse:Agrn| H0Y5U1
11 Second data AGRN 329 329 NP_940978.2:XP_005244806.1:XP_006710696.1 Mouse:Agrn| O00468
12 Second data AMBN 452 452 NP_057603.1 Mouse:Ambn| Q9NP70
13 Third data AMELX 461 461 NP_001133.1:NP_872621.1:NP_872622.1 Mouse:Amelx| Q99217
此处先按 `:` 将“UniID”列拆分成列表列,然后用 unnest() 将其展开为多行。
答案 1 :(得分:1)
简单-
library(splitstackshape)

# Split the UniID column on ":" and stack the pieces in "long" form, i.e. one
# output row per identifier, with the remaining columns repeated.
cSplit(df, splitCols = "UniID", sep = ":", direction = "long")
Division Gene IDs IDs.Links UniID Refseq_IDs Orthology
1: Main data ABI3BP 17265 17265 B4DSV9 NP_056244.2:XP_005247340.1 Mouse:Abi3bp|
2: Main data ABI3BP 17265 17265 D3YTG3 NP_056244.2:XP_005247340.1 Mouse:Abi3bp|
3: Main data ABI3BP 17265 17265 E9PPR9 NP_056244.2:XP_005247340.1 Mouse:Abi3bp|
4: Main data ABI3BP 17265 17265 E9PRB5 NP_056244.2:XP_005247340.1 Mouse:Abi3bp|
5: Main data ABI3BP 17265 17265 H0Y897 NP_056244.2:XP_005247340.1 Mouse:Abi3bp|
6: Main data ADIPOQ 13633 13633 Q15848 NP_001171271.1:NP_004788.1 Mouse:Adipoq|
7: Main data AEBP1 303 303 C9JLQ8 NP_001120.3 Mouse:Aebp1|
8: Main data AEBP1 303 303 H7C0W8 NP_001120.3 Mouse:Aebp1|
9: Main data AEBP1 303 303 H7C1J5 NP_001120.3 Mouse:Aebp1|
10: Second data AGRN 329 329 H0Y5U1 NP_940978.2:XP_005244806.1:XP_006710696.1 Mouse:Agrn|
11: Second data AGRN 329 329 O00468 NP_940978.2:XP_005244806.1:XP_006710696.1 Mouse:Agrn|
12: Second data AMBN 452 452 Q9NP70 NP_057603.1 Mouse:Ambn|
13: Third data AMELX 461 461 Q99217 NP_001133.1:NP_872621.1:NP_872622.1 Mouse:Amelx|