我有一些看起来像这样的数据:
ID Data
1 a b c
2 a b c a b c
3 a b c
4 a b c a b c a b c a b c
5 a b c a b c a b c
我想把它整理成下面这样:
ID Data
1 a b c
2 a b c
2 a b c
3 a b c
4 a b c
4 a b c
4 a b c
4 a b c
5 a b c
5 a b c
5 a b c
a,b 和 c 位于不同的列单元格中,因此 Data 实际上是多列。如果需要,我可以把它们连接起来。
基本上 a,b 和 c 是相关的,但对于某些 ID,我有多个结果。我希望数据采用长格式而不是宽格式,同时让每一行都保留 ID。
如果更容易的话,我也可以用 R 来完成。
Dput:
# Sample data as produced by dput(): an 18-row data.frame with an integer ID
# column followed by five groups of factor columns (a/b/c, a.1/b.1/c.1, ...,
# a.4/b.4/c.4). In the suffixed groups the empty-string level "" marks a cell
# with no value for that ID; the first group (a, b, c) is filled on every row.
structure(list(ID = c(9999812L, 999908L, 9993595L, 9992905L,
9989664L, 9984487L, 9980956L, 9980112L, 9980091L, 9979915L, 9979613L,
9979400L, 9978215L, 9976882L, 9975335L, 9974511L, 9973804L, 9973025L
), a = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "a", class = "factor"),
b = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "b", class = "factor"),
c = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "c", class = "factor"),
a.1 = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L), .Label = c("", "a"), class = "factor"),
b.1 = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L), .Label = c("", "b"), class = "factor"),
c.1 = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L), .Label = c("", "c"), class = "factor"),
a.2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "a"), class = "factor"),
b.2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "b"), class = "factor"),
c.2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "c"), class = "factor"),
a.3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "a"), class = "factor"),
b.3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "b"), class = "factor"),
c.3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "c"), class = "factor"),
a.4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), .Label = c("", "a"), class = "factor"),
b.4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), .Label = c("", "b"), class = "factor"),
c.4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), .Label = c("", "c"), class = "factor")), .Names = c("ID",
"a", "b", "c", "a.1", "b.1", "c.1", "a.2", "b.2", "c.2", "a.3",
"b.3", "c.3", "a.4", "b.4", "c.4"), class = "data.frame", row.names = c(NA,
-18L))
答案 0 :(得分:1)
使用您的初始数据,您可以用 "stringr" 和 "reshape2" 来融化(melt)数据。
# Turn each space-separated "a b c" triple stored in `y` into its own row,
# repeating the id from `x` on every row (wide string -> long format).
library("stringr")
library("reshape2")

dt <- data.frame(
  x = 1:5,
  y = c(
    "a b c", "a b c a b c", "a b c", "a b c a b c a b c a b c",
    "a b c a b c a b c"
  )
)

# One list element per input row, holding every triple found in that row.
triple_pattern <- "(\\w)\\s(\\w)\\s(\\w)(\\1\\s\\2\\s\\3)*"
triples_per_row <- str_extract_all(dt$y, triple_pattern)
widest <- max(lengths(triples_per_row))

# Pad every row's matches with NA up to the widest row so that rbind()
# produces a rectangular matrix.
padded <- lapply(triples_per_row, function(found) {
  length(found) <- widest
  found
})

wide <- data.frame(x = cbind(dt$x), do.call("rbind", padded))

# Melt to long format and discard the NA cells introduced by the padding.
dtnew1 <- melt(wide, id.vars = "x")
dtnew1 <- dtnew1[!is.na(dtnew1$value), ]
dtnew1[order(dtnew1$x), ]
> dtnew1[order(dtnew1$x),c(1,3)]
x value
1 1 a b c
2 2 a b c
7 2 a b c
3 3 a b c
4 4 a b c
9 4 a b c
14 4 a b c
19 4 a b c
5 5 a b c
10 5 a b c
15 5 a b c
>
编辑:对于更新后的数据,请创建一个名为 "concat" 的字段,
它是列 "a" 到 "c.4" 的连接值。
您可以使用:concat <- data.frame(concat=do.call("paste0",dt[,2:length(dt)]))
来连接字段,
然后分配dt$concat <- concat
# Reshape the concatenated strings in dt$concat (e.g. "abcabc") to long
# format: one row per three-character group, with dt$ID repeated on each row.
# Expects `dt` to already hold the `ID` and `concat` columns.
library("stringr")
library("reshape2")

# dt$concat may be a one-column data.frame (when a data.frame was assigned to
# it) or a factor instead of a plain character vector. Flatten it first so
# that str_extract_all() sees exactly one string per row of `dt`.
concat_chr <- as.character(unlist(dt$concat, use.names = FALSE))

# Extract once and reuse the result for both the width and the padding.
list_lists <- str_extract_all(concat_chr, "(\\w)(\\w)(\\w)")
maxlen <- max(lengths(list_lists))

# Pad each row's matches with NA so rbind() yields a rectangular matrix.
li <- lapply(list_lists, `length<-`, maxlen)
dtnew <- data.frame(x = cbind(dt$ID), y = do.call("rbind", li))

# Melt to long format and drop the NA cells introduced by the padding.
dtnew1 <- melt(dtnew, id.vars = "x")
dtnew1 <- dtnew1[!is.na(dtnew1$value), ]
dtnew1[order(dtnew1$x), c(1, 3)]
> dtnew1[order(dtnew1$x),c(1,3)]
x value
2 999908 abc
18 9973025 abc
36 9973025 abc
54 9973025 abc
72 9973025 abc
90 9973025 abc
17 9973804 abc
35 9973804 abc
16 9974511 abc
15 9975335 abc
33 9975335 abc
51 9975335 abc
69 9975335 abc
14 9976882 abc
13 9978215 abc
12 9979400 abc
30 9979400 abc
48 9979400 abc
11 9979613 abc
10 9979915 abc
9 9980091 abc
8 9980112 abc
7 9980956 abc
6 9984487 abc
24 9984487 abc
5 9989664 abc
4 9992905 abc
3 9993595 abc
1 9999812 abc
19 9999812 abc
>