Excel或R:重塑数据

时间:2017-04-21 09:02:14

标签: r excel data-manipulation

我有一些看起来像这样的数据:

ID Data

1  a b c 
2  a b c a b c
3  a b c 
4  a b c a b c a b c a b c
5  a b c a b c a b c

我想在下面的

ID Data

1 a b c
2 a b c
2 a b c
3 a b c
4 a b c
4 a b c
4 a b c
4 a b c
5 a b c
5 a b c
5 a b c

abc位于不同的列单元格中,因此Data实际上是多列。如果需要,我可以连接它们。

基本上a bc是相关的,但对于某些ID,我有多个结果,我希望数据采用长格式而不是宽格式但保留ID对于每一行。

我能做到的就是R,如果这也更容易。

Dput:

structure(list(ID = c(9999812L, 999908L, 9993595L, 9992905L, 
9989664L, 9984487L, 9980956L, 9980112L, 9980091L, 9979915L, 9979613L, 
9979400L, 9978215L, 9976882L, 9975335L, 9974511L, 9973804L, 9973025L
), a = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "a", class = "factor"), 
    b = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "b", class = "factor"), 
    c = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "c", class = "factor"), 
    a.1 = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 
    1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L), .Label = c("", "a"), class = "factor"), 
    b.1 = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 
    1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L), .Label = c("", "b"), class = "factor"), 
    c.1 = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 
    1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L), .Label = c("", "c"), class = "factor"), 
    a.2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "a"), class = "factor"), 
    b.2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "b"), class = "factor"), 
    c.2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "c"), class = "factor"), 
    a.3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "a"), class = "factor"), 
    b.3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "b"), class = "factor"), 
    c.3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "c"), class = "factor"), 
    a.4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), .Label = c("", "a"), class = "factor"), 
    b.4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), .Label = c("", "b"), class = "factor"), 
    c.4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), .Label = c("", "c"), class = "factor")), .Names = c("ID", 
"a", "b", "c", "a.1", "b.1", "c.1", "a.2", "b.2", "c.2", "a.3", 
"b.3", "c.3", "a.4", "b.4", "c.4"), class = "data.frame", row.names = c(NA, 
-18L))

1 个答案:

答案 0 :(得分:1)

使用您的初始数据,您可以使用" stringr"和" reshape2"融化数据。

dt <- data.frame(x = 1:5, y = c( "a b c" , "a b c a b c","a b c","a b c a b c a b c a b c", "a b c a b c a b c"))

library("stringr")
library("reshape2")
maxlen <- max(lengths(str_extract_all(dt$y,"(\\w)\\s(\\w)\\s(\\w)(\\1\\s\\2\\s\\3)*")))

list_lists <- str_extract_all(dt$y,"(\\w)\\s(\\w)\\s(\\w)(\\1\\s\\2\\s\\3)*")

li <- lapply(list_lists, `length<-`,maxlen)

dtnew <- data.frame(x =cbind(dt$x),do.call("rbind",li))
dtnew1 <- melt(dtnew,id.vars="x")
dtnew1 <- dtnew1[!is.na(dtnew1$value),]
dtnew1[order(dtnew1$x),]

   > dtnew1[order(dtnew1$x),c(1,3)]
   x value
1  1 a b c
2  2 a b c
7  2 a b c
3  3 a b c
4  4 a b c
9  4 a b c
14 4 a b c
19 4 a b c
5  5 a b c
10 5 a b c
15 5 a b c
> 

编辑:对于更新的数据,请创建一个名为&#34; concat&#34;的字段,它是列的连接值&#34; a&#34;到&#34; c.4&#34;

您可以使用:concat <- data.frame(concat=do.call("paste0",dt[,2:length(dt)]))来连接字段

然后分配dt$concat <- concat

library("stringr")
library("reshape2")

maxlen <- max(lengths(str_extract_all(dt$concat,"(\\w)(\\w)(\\w)")))

list_lists <- str_extract_all(dt$concat,"(\\w)(\\w)(\\w)")

li <- lapply(list_lists, `length<-`,maxlen)

dtnew <- data.frame(x =cbind(dt$ID),y=do.call("rbind",li))
dtnew1 <- melt(dtnew,id.vars="x")
dtnew1 <- dtnew1[!is.na(dtnew1$value),]
dtnew1[order(dtnew1$x),c(1,3)]

> dtnew1[order(dtnew1$x),c(1,3)]
         x value
2   999908   abc
18 9973025   abc
36 9973025   abc
54 9973025   abc
72 9973025   abc
90 9973025   abc
17 9973804   abc
35 9973804   abc
16 9974511   abc
15 9975335   abc
33 9975335   abc
51 9975335   abc
69 9975335   abc
14 9976882   abc
13 9978215   abc
12 9979400   abc
30 9979400   abc
48 9979400   abc
11 9979613   abc
10 9979915   abc
9  9980091   abc
8  9980112   abc
7  9980956   abc
6  9984487   abc
24 9984487   abc
5  9989664   abc
4  9992905   abc
3  9993595   abc
1  9999812   abc
19 9999812   abc
>