R:重塑并扩展R中的数据框

时间:2014-09-02 23:54:04

标签: r dataframe reshape

我在R中的示例数据框看起来如下:

     NAME  ID SURVEY_YEAR REFERENCE_YEAR CUMULATIVE_SUM CUMULATIVE_SUM_REFYEAR
1 NAME1  47        1960           1959             -6                      0
2 NAME1  47        1961           1960            -10                     -6
3 NAME1  47        1963           1961             NA                     NA
4 NAME1  47        1965           1963            -23                    -10
5 NAME2 259        2007           2004             -9                      0
6 NAME2 259        2009           2007             NA                     NA
7 NAME2 259        2010           2009             NA                     NA
8 NAME2 259        2011           2010             NA                     NA
9 NAME2 259        2014           2011            -40                     -9

我要做的是按以下方式重新排列数据框。

NAME    ID  time            YEAR    CUMULATIVE_SUM  id
NAME1   47  REFERENCE_YEAR  1959    0               1
NAME1   47  SURVEY_YEAR     1960    -6              1
NAME1   47  REFERENCE_YEAR  1960    -6              2
NAME1   47  SURVEY_YEAR     1961    -10             2
NAME1   47  REFERENCE_YEAR  1961    NA              3
NAME1   47  SURVEY_YEAR     1963    NA              3
NAME1   47  REFERENCE_YEAR  1963    -10             4
NAME1   47  SURVEY_YEAR     1965    -23             4
NAME2   259 REFERENCE_YEAR  2004    0               5
NAME2   259 SURVEY_YEAR     2007    -9              5
NAME2   259 REFERENCE_YEAR  2007    NA              6
NAME2   259 SURVEY_YEAR     2009    NA              6
NAME2   259 REFERENCE_YEAR  2009    NA              7
NAME2   259 SURVEY_YEAR     2010    NA              7
NAME2   259 REFERENCE_YEAR  2010    NA              8
NAME2   259 SURVEY_YEAR     2011    NA              8
NAME2   259 REFERENCE_YEAR  2011    -9              9
NAME2   259 SURVEY_YEAR     2014    -40             9    

我尝试用下面的方法来实现,但得到的结果并不是我想要的。

# Example input: one row per survey, carrying both the survey year and the
# reference year together with the cumulative sum recorded for each of them.
data <- data.frame(
  NAME = rep(c("NAME1", "NAME2"), times = c(4, 5)),
  ID = rep(c(47, 259), times = c(4, 5)),
  SURVEY_YEAR = c(1960, 1961, 1963, 1965, 2007, 2009, 2010, 2011, 2014),
  REFERENCE_YEAR = c(1959, 1960, 1961, 1963, 2004, 2007, 2009, 2010, 2011),
  CUMULATIVE_SUM = c(-6, -10, NA, -23, -9, NA, NA, NA, -40),
  CUMULATIVE_SUM_REFYEAR = c(0, -6, NA, -10, 0, NA, NA, NA, -9)
)

# Wide-to-long attempt: the two year columns are stacked into YEAR and the two
# sum columns into CUMULATIVE_SUM; `time` records which source column each
# stacked row came from.
dat3 <- reshape(
  data,
  direction = "long",
  varying = list(
    c("SURVEY_YEAR", "REFERENCE_YEAR"),
    c("CUMULATIVE_SUM", "CUMULATIVE_SUM_REFYEAR")
  ),
  times = c("SURVEY_YEAR", "REFERENCE_YEAR"),
  v.names = c("YEAR", "CUMULATIVE_SUM")
)

有没有人知道如何正确地重塑这个数据框,使其与上面期望的结果一致?谢谢。

4 个答案:

答案 0 :(得分:1)

只需重新排序您已有的数据

# Sort the reshaped rows by NAME, ID and YEAR; id breaks ties between rows
# that share the same YEAR.
dat3 <- dat3[order(dat3$NAME, dat3$ID, dat3$YEAR, dat3$id), ]

或者如果你想要高性能

library(data.table)
# Same ordering with data.table: turn dat3 into a data.table, then key it on
# the four sort columns.
setDT(dat3)
setkeyv(dat3, c("NAME", "ID", "YEAR", "id"))

答案 1 :(得分:0)

这样的事情应该有效

# Tag every input row with a running id so that the two rows derived from it
# can be paired up again after stacking. seq_len() keeps this correct for any
# number of rows instead of a hard-coded 1..9.
data["id"] <- seq_len(nrow(data))

# Survey-year half: keep SURVEY_YEAR and zero out REFERENCE_YEAR so that the
# sum of the two year columns below is just the survey year.
sursub <- data
sursub["time"] <- "SURVEY_YEAR"
sursub["REFERENCE_YEAR"] <- 0

# Reference-year half: keep REFERENCE_YEAR, zero out SURVEY_YEAR, and carry the
# cumulative sum that belongs to the reference year.
refsub <- data
refsub["time"] <- "REFERENCE_YEAR"
refsub["SURVEY_YEAR"] <- 0
refsub[["CUMULATIVE_SUM"]] <- refsub[["CUMULATIVE_SUM_REFYEAR"]]

# Stack the halves with the reference rows first: order() leaves ties in their
# original order, so within each id the REFERENCE_YEAR row precedes the
# SURVEY_YEAR row. (Named `combined` to avoid masking base::merge.)
combined <- rbind(refsub, sursub)

sorted <- combined[order(combined$id), ]

# Exactly one of the two year columns is non-zero per row, so their sum is the
# observation year.
sorted["year"] <- sorted[["SURVEY_YEAR"]] + sorted[["REFERENCE_YEAR"]]

之后只需删除不需要的列即可。

答案 2 :(得分:0)

或者您可以使用dplyr和tidyr

 library(dplyr)
 library(tidyr) 
 # Stack the two year columns into time/YEAR pairs, keep the cumulative sum
 # that matches each kind of year, sort by NAME, ID and YEAR, and show the
 # first four rows.
 data %>%
   gather(time, YEAR, SURVEY_YEAR, REFERENCE_YEAR) %>%
   mutate(
     CUMULATIVE_SUM = ifelse(
       time == "SURVEY_YEAR",
       CUMULATIVE_SUM,
       CUMULATIVE_SUM_REFYEAR
     )
   ) %>%
   arrange(NAME, ID, YEAR, row_number()) %>%
   select(NAME, ID, time, YEAR, CUMULATIVE_SUM) %>%
   head(4)
 #      NAME ID           time YEAR CUMULATIVE_SUM
 #1 NAME1 47 REFERENCE_YEAR 1959              0
 #2 NAME1 47    SURVEY_YEAR 1960             -6
 #3 NAME1 47 REFERENCE_YEAR 1960             -6
 #4 NAME1 47    SURVEY_YEAR 1961            -10

答案 3 :(得分:0)

尝试:

# Build the long table in one vectorised step instead of appending row by row.
# Assigning c(<character>, <numeric>, ...) to a data-frame row coerces every
# column to character; building each column from its own vector keeps ID, YEAR
# and CUMULATIVE_SUM numeric (so missing sums print as NA rather than <NA>).
n_rows <- nrow(data)

# Row names of `data` serve as the pairing id; numeric-looking names are
# converted back to numbers.
row_ids <- type.convert(rownames(data), as.is = TRUE)

# One REFERENCE_YEAR record per input row ...
ref_rows <- data.frame(
  NAME = as.character(data[["NAME"]]),
  ID = data[["ID"]],
  time = rep("REFERENCE_YEAR", n_rows),
  YEAR = data[["REFERENCE_YEAR"]],
  CUMULATIVE_SUM = data[["CUMULATIVE_SUM_REFYEAR"]],
  id = row_ids,
  stringsAsFactors = FALSE
)

# ... and one SURVEY_YEAR record per input row.
sur_rows <- data.frame(
  NAME = as.character(data[["NAME"]]),
  ID = data[["ID"]],
  time = rep("SURVEY_YEAR", n_rows),
  YEAR = data[["SURVEY_YEAR"]],
  CUMULATIVE_SUM = data[["CUMULATIVE_SUM"]],
  id = row_ids,
  stringsAsFactors = FALSE
)

# Interleave the halves: input row k becomes output rows 2k - 1 (reference)
# and 2k (survey). rbind() of the two index vectors followed by c() reads the
# 2 x n matrix column by column, which yields 1, n + 1, 2, n + 2, ...
interleave <- c(rbind(seq_len(n_rows), seq_len(n_rows) + n_rows))
ddf2 <- rbind(ref_rows, sur_rows)[interleave, ]
rownames(ddf2) <- NULL

ddf2
    NAME  ID           time YEAR CUMULATIVE_SUM id
1  NAME1  47 REFERENCE_YEAR 1959              0  1
2  NAME1  47    SURVEY_YEAR 1960             -6  1
3  NAME1  47 REFERENCE_YEAR 1960             -6  2
4  NAME1  47    SURVEY_YEAR 1961            -10  2
5  NAME1  47 REFERENCE_YEAR 1961           <NA>  3
6  NAME1  47    SURVEY_YEAR 1963           <NA>  3
7  NAME1  47 REFERENCE_YEAR 1963            -10  4
8  NAME1  47    SURVEY_YEAR 1965            -23  4
9  NAME2 259 REFERENCE_YEAR 2004              0  5
10 NAME2 259    SURVEY_YEAR 2007             -9  5
11 NAME2 259 REFERENCE_YEAR 2007           <NA>  6
12 NAME2 259    SURVEY_YEAR 2009           <NA>  6
13 NAME2 259 REFERENCE_YEAR 2009           <NA>  7
14 NAME2 259    SURVEY_YEAR 2010           <NA>  7
15 NAME2 259 REFERENCE_YEAR 2010           <NA>  8
16 NAME2 259    SURVEY_YEAR 2011           <NA>  8
17 NAME2 259 REFERENCE_YEAR 2011             -9  9
18 NAME2 259    SURVEY_YEAR 2014            -40  9