将R数据帧拆分为多行

时间:2014-12-02 12:53:28

标签: r split dataframe

我正在寻找一种方法,把数据框中的每一行按年份拆分成多行。

我的测试输入数据如下所示

   # Build the sample input. strip.white = TRUE removes the padding around the
   # ";"-separated fields, so character columns hold "a" / "coltest" rather
   # than "    a" / " coltest".
   data <- read.table(text ="group;  yr1; yr2; val; col2
    a;  1927;   1934;   -140; coltest
    a;  1953;   1955;   -480; coltest
    b;  1957;   1958;   -280; coltest1
    b;  1961;   1965;   -1420; coltest1 ", sep = ";", header = TRUE,
                      strip.white = TRUE, stringsAsFactors = FALSE)

我想要的是一种方法:计算出每一年的数值,并把每一年单独写成一行,如下所示:

group; yr1;    yr2;   val;   col2
    a; 1927; 1928;    -20;   coltest 
    a; 1928; 1929;    -20;   coltest
    a; 1929; 1930;    -20;   coltest
    a; 1930; 1931;    -20;   coltest
    a; 1931; 1932;    -20;   coltest
    a; 1932; 1933;    -20;   coltest
    a; 1933; 1934;    -20;   coltest
    a; 1953; 1954;   -240;   coltest
    a; 1954; 1955;   -240;   coltest
    b; 1957; 1958;   -280;   coltest1
    b; 1961; 1962;   -355;   coltest1
    b; 1962; 1963;   -355;   coltest1
    b; 1963; 1964;   -355;   coltest1
    b; 1964; 1965;   -355;   coltest1

我可以像下面这样计算出每一年的数值,但不知道如何把结果拆分成单独的行。

# Per-year value: the total divided by the number of years the row spans.
data$new <- with(data, val / (yr2 - yr1))

3 个答案:

答案 0 :(得分:5)

library(data.table)

# Convert `data` to a data.table in place, tag every source row with its
# index (SNO) and replace val by the per-year amount.
setDT(data)
data[, `:=`(SNO = .I, val = val / (yr2 - yr1))]

# Emit one row per year of each source row. Grouping by SNO keeps rows that
# share a group apart; SNO is dropped afterwards and yr2 is rebuilt from yr.
# The trailing [] makes the result print at top level.
data[, list(yr = yr1:(yr2 - 1), val), by = list(group, SNO)][
  , SNO := NULL][
  , yr2 := yr + 1][]

输出

#     group   yr  val  yr2
#  1:     a 1927  -20 1928
#  2:     a 1928  -20 1929
#  3:     a 1929  -20 1930
#  4:     a 1930  -20 1931
#  5:     a 1931  -20 1932
#  6:     a 1932  -20 1933
#  7:     a 1933  -20 1934
#  8:     a 1953 -240 1954
#  9:     a 1954 -240 1955
# 10:     b 1957 -280 1958
# 11:     b 1961 -355 1962
# 12:     b 1962 -355 1963
# 13:     b 1963 -355 1964
# 14:     b 1964 -355 1965

答案 1 :(得分:3)

可以使用我编写的“splitstackshape”包中的 expandRows,再配合“data.table”的链式语句:

library(splitstackshape)

# 1. Keep the row index as "rn", add the span in years ("diff") and the
#    per-year value.
# 2. expandRows() repeats every row "diff" times.
# 3. Within each original row, shift yr1 by 0, 1, 2, ... and rebuild yr2.
expandRows(
  as.data.table(data, keep.rownames = TRUE)[
    , `:=`(diff = yr2 - yr1, val = val / (yr2 - yr1))],
  "diff"
)[, yr1 := yr1 + seq_len(.N) - 1L, by = list(group, rn)][
  , yr2 := yr1 + 1][]
#     rn group  yr1  yr2  val
#  1:  1     a 1927 1928  -20
#  2:  1     a 1928 1929  -20
#  3:  1     a 1929 1930  -20
#  4:  1     a 1930 1931  -20
#  5:  1     a 1931 1932  -20
#  6:  1     a 1932 1933  -20
#  7:  1     a 1933 1934  -20
#  8:  2     a 1953 1954 -240
#  9:  2     a 1954 1955 -240
# 10:  3     b 1957 1958 -280
# 11:  4     b 1961 1962 -355
# 12:  4     b 1962 1963 -355
# 13:  4     b 1963 1964 -355
# 14:  4     b 1964 1965 -355

与 @beginneR 的方法相比,这种方法效率更高,但纯“data.table”的方法还要更快。

下面是在仅有 1000 行的数据上做的比较:

函数定义......

# dplyr approach: process the global `data` one row at a time and build a
# small data frame holding one line per year of that row.
beginneR <- function() {
  by_row <- rowwise(data)
  do(
    by_row,
    data.frame(
      group = .$group,
      yr1 = .$yr1:(.$yr2 - 1),
      yr2 = (.$yr1 + 1):.$yr2,
      val = .$val / (.$yr2 - .$yr1),
      stringsAsFactors = FALSE
    )
  )
}

# splitstackshape approach: repeat each row of the global `data` once per
# year it spans, then renumber the years inside every original row.
ananda <- function() {
  # "rn" keeps the original row index; "diff" is the span in years.
  dt <- as.data.table(data, keep.rownames = TRUE)
  dt[, `:=`(diff = yr2 - yr1, val = val / (yr2 - yr1))]

  # Repeat each row "diff" times.
  long <- expandRows(dt, "diff")

  # Shift yr1 by 0, 1, 2, ... within each original row, then rebuild yr2.
  long[, yr1 := yr1 + seq_len(.N) - 1L, by = list(group, rn)]
  long[, yr2 := yr1 + 1][]
}

# Pure data.table approach: expand each row of the global `data` into one
# row per year, using a per-row serial number (SNO) as the grouping key.
codoremifa <- function() {
  dt <- as.data.table(data)
  dt[, `:=`(SNO = .I, val = val / (yr2 - yr1))]

  # One output row per year of every source row.
  yearly <- dt[, list(yr = yr1:(yr2 - 1), val), by = list(group, SNO)]

  # SNO was only needed for grouping; yr2 is always the following year.
  yearly[, SNO := NULL]
  yearly[, yr2 := yr + 1][]
}

计时结果......

# Stack 250 copies of the sample data to get the 1000-row benchmark input.
data <- do.call(rbind, rep(list(data), 250))
dim(data)
# [1] 1000    4

# Single-run wall-clock timings.
system.time(beginneR())
# |====================================|100% ~0 s remaining
#    user  system elapsed 
#   2.408   0.000   2.297 
system.time(ananda())
#    user  system elapsed 
#   0.000   0.000   0.017 

# Repeated timings of the two data.table-based variants.
library(microbenchmark)
microbenchmark(ananda(), codoremifa())
# Unit: milliseconds
#          expr       min        lq      mean    median        uq      max neval
#      ananda() 16.791794 17.048305 18.096050 17.786861 18.537067 22.34243   100
#  codoremifa()  8.018706  8.201175  8.649698  8.406204  8.649132 13.87685   100

答案 2 :(得分:2)

可能不是最有效的解决方案,但它会产生所需的输出:

library(dplyr)

# Handle `data` row by row: each row becomes a small data frame with one line
# per year, carrying the row's value split evenly across those years.
do(
  rowwise(data),
  data.frame(
    group = .$group,
    yr1 = .$yr1:(.$yr2 - 1L),
    yr2 = (.$yr1 + 1L):.$yr2,
    val = .$val / (.$yr2 - .$yr1),
    stringsAsFactors = FALSE
  )
)

#Source: local data frame [14 x 4]
#Groups: <by row>
#
#   group  yr1  yr2  val
#1      a 1927 1928  -20
#2      a 1928 1929  -20
#3      a 1929 1930  -20
#4      a 1930 1931  -20
#5      a 1931 1932  -20
#6      a 1932 1933  -20
#7      a 1933 1934  -20
#8      a 1953 1954 -240
#9      a 1954 1955 -240
#10     b 1957 1958 -280
#11     b 1961 1962 -355
#12     b 1962 1963 -355
#13     b 1963 1964 -355
#14     b 1964 1965 -355