我正在寻找一种方法,把数据框(data frame)中的每一行按年份拆分成多行。
我的测试输入数据如下所示:
# Build the example input table.
# Every field in the raw text is padded with a space after the ";" separator,
# and the last line ends with a trailing space. strip.white = TRUE keeps that
# padding out of the character column col2 (otherwise it is read as
# " coltest" / " coltest1 " and the two "coltest1" rows do not even match).
data <- read.table(text = "group; yr1; yr2; val; col2
a; 1927; 1934; -140; coltest
a; 1953; 1955; -480; coltest
b; 1957; 1958; -280; coltest1
b; 1961; 1965; -1420; coltest1 ",
  sep = ";", header = TRUE, strip.white = TRUE, stringsAsFactors = FALSE
)
我想要的是:把 val 按区间内的年数平均分摊,算出每一年的值,并让每一年各占一行,如下所示:
group; yr1; yr2; val; col2
a; 1927; 1928; -20; coltest
a; 1928; 1929; -20; coltest
a; 1929; 1930; -20; coltest
a; 1930; 1931; -20; coltest
a; 1931; 1932; -20; coltest
a; 1932; 1933; -20; coltest
a; 1933; 1934; -20; coltest
a; 1953; 1954; -240; coltest
a; 1954; 1955; -240; coltest
b; 1957; 1958; -280; coltest1
b; 1961; 1962; -355; coltest1
b; 1962; 1963; -355; coltest1
b; 1963; 1964; -355; coltest1
b; 1964; 1965; -355; coltest1
我可以像下面这样计算出每年的值,但不知道如何把每条记录拆分成单独的行。
# Per-year value: the interval total divided by the number of years it spans.
data$new <- with(data, val / (yr2 - yr1))
答案 0 :(得分:5)
library(data.table)

# Turn `data` into a data.table in place.
setDT(data)

# Tag every input row with its row number so each row forms its own group.
data[, SNO := .I]

# Overwrite val with the per-year share of the interval total.
data[, val := val / (yr2 - yr1)]

# For each input row emit one row per year from yr1 to yr2 - 1, drop the
# helper id, then derive the end year. The trailing [] prints the result.
data[, .(yr = yr1:(yr2 - 1), val), by = .(group, SNO)][
  , SNO := NULL
][
  , yr2 := yr + 1
][]
输出
# group yr val yr2
# 1: a 1927 -20 1928
# 2: a 1928 -20 1929
# 3: a 1929 -20 1930
# 4: a 1930 -20 1931
# 5: a 1931 -20 1932
# 6: a 1932 -20 1933
# 7: a 1933 -20 1934
# 8: a 1953 -240 1954
# 9: a 1954 -240 1955
# 10: b 1957 -280 1958
# 11: b 1961 -355 1962
# 12: b 1962 -355 1963
# 13: b 1963 -355 1964
# 14: b 1964 -355 1965
答案 1 :(得分:3)
可以使用我的“splitstackshape”包中的 expandRows 函数,
再配合“data.table”的链式(复合)语句:
library(splitstackshape)

# Steps, innermost first:
#   1. convert `data` to a data.table, keeping the row names in column `rn`;
#   2. add diff = number of years spanned and turn val into the per-year value;
#   3. expandRows() repeats each row according to its `diff` count;
#   4. within each original row (group, rn), offset yr1 by 0, 1, 2, ...;
#   5. set yr2 to the following year; the trailing [] prints the result.
expandRows(
  as.data.table(data, keep.rownames = TRUE)[
    , diff := yr2 - yr1
  ][
    , val := val / diff
  ],
  "diff"
)[
  , yr1 := yr1 + sequence(.N) - 1L,
  by = .(group, rn)
][
  , yr2 := yr1 + 1
][]
# rn group yr1 yr2 val
# 1: 1 a 1927 1928 -20
# 2: 1 a 1928 1929 -20
# 3: 1 a 1929 1930 -20
# 4: 1 a 1930 1931 -20
# 5: 1 a 1931 1932 -20
# 6: 1 a 1932 1933 -20
# 7: 1 a 1933 1934 -20
# 8: 2 a 1953 1954 -240
# 9: 2 a 1954 1955 -240
# 10: 3 b 1957 1958 -280
# 11: 4 b 1961 1962 -355
# 12: 4 b 1962 1963 -355
# 13: 4 b 1963 1964 -355
# 14: 4 b 1964 1965 -355
与 @beginneR 的方法相比,这种方法效率更高,但纯“data.table”的方法还要更快。
以下是仅用 1000 行数据所做的比较:
函数……
# Benchmark candidate: dplyr rowwise() + do().
# Reads the global `data` and returns one output row per year of each
# input row's [yr1, yr2) interval.
beginneR <- function() {
  # Expand a single input row into its per-year rows.
  expand_one_row <- function(row) {
    data.frame(
      group = row$group,
      yr1 = row$yr1:(row$yr2 - 1),
      yr2 = (row$yr1 + 1):row$yr2,
      val = row$val / (row$yr2 - row$yr1),
      stringsAsFactors = FALSE
    )
  }

  data %>%
    rowwise() %>%
    do(expand_one_row(.))
}
# Benchmark candidate: splitstackshape::expandRows() plus data.table updates.
# Reads the global `data` and returns the expanded per-year table.
ananda <- function() {
  dt <- as.data.table(data, keep.rownames = TRUE)

  # Number of years spanned, then the per-year value.
  dt[, diff := yr2 - yr1]
  dt[, val := val / diff]

  # Repeat every row according to its `diff` count.
  expanded <- expandRows(dt, "diff")

  # Within each original row, shift yr1 by 0, 1, 2, ... and set yr2 to the
  # following year.
  expanded[, yr1 := yr1 + sequence(.N) - 1L, by = list(group, rn)]
  expanded[, yr2 := yr1 + 1]

  expanded[]
}
# Benchmark candidate: pure data.table.
# Reads the global `data` and returns one row per year of each input interval.
codoremifa <- function() {
  dt <- as.data.table(data)

  # Row id so that every input row is expanded as its own group.
  dt[, SNO := .I]
  dt[, val := val / (yr2 - yr1)]

  # One output row per year from yr1 to yr2 - 1.
  yearly <- dt[, list(yr = yr1:(yr2 - 1), val), by = list(group, SNO)]

  yearly[, SNO := NULL]
  yearly[, yr2 := yr + 1]

  yearly[]
}
计时结果……
# Stack 250 copies of the example input to get a larger benchmark table.
data <- do.call(rbind, rep(list(data), 250L))
dim(data)
# [1] 1000 4
# Time a single run of the dplyr rowwise()/do() implementation.
# The commented lines below each call are the output recorded by the author.
system.time(beginneR())
# |====================================|100% ~0 s remaining
# user system elapsed
# 2.408 0.000 2.297
# Time a single run of the expandRows() implementation.
system.time(ananda())
# user system elapsed
# 0.000 0.000 0.017
# Compare the two data.table-based implementations over repeated runs
# (neval = 100 in the recorded output below).
library(microbenchmark)
microbenchmark(ananda(), codoremifa())
# Unit: milliseconds
# expr min lq mean median uq max neval
# ananda() 16.791794 17.048305 18.096050 17.786861 18.537067 22.34243 100
# codoremifa() 8.018706 8.201175 8.649698 8.406204 8.649132 13.87685 100
答案 2 :(得分:2)
这可能不是效率最高的解决方案,但它能得到所需的输出:
library(dplyr)

# For each input row, build a small data frame with one row per year:
# yr1 runs from the start year to the year before the end year, yr2 is the
# following year, and val is the interval total divided by the year count.
data %>%
  rowwise() %>%
  do(
    data.frame(
      group = .$group,
      yr1 = .$yr1:(.$yr2 - 1L),
      yr2 = (.$yr1 + 1L):.$yr2,
      val = .$val / (.$yr2 - .$yr1),
      stringsAsFactors = FALSE
    )
  )
#Source: local data frame [14 x 4]
#Groups: <by row>
#
# group yr1 yr2 val
#1 a 1927 1928 -20
#2 a 1928 1929 -20
#3 a 1929 1930 -20
#4 a 1930 1931 -20
#5 a 1931 1932 -20
#6 a 1932 1933 -20
#7 a 1933 1934 -20
#8 a 1953 1954 -240
#9 a 1954 1955 -240
#10 b 1957 1958 -280
#11 b 1961 1962 -355
#12 b 1962 1963 -355
#13 b 1963 1964 -355
#14 b 1964 1965 -355