我真的很努力地在不借助循环的情况下实现这种转换。我的数据框采用长格式,因此每年,年级和学校组合都有自己的行。
我想将每年+年级+学校的值除以前一年和年级的值。下面的代码应该为我完成此操作,但是如果有50万行,则需要几天的时间才能完成。
关于如何更快地执行此操作的任何想法?
我曾尝试使用dplyr,但没有成功。与标准基础R转换方法相同。
for (i in 1:NROW(df)) {
for (j in 1:NROW(df)) {
if(df$COUNTY[i] == df$COUNTY[j] &
df$YEAR[i] == (df$YEAR[j] + 1) &
df$Grade[i] == (df$Grade[j] + 1)){
df$RATE[i] <- df$value[i] / df$value[j]
} else{
next
}
}
if(i %% 10 == 0){print(i)}
}
数据:
structure(list(YEAR = c(2011, 2011, 2011, 2011, 2011, 2012, 2012,
2012, 2012, 2012, 2013, 2013, 2013, 2013, 2013, 2014, 2014, 2014,
2014, 2014), Grade = c(-1, 0, 1, 2, 3, -1, 0, 1, 2, 3, -1, 0,
1, 2, 3, -1, 0, 1, 2, 3), COUNTY = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("001", "002", "003", "004", "005", "006", "007",
"008", "009", "010", "011", "012", "013", "014", "015", "016",
"017", "018", "019", "020", "021", "022", "023", "024", "025",
"026", "027", "028", "029", "030", "031", "032", "033", "034",
"035", "036", "037", "038", "039", "040", "041", "042", "043",
"044", "045", "046", "047", "048", "049", "050", "051", "052",
"053", "054", "055", "056", "057", "058", "059", "060", "061",
"062", "063", "064", "065", "066", "067", "068", "069", "070",
"071", "072", "073", "074", "075", "076", "077", "078", "079",
"080", "081", "082", "083", "084", "085", "086", "087", "088",
"089", "090", "091", "092", "093", "094", "095", "096", "097",
"098", "099", "100", "101", "102", "103", "104", "105", "106",
"107", "108", "109", "110", "111", "112", "113", "114", "115",
"126", "145", "166", "201", "347", "401", "640", "KCS"), class = "factor"),
value = c(178, 212, 208, 208, 242, 199, 230, 227, 208, 208,
187, 245, 235, 216, 204, 189, 235, 250, 226, 217)), row.names = c(NA,
-20L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), .internal.selfref = <pointer: 0x000001d7929a1ef0>, groups = structure(list(
YEAR = c(2011, 2011, 2011, 2011, 2011, 2012, 2012, 2012,
2012, 2012, 2013, 2013, 2013, 2013, 2013, 2014, 2014, 2014,
2014, 2014), Grade = c(-1, 0, 1, 2, 3, -1, 0, 1, 2, 3, -1,
0, 1, 2, 3, -1, 0, 1, 2, 3), .rows = list(1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame")))
答案 0 :(得分:2)
使用data.table
library(data.table)
setDT(df)[order(YEAR), Ratio := value/shift(value) , .(COUNTY, Grade)]
答案 1 :(得分:1)
最简单的方法是消除内部for-loop
并使用向量化函数。
示例:
df$RATE2 <- 0
for(i in seq(nrow(df))){
indx <- which(df$COUNTY[i] == df$COUNTY &
df$YEAR[i] == (df$YEAR + 1) &
df$Grade[i] == (df$Grade + 1))
if((n <- length(indx)) > 1)
stop("Error, rowcount too great!")
else if(n == 1)
df$RATE2[i] <- df$value[i] / df$value[indx]
}
all.equal(df$RATE, df$RATE2)
[1] TRUE
请注意,&
将对向量中的所有元素执行比较,因此逻辑语句将为数据帧中的每一行返回TRUE
或FALSE
。为了方便起见(通常不浪费太多时间),我使用which
将其转换为索引向量,并且如果长度仅为1(不覆盖多个元素),则覆盖{{1} }和适当的索引。
比较:
RATE2
从中值时间可以看到,我们现在仅使用原始时间的microbenchmark:::microbenchmark(original = {
df$RATE <- 0
for (i in 1:NROW(df)) {
for (j in 1:NROW(df)) {
if(df$COUNTY[i] == df$COUNTY[j] &&
df$YEAR[i] == (df$YEAR[j] + 1) &&
df$Grade[i] == (df$Grade[j] + 1)){
df$RATE[i] <- df$value[i] / df$value[j]
}
}
}
}, improved = {
df$RATE2 <- 0
for(i in seq(nrow(df))){
indx <- which(df$COUNTY[i] == df$COUNTY & df$YEAR[i] == (df$YEAR + 1) & df$Grade[i] == (df$Grade + 1))
if((n <- length(indx)) > 1)
stop("Error, rowcount too great!")
else if(n == 1)
df$RATE2[i] <- df$value[i] / df$value[indx]
}
})
#output:
Unit: milliseconds
expr min lq mean median uq max neval
original 15.452877 19.751258 26.944155 23.750028 33.886566 70.93348 100
improved 1.020224 1.221664 2.121345 1.730265 2.311173 17.56658 100
进行计算。
请注意,使用1.73/23.75 * 100 = 7.3 %
不会不能加快此过程,这是通过使用矢量化函数来完成的。还要注意,我稍微更改了原始功能的代码以使用apply
,并删除了多余的&&
部分。这样可以稍微加快此代码版本的速度。
答案 2 :(得分:1)
您可以在dplyr
中做到这一点,这应该很快……
library(dplyr)
df <- df %>% group_by(COUNTY, Grade) %>% #for your df above, but replace with SCHOOL or whatever
arrange(YEAR) %>% #sort by increasing year
mutate(Ratio = value/lag(value)) #value for year / value for previous year
df
YEAR Grade COUNTY value Ratio
<dbl> <dbl> <fct> <dbl> <dbl>
1 2011 -1 001 178 NA
2 2011 0 001 212 NA
3 2011 1 001 208 NA
4 2011 2 001 208 NA
5 2011 3 001 242 NA
6 2012 -1 001 199 1.12
7 2012 0 001 230 1.08
8 2012 1 001 227 1.09
9 2012 2 001 208 1
10 2012 3 001 208 0.860
11 2013 -1 001 187 0.940
12 2013 0 001 245 1.07
13 2013 1 001 235 1.04
14 2013 2 001 216 1.04
15 2013 3 001 204 0.981
16 2014 -1 001 189 1.01
17 2014 0 001 235 0.959
18 2014 1 001 250 1.06
19 2014 2 001 226 1.05
20 2014 3 001 217 1.06
答案 3 :(得分:1)
只需使用向量化的ifelse
移列即可。下面假设 COUNTY 是一个因子变量(不是字符):
# SHIFT COLUMNS FORWARD
df$COUNTY_SHIFT <- factor(levels(df$COUNTY)[c(0, df$COUNTY[1:(nrow(df)-1)])]
df$YEAR_SHIFT <- c(NA, df$YEAR[1:(nrow(df)-1)])
df$Grade_SHIFT <- c(NA, df$Grade[1:(nrow(df)-1)])
df$value_SHIFT <- c(NA, df$value[1:(nrow(df)-1)])
# CONDITIONALLY ASSIGN
df$RATE <- ifelse(df$COUNTY == df$COUNTY_SHIFT &
df$YEAR == df$YEAR_SHIFT &
df$Grade == df$Grade_SHIFT,
df$value / df$value_SHIFT,
NA)
或全部在within
上下文中:
df <- within(df, {
# SHIFT COLUMNS FORWARD
COUNTY_SHIFT <- factor(levels(COUNTY)[c(0, COUNTY[1:(nrow(df)-1)])]
YEAR_SHIFT <- c(NA, YEAR[1:(nrow(df)-1)])
Grade_SHIFT <- c(NA, Grade[1:(nrow(df)-1)])
value_SHIFT <- c(NA, value[1:(nrow(df)-1)])
# CONDITIONALLY ASSIGN
RATE <- ifelse(COUNTY == COUNTY_SHIFT &
YEAR == YEAR_SHIFT &
Grade == Grade_SHIFT,
value / value_SHIFT,
NA)
# REMOVE HELPER COLUMNS
rm(COUNTY_SHIFT, YEAR_SHIFT, Grade_SHIFT, value_SHIFT)
})
或者,在移位的数据帧上合并:
df$ID <- 1:nrow(df)
shifted_df <- merge(transform(df, ID=ID-1), df[-1,], by="ID", suffixes=c("", "_SHIFT"))
final_df <- within(shifted_df , {
# CONDITIONALLY ASSIGN
RATE <- ifelse(COUNTY == COUNTY_SHIFT &
YEAR == YEAR_SHIFT &
Grade == Grade_SHIFT,
value / value_SHIFT,
NA)
# REMOVE HELPER COLUMNS
rm(COUNTY_SHIFT, YEAR_SHIFT, Grade_SHIFT, value_SHIFT)
})