我试图用这个基本的例子解决一个更大的问题。我需要根据which()
的位置应用一个函数,因为我需要知道df1
的年份,其值为NA
或> = 150.然后我将{{1}得到均值,并将其返回到确切的行。现在我使用df2
循环并且需要更快的东西,因为我拥有的数据非常大。有没有通用的方法来做到这一点?
dput:
for()
代码:
df1 <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632"), element = c("TMAX", "TMIN", "TMAX",
"TMIN", "TMAX", "TMIN", "TMAX", "TMIN", "TMAX", "TMIN"), year = 1900:1909,
month = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), day = c(1, 1, 1,
1, 1, 1, 1, 1, 1, 1), value = c(30.02, NA, 37.94, 10.94,
NA, 28.04, 64.94, 41, 200, 51.08)), .Names = c("id", "element",
"year", "month", "day", "value"), row.names = c(NA, -10L), class = c("tbl_df",
"data.frame"))
df2 <-structure(list(id = c("USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632"), element = c("TMAX", "TMIN", "TMAX",
"TMIN", "TMAX", "TMIN", "TMAX", "TMIN", "TMAX", "TMIN"), year = 1900:1909,
month = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), day = c(1, 1, 1,
1, 1, 1, 1, 1, 1, 1), value = c(30.02, 10.94, 37.94, 10.94,
12, 28.04, 64.94, 41, 82.04, 51.08)), row.names = c(NA, -10L
), class = c("tbl_df", "data.frame"), .Names = c("id", "element",
"year", "month", "day", "value"))
答案 0 :(得分:3)
我建议data.table
的有效二进制连接结合修改(使用:=
),同时指定by = .EACHI
(为了分别计算每个组的平均值)。
library(data.table)
setDT(df1)[setDT(df2),
value := ifelse(is.na(value) | value >= 150, mean(i.value), value),
on = "year",
by = .EACHI]
df1
# id element year month day value
# 1: USC00031632 TMAX 1900 1 1 30.02
# 2: USC00031632 TMIN 1901 1 1 10.94
# 3: USC00031632 TMAX 1902 2 1 37.94
# 4: USC00031632 TMIN 1903 2 1 10.94
# 5: USC00031632 TMAX 1904 3 1 12.00
# 6: USC00031632 TMIN 1905 3 1 28.04
# 7: USC00031632 TMAX 1906 4 1 64.94
# 8: USC00031632 TMIN 1907 4 1 41.00
# 9: USC00031632 TMAX 1908 5 1 82.04
# 10: USC00031632 TMIN 1909 5 1 51.08
或者,我们可以分两步完成此操作,以便在每一步中避免ifelse
开销
setDT(df1)[setDT(df2), value2 := i.value, on = "year"]
df1[is.na(value) | value >= 150, value := mean(value2), by = year]
df1
# id element year month day value value2
# 1: USC00031632 TMAX 1900 1 1 30.02 30.02
# 2: USC00031632 TMIN 1901 1 1 10.94 10.94
# 3: USC00031632 TMAX 1902 2 1 37.94 37.94
# 4: USC00031632 TMIN 1903 2 1 10.94 10.94
# 5: USC00031632 TMAX 1904 3 1 12.00 12.00
# 6: USC00031632 TMIN 1905 3 1 28.04 28.04
# 7: USC00031632 TMAX 1906 4 1 64.94 64.94
# 8: USC00031632 TMIN 1907 4 1 41.00 41.00
# 9: USC00031632 TMAX 1908 5 1 82.04 82.04
# 10: USC00031632 TMIN 1909 5 1 51.08 51.08
如果您希望使用value2
df1[, value2 := NULL]