Question

我有一个非常大的 data.frame（数千个变量），每行都有一个标识符（id）和一个年份。同一个标识符可能出现在多个年份，也可能只出现在数据中间的某些年份。数据中缺少一年（1997 年），我想按以下方式为所有数值变量插值：

复制上一年（1996 年）中其标识符也出现在下一年（1998 年）的所有行。
对于所有数值变量，取上一年与下一年中两个对应行（标识符相同）的值的平均值。
由于这是一个非常大的数据集，我希望尽量避免使用循环。

示例数据：

# Sample data: one row per (id, year) pair. Year 97 is absent for every id,
# and id 2 is only observed in year 96.
test_df <- data.frame(
  id    = c(1, 2, 3, 1, 3),
  year  = c(96, 96, 96, 98, 98),
  state = c("MA", "MD", "NY", "MA", "NY"),
  num1  = c(10, 11, 22, 9, 27),
  num2  = c(11566, 32340, 97555, 14200, 100025)
)
> test_df
  id year state num1   num2
1  1   96    MA   10  11566
2  2   96    MD   11  32340
3  3   96    NY   22  97555
4  1   98    MA    9  14200
5  3   98    NY   27 100025

最终数据应为：

  id year state num1   num2
1  1   96    MA   10  11566
2  2   96    MD   11  32340
3  3   96    NY   22  97555
4  1   97    MA  9.5  12883
5  3   97    NY 24.5  98790
6  1   98    MA    9  14200
7  3   98    NY   27 100025

到目前为止，我所做的是从上一年的行中筛选出其 ID 也出现在下一年的行，并选出数值变量。计算完成后，只需用 rbind 把它们合并回主数据即可。

# Rows of the previous year whose id is also present in the next year.
# The sample data codes years as 96 / 98 (not 1996 / 1998), so the filters
# must compare against those values or no row is ever selected.
common_ids <- test_df[test_df$year == 96, ]
common_ids <- common_ids[common_ids$id %in% test_df$id[test_df$year == 98], ]

# Logical mask of the numeric columns; vapply() keeps the result type stable.
numeric_vars <- vapply(common_ids, is.numeric, logical(1))

# `???` is the open part of the question: for each numeric column it should
# return the mean of the previous-year and next-year value of the same id.
common_ids[,numeric_vars] <- lapply(common_ids[,numeric_vars], function(x)???)

最后一个问题：如果连续缺失两年，该解决方案是否仍然有效？

谢谢！

Answer 1

使用 data.table 和 zoo，你可以从类似下面的代码入手：

library(data.table)
library(zoo)

# Sample data: one row per (id, year); year 97 is missing for every id.
test_df <- data.table(
  id = c(1, 2, 3, 1, 3),
  year = c(96, 96, 96, 98, 98),
  state = c("MA", "MD", "NY", "MA", "NY"),
  num1 = c(10, 11, 22, 9, 27),
  num2 = c(11566, 32340, 97555, 14200, 100025)
)

test_df <- test_df[order(id, year)]

# Only ids observed in BOTH neighbouring years get a row for 97. Checking the
# years explicitly avoids flagging ids that merely have several rows and never
# yields duplicated ids.
missing.ids <- intersect(test_df[year == 96, id], test_df[year == 98, id])

# Placeholder rows for the missing year. Typed NAs keep the column classes
# aligned with test_df, and rep() keeps this valid when no id qualifies.
n.missing <- length(missing.ids)
temp_df <- data.table(
  id = missing.ids,
  year = rep(97, n.missing),
  state = rep(NA_character_, n.missing),
  num1 = rep(NA_real_, n.missing),
  num2 = rep(NA_real_, n.missing)
)

# Sorting by id, then year, puts each placeholder between its two neighbours.
new.test_df <- rbind(test_df, temp_df)[order(id, year)]

# Carry the last known state forward within each id.
new.test_df[, state := na.locf(state, na.rm = FALSE), by = id]

# Linear interpolation within each id; with a single gap row this is the mean
# of the previous and next value.
new.test_df[
  ,
  `:=`(
    num1 = na.approx(num1, na.rm = FALSE),
    num2 = na.approx(num2, na.rm = FALSE)
  ),
  by = id
]

修改版：无需逐个指定变量名

library(data.table)
library(zoo)

# Sample data: one row per (id, year); year 97 is missing for every id.
test_df <- data.table(
  id = c(1, 2, 3, 1, 3),
  year = c(96, 96, 96, 98, 98),
  state = c("MA", "MD", "NY", "MA", "NY"),
  num1 = c(10, 11, 22, 9, 27),
  num2 = c(11566, 32340, 97555, 14200, 100025)
)

test_df <- test_df[order(id, year)]

# Every column except the keys and the character column is interpolated, so
# the numeric variables never have to be listed by name.
mynum.cols <- setdiff(names(test_df), c("id", "year", "state"))

# Only ids observed in BOTH neighbouring years get a row for 97.
missing.ids <- intersect(test_df[year == 96, id], test_df[year == 98, id])

# Placeholder rows carry only the keys; rbind(fill = TRUE) below pads every
# other column with NA, which scales to thousands of columns.
temp_df <- data.table(id = missing.ids, year = rep(97, length(missing.ids)))

# Sorting by id, then year, puts each placeholder between its two neighbours.
new.test_df <- rbind(test_df, temp_df, fill = TRUE)[order(id, year)]

# Carry the last known state forward within each id.
new.test_df[, state := na.locf(state, na.rm = FALSE), by = id]

# Linear interpolation of every numeric column within each id.
new.test_df[, (mynum.cols) := lapply(.SD, na.approx, na.rm = FALSE),
            by = id, .SDcols = mynum.cols]

# Present the result in the order requested in the question: year, then id.
new.test_df <- new.test_df[order(year, id)]
new.test_df

根据相同id的上一行和下一行计算空行的所有数值变量的值

1 个答案: