我有一个非常大的 data.frame(数千个变量),每行有一个标识符(id)和一个年份(year)。同一个 id 可能在多个年份中出现,也可能从数据中途才开始出现。数据中缺少一年(1997 年),我想对所有数值变量进行插值,补出这一年的值:
示例数据:
# Example panel: one row per (id, year) pair. Year 97 is absent for every id,
# and id 2 is observed only once (in year 96).
test_df <- data.frame(
  id    = c(1, 2, 3, 1, 3),
  year  = c(96, 96, 96, 98, 98),
  state = c("MA", "MD", "NY", "MA", "NY"),
  num1  = c(10, 11, 22, 9, 27),
  num2  = c(11566, 32340, 97555, 14200, 100025)
)
> test_df
id year state num1 num2
1 1 96 MA 10 11566
2 2 96 MD 11 32340
3 3 96 NY 22 97555
4 1 98 MA 9 14200
5 3 98 NY 27 100025
最终数据应为:
id year state num1 num2
1 1 96 MA 10 11566
2 2 96 MD 11 32340
3 3 96 NY 22 97555
4 1 97 MA 9.5 12883
5 3 97 NY 24.5 98790
6 1 98 MA 9 14200
7 3 98 NY 27 100025
到目前为止,我所做的是取出前一年中 id 同时也出现在后一年的那些行,并选出其中的数值变量。计算出插值之后,我只需用 rbind 把这些行追加到主数据中。
# Asker's unfinished attempt: take the rows of the earlier year whose id also
# appears in the later year, then overwrite their numeric columns with the
# interpolated values.
# NOTE(review): the sample test_df stores years as 96 / 98, so the literals
# 1996 / 1998 below match no rows of that sample -- confirm which encoding the
# real data uses.
common_ids <- test_df[test_df$year==1996,]
# Keep only the earlier-year rows whose id is also present in the later year.
common_ids <- common_ids[test_df[test_df$year==1996,]$id %in% test_df[test_df$year==1998,]$id,]
# Logical flag per column; in the sample data id and year are numeric too, so
# they are flagged along with num1 / num2.
numeric_vars <- sapply(common_ids,is.numeric)
# `???` is the asker's placeholder for the interpolation step being asked
# about; this line is not valid R as written.
common_ids[,numeric_vars] <- lapply(common_ids[,numeric_vars], function(x)???)
最后一个问题——如果连续缺失两年,这个解决方案是否仍然有效?
谢谢!
答案 0 :(得分:1)
使用 data.table 和 zoo,你可以从下面这样的代码入手:
library(data.table)
library(zoo)

# Example panel: one row per (id, year) pair. Year 97 is absent for every id,
# and id 2 is observed only once (in year 96).
test_df <- data.table(
  id = c(1, 2, 3, 1, 3),
  year = c(96, 96, 96, 98, 98),
  state = c("MA", "MD", "NY", "MA", "NY"),
  num1 = c(10, 11, 22, 9, 27),
  num2 = c(11566, 32340, 97555, 14200, 100025)
)

# Build every (id, year) pair between each id's first and last observed year.
# This replaces a hard-coded "insert year 97" step: gaps of any length are
# covered, ids observed once keep their single row, and no duplicate rows are
# created for ids with three or more observations.
# `by = 1` keeps `year` a double, matching the type of test_df$year.
full_grid <- test_df[, .(year = seq(min(year), max(year), by = 1)), by = id]

# Left-join the observations onto the grid; years without data become NA rows.
# merge() returns the result sorted by (id, year).
new.test_df <- merge(full_grid, test_df, by = c("id", "year"), all.x = TRUE)

# Carry the last observed state forward into the inserted rows.
new.test_df[, state := na.locf(state, na.rm = FALSE), by = id]

# Linear interpolation within each id, positioned by the actual year value.
# na.rm = FALSE keeps the result the same length as the group.
new.test_df[, `:=`(
  num1 = na.approx(num1, x = year, na.rm = FALSE),
  num2 = na.approx(num2, x = year, na.rm = FALSE)
), by = id]
修改:不需要逐个指定变量名的版本
library(data.table)
library(zoo)

# Example panel: one row per (id, year) pair. Year 97 is absent for every id,
# and id 2 is observed only once (in year 96).
test_df <- data.table(
  id = c(1, 2, 3, 1, 3),
  year = c(96, 96, 96, 98, 98),
  state = c("MA", "MD", "NY", "MA", "NY"),
  num1 = c(10, 11, 22, 9, 27),
  num2 = c(11566, 32340, 97555, 14200, 100025)
)

# Classify the value columns by type rather than by a hard-coded name list, so
# the script scales to thousands of variables: numeric columns are
# interpolated, every other column is carried forward.
key_cols <- c("id", "year")
value_cols <- setdiff(names(test_df), key_cols)
is_num <- vapply(
  value_cols,
  function(nm) is.numeric(test_df[[nm]]),
  logical(1)
)
mynum.cols <- value_cols[is_num]
other_cols <- value_cols[!is_num]

# Build every (id, year) pair between each id's first and last observed year.
# This replaces a hard-coded "insert year 97" step: gaps of any length are
# covered, ids observed once keep their single row, and no duplicate rows are
# created for ids with three or more observations.
# `by = 1` keeps `year` a double, matching the type of test_df$year.
full_grid <- test_df[, .(year = seq(min(year), max(year), by = 1)), by = id]

# Left-join the observations onto the grid; years without data become NA rows.
# merge() returns the result sorted by (id, year), which the per-id fills
# below rely on.
new.test_df <- merge(full_grid, test_df, by = key_cols, all.x = TRUE)

# Non-numeric columns (here: state): last observation carried forward per id.
if (length(other_cols) > 0) {
  new.test_df[
    ,
    (other_cols) := lapply(.SD, na.locf, na.rm = FALSE),
    by = id,
    .SDcols = other_cols
  ]
}

# Numeric columns: linear interpolation per id, positioned by the actual year.
# na.rm = FALSE keeps each result the same length as its group.
if (length(mynum.cols) > 0) {
  new.test_df[
    ,
    (mynum.cols) := lapply(.SD, function(v) {
      na.approx(v, x = year, na.rm = FALSE)
    }),
    by = id,
    .SDcols = mynum.cols
  ]
}

# Present the result year by year, as in the desired output.
new.test_df <- new.test_df[order(year, id)]
new.test_df