我有一个称为df的数据框,如下所示:
country year taxrevenue VAT CIT
Angola 2006 1000 750 250
Albania 1998 1000 750 250
Albania 2002 1500 1125 375
Albania 2005 1200 900 300
Argentina 1984 1900 1425 475
Argentina 1991 1400 1050 350
Argentina 1995 1600 1200 400
我想做的是为每个(税收)列创建一个附加列,该列包含一年中的税收与下一年中的税收之间的差额,条件是该国家/地区是同一国家,例如:>
country year dtaxrevenue dvat dcit
Angola 2006 NA NA NA
Albania 1998 NA NA NA
Albania 2002 500 375 125
Albania 2005 -300 -225 -75
Argentina 1984 NA 525 175
Argentina 1991 -500 -375 -125
Argentina 1995 200 150 50
我想到以下几点:
#for each column in df
for(i in 1:ncol(TRcomplete)){
#add a new column
TRcomplete <- cbind(TRcomplete[,i])
# When condition holds..
if(TRcomplete$year[,i]- TRcomplete$year[,i+1] < 0) & TRcomplete$country[,i] == TRcomplete$country[,i+1]{
# try to subtract two values and put them in a new column
try(TRcomplete[,i+1] <- TRcomplete[,i]- TRcomplete[,i+1]
} else {
TRcomplete[,i+1]<-NA
}
我对如何引用每一栏感到困惑。有人可以帮忙吗?
答案 0 :(得分:1)
使用dplyr
我们可以尝试
library(dplyr)
df %>% group_by(country) %>%
mutate(Tax = taxrevenue-lag(taxrevenue))
# A tibble: 7 x 6
# Groups: country [3]
country year taxrevenue VAT CIT Tax
<chr> <int> <int> <int> <int> <int>
1 Angola 2006 1000 750 250 NA
2 Albania 1998 1000 750 250 NA
3 Albania 2002 1500 1125 375 500
4 Albania 2005 1200 900 300 -300
5 Argentina 1984 1900 1425 475 NA
6 Argentina 1991 1400 1050 350 -500
7 Argentina 1995 1600 1200 400 200
df %>%
group_by(country) %>%
mutate_if(is.numeric, funs(d = . - lag(.))) %>% select(-year_d) %>%
rename_at(vars(ends_with("_d")), funs(paste("d", gsub("_d", "", .), sep = "") )) %>%
head(n=5)
# A tibble: 5 x 8
# Groups: country [3]
country year taxrevenue VAT CIT dtaxrevenue dVAT dCIT
<fct> <int> <int> <int> <int> <int> <int> <int>
1 Angola 2006 1000 750 250 NA NA NA
2 Albania 1998 1000 750 250 NA NA NA
3 Albania 2002 1500 1125 375 500 375 125
4 Albania 2005 1200 900 300 -300 -225 -75
5 Argentina 1984 1900 1425 475 NA NA NA
答案 1 :(得分:1)
一个带有data.table的版本。
# make data
library(data.table)
dt <- data.table(
country = c("Angola", "Albania", "Albania", "Albania", "Argentina", "Argentina", "Argentina"),
year = c(2006, 1998, 2002, 2005, 1984, 1991, 1995),
tax = c(1000, 1000, 1500, 1200, 1900, 1400, 1600),
vat = c(750, 750, 1125, 900, 1425, 1050, 1200),
cit = c(250, 250, 375, 300, 475, 350, 400)
)
# get lagged taxes
dt <- dt[order(country, year)]
dt[ , lag_tax := shift(tax), by=country]
dt[ , lag_vat := shift(vat), by=country]
dt[ , lag_cit := shift(cit), by=country]
# calculate tax differences
dt[ , dtax := tax - lag_tax]
dt[ , dvat := vat - lag_vat]
dt[ , dcit := cit - lag_cit]
如果您需要对许多列进行此操作,请使用lapply
和.SD
:
# get lagged cols
dt[ , paste0(names(dt)[3:5], "_lag") := lapply(.SD, shift), by=country, .SDcols=3:5]
# or go straight to differences
dt[ , paste0(names(dt)[3:5], "_diff") := lapply(.SD, function(x) x - shift(x)), by=country, .SDcols=3:5]
答案 2 :(得分:1)
mutate_at
是候选人
library(dplyr)
df %>%
arrange(country, year) %>% #sort data
group_by(country) %>%
mutate_at(vars(taxrevenue:CIT), funs(d = . - lag(.)))
给出
country year taxrevenue VAT CIT taxrevenue_d VAT_d CIT_d
1 Albania 1998 1000 750 250 NA NA NA
2 Albania 2002 1500 1125 375 500 375 125
3 Albania 2005 1200 900 300 -300 -225 - 75
4 Angola 2006 1000 750 250 NA NA NA
5 Argentina 1984 1900 1425 475 NA NA NA
6 Argentina 1991 1400 1050 350 -500 -375 -125
7 Argentina 1995 1600 1200 400 200 150 50
示例数据
df <- structure(list(country = c("Angola", "Albania", "Albania", "Albania",
"Argentina", "Argentina", "Argentina"), year = c(2006L, 1998L,
2002L, 2005L, 1984L, 1991L, 1995L), taxrevenue = c(1000L, 1000L,
1500L, 1200L, 1900L, 1400L, 1600L), VAT = c(750L, 750L, 1125L,
900L, 1425L, 1050L, 1200L), CIT = c(250L, 250L, 375L, 300L, 475L,
350L, 400L)), .Names = c("country", "year", "taxrevenue", "VAT",
"CIT"), class = "data.frame", row.names = c(NA, -7L))