按年份匹配并对列做除法的公式

时间:2018-04-28 11:08:29

标签: r

我有两个数据框,我正在尝试从中创建新数据。

df 1的colnames:

"year_t"   "data_t"   "data_t_1" "data_t_2" "data_t_3"

df 2的colnames:

 "year"       "multiplyer"

我想用df2中的数据去除df1中的数据:先按年份进行匹配,将data_t除以df2中对应年份的multiplyer,然后将data_t_1除以前一年(year - 1)的multiplyer,将data_t_2除以前两年(year - 2)的multiplyer,依此类推。

output_t = data_t / multiplyer(year)
output_t_1 = data_t_1 / multiplyer(year - 1)
output_t_2 = data_t_2 / multiplyer(year - 2)
output_t_3 = data_t_3 / multiplyer(year - 3)

即(使用df1的第一行);

  year_t data_t data_t_1 data_t_2 data_t_3
1   2012 146123   162991   308060   406563

在df2中使用2012年(及其之前几年)的数据:
2009    98.2319416221847
2010    100.000000000000000000000000000000
2011    103.196146412241
2012    105.720324344817

我想要得到以下内容;

# Illustration for the first row of df1 (year_t = 2012): each data column is
# divided by the df2 multiplyer of the correspondingly lagged year.
df1$output_t <- 146123 / 105.720324344817    # data_t   / multiplyer for 2012
df1$output_t_1 <- 162991 / 103.196146412241  # data_t_1 / multiplyer for 2011
df1$output_t_2 <- 308060 / 100.000000000000000000000000000000  # 2010
df1$output_t_3 <- 406563 / 98.2319416221847  # data_t_3 / multiplyer for 2009

也就是说:

output_t = data_t / multiplyer(2012)
output_t_1 = data_t_1 / multiplyer(2011)
output_t_2 = data_t_2 / multiplyer(2010)
output_t_3 = data_t_3 / multiplyer(2009)

数据框1:

structure(list(year_t = structure(c(18L, 16L, 3L, 7L, 21L, 15L, 
2L, 21L, 2L, 17L, 17L, 3L, 14L, 13L, 15L, 18L, 19L, 14L, 13L, 
14L, 16L, 21L, 12L, 11L, 19L, 17L, 2L, 5L, 15L, 19L, 19L, 9L, 
17L, 19L, 8L, 14L, 4L, 18L, 16L, 17L, 4L, 19L, 15L, 17L, 8L, 
17L, 18L, 19L, 6L, 15L), .Label = c("1995", "1996", "1997", "1998", 
"1999", "2000", "2001", "2002", "2003", "2004", "2005", "2006", 
"2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014", 
"2015", "2016", "2017"), class = "factor"), data_t = c(146123L, 
824675L, 78601L, 338308L, 12321527L, 301767L, 261683L, 20810L, 
438264L, 420102L, 54325L, 1557915L, 6229156L, 3312145L, 6577744L, 
1633416L, 475845L, 851586L, 658845L, 6216087L, 2069090L, 28046L, 
622088L, 244276L, 1796582L, 7607498L, 2680537L, 195115L, 1325434L, 
870810L, 399998L, 1755193L, 39280000L, 2612835L, 2643000L, 5660759L, 
201188L, 1855899L, 2393468L, 278147L, 1291212L, 4082284L, 7051999L, 
15342597L, 35949L, 1606024L, 2448224L, 91427L, 1054759L, 121252L
), data_t_1 = c(162991L, 1278341L, 433815L, 315210L, 13280222L, 
144622L, 280714L, 184286L, 349399L, 441119L, 51123L, 1112556L, 
3240105L, 2467071L, 9515093L, 1686249L, 461023L, 927146L, 507399L, 
3561613L, 1929679L, 69828L, 622204L, 242895L, 1968208L, 8453347L, 
2467278L, 92742L, 1194816L, 1197646L, 547391L, 1845368L, 38550000L, 
4555685L, 5158000L, 6324394L, 241155L, 2420718L, 3180737L, 5768459L, 
1548164L, 4318517L, 9019486L, 14386327L, 35934L, 2044495L, 2598361L, 
135402L, NA, 336379L), data_t_2 = c(308060L, 1746234L, 2473258L, 
249339L, 14327822L, NA, 259635L, 455523L, 370401L, 455568L, 40985L, 
1321363L, 1449123L, 1928196L, 9661314L, 2367151L, 375473L, 1228645L, 
420788L, 99090L, 1976669L, 150717L, NA, 271140L, 3995829L, 8166218L, 
2175989L, 87277L, 1097358L, NA, 788137L, 1695421L, 39801000L, 
4372307L, 142504L, 3439554L, 114912L, 3388745L, 2834629L, 7034688L, 
1462947L, 4537559L, 8000863L, 12737184L, 150782L, 2333824L, 2710126L, 
254109L, NA, 1140718L), data_t_3 = c(406563L, 1769192L, NA, 212706L, 
14351345L, NA, 238441L, 888216L, 255452L, 488883L, 97195L, 1106291L, 
35366L, 1388799L, 7684599L, 2425390L, 4152L, 953383L, 542362L, 
NA, 2499211L, 167215L, NA, 145238L, 8647716L, 7866078L, 1874842L, 
NA, 2158139L, NA, 736088L, 1425002L, 44633000L, 3831578L, 159060L, 
2781418L, 85171L, 3159740L, 2463385L, 8038953L, 1346105L, 4578169L, 
6277353L, 12501786L, 61561L, 2966259L, 2733420L, 298200L, NA, 
697574L)), .Names = c("year_t", "data_t", "data_t_1", "data_t_2", 
"data_t_3"), row.names = c(NA, 50L), class = "data.frame")

数据框2:

structure(list(year = c("1988", "1989", "1990", "1991", "1992", 
"1993", "1994", "1995", "1996", "1997", "1998", "1999", "2000", 
"2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", 
"2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", 
"2017"), multiplyer = c("45.5158032501528", "48.6069799436928", 
"51.8742543668362", "54.9525837849594", "58.208266424359", "60.8678441879755", 
"63.7398405992427", "66.7189155811034", "69.093337017433", "70.4552205020515", 
"71.7476020867733", "73.4052195938148", "75.9256001089651", "78.6512027854055", 
"81.0624649001436", "83.5261426387297", "86.064670544112", "88.9637193712232", 
"92.0911276421223", "94.6577354325631", "98.5156636153782", "98.2319416221847", 
"100.000000000000000000000000000000", "103.196146412241", "105.720324344817", 
"107.209444053708", "107.047696829365", "106.511964511277", "106.296094858635", 
"108.375335054149")), .Names = c("year", "multiplyer"), row.names = c(NA, 
-30L), class = c("tbl_df", "tbl", "data.frame"))

1 个答案:

答案 0 :(得分:2)

如果第一个表名为df1,第二个表名为df2,则应该为您提供所需内容:

library(tidyverse)


# Divide each data_t* column of df1 by the df2 multiplyer of the matching
# year: data_t uses year_t, data_t_1 uses year_t - 1, and so on.
# Expects df1 (year_t, data_t, data_t_1, ...) and df2 (year, multiplyer) to
# exist already. The result keeps the original df1 columns first, followed by
# the row key id and one weighted_t_<lag> column per data column.
# pivot_longer()/pivot_wider() replace the superseded gather()/spread() and
# need tidyr >= 1.0.0.
df1 %>%
    mutate(
        # year_t is a factor; go through character to get the real year
        # rather than the factor's internal level code.
        year_t = as.integer(as.character(year_t)),
        # Row key used to reassemble one row per original observation.
        id = seq_len(n())
    ) %>%
    pivot_longer(-c(year_t, id), names_to = 'time', values_to = 'measure') %>%
    mutate(
        # Turn the column name into its lag: data_t -> 0, data_t_1 -> 1, ...
        time = as.integer(gsub('^[^0-9]+', '', gsub('t$', 't0', time))),
        # df2$year is stored as character, so the join key must be too.
        year = as.character(year_t - time)
    ) %>%
    left_join(df2, by = 'year') %>%
    mutate(
        # multiplyer is stored as character in df2; convert before dividing.
        multiplyer = as.numeric(multiplyer),
        output = measure / multiplyer,
        time = paste0('weighted_t_', time)
    ) %>%
    select(id, time, output) %>%
    pivot_wider(names_from = time, values_from = output) %>%
    # Re-attach the untouched original columns via the same row key.
    left_join(df1 %>% mutate(id = seq_len(n())), by = 'id') %>%
    select(year_t:data_t_3, everything())

我不得不做很多类型转换,因为这些列的类型在两个表之间不匹配,或者没有以最佳方式存储(例如,年份存成了因子而不是整数,乘数存成了字符而不是数值)。