高效算法,无需循环即可计算data.frame中的值

时间:2015-12-03 21:13:15

标签: r dataframe

在这个问题上,我用 R 时卡住了。我有一张每天一行的数据表,如下所示:

# Sample "wide" table: one row per observation date. Slot 1 holds the known
# contract (month, two-digit year, price); slots 2-4 are placeholders that are
# meant to be filled in from the difference tables.
Date <- as.Date(c("2015-12-31", "2016-01-01"))

# Slot 1: the contract whose price is already known on each date.
Month1 <- c("DEC", "JAN")
Year1 <- c("15", "16")
Price1 <- c(100, 110)

# Slots 2-4: typed NA placeholders (character for month/year, integer for
# price), one per row.
Month2 <- rep(NA_character_, 2)
Year2 <- rep(NA_character_, 2)
Price2 <- rep(NA_integer_, 2)
Month3 <- rep(NA_character_, 2)
Year3 <- rep(NA_character_, 2)
Price3 <- rep(NA_integer_, 2)
Month4 <- rep(NA_character_, 2)
Year4 <- rep(NA_character_, 2)
Price4 <- rep(NA_integer_, 2)

# Column names are taken from the variable names passed in.
dataSample <- data.frame(
  Date,
  Month1, Year1, Price1,
  Month2, Year2, Price2,
  Month3, Year3, Price3,
  Month4, Year4, Price4
)

给出了这样一个表格:

        Date Month1 Year1 Price1 Month2 Year2 Price2 Month3 Year3 Price3 Month4 Year4 Price4
1 2015-12-31    DEC    15    100   <NA>  <NA>     NA   <NA>  <NA>     NA   <NA>  <NA>     NA

2 2016-01-01    JAN    16    110   <NA>  <NA>     NA   <NA>  <NA>     NA   <NA>  <NA>     NA

现在我需要为每一行(每个日期)算出后续各个月份及其价格。为此,我还有另外2个数据框:

# One-month price differences: for each observation date, Diff is the price
# change from the start contract (Month.Start/Year.Start) to the end contract
# (Month.End/Year.End). Three consecutive one-month steps per date.
Date <- as.Date(rep(c("2015-12-31", "2016-01-01"), each = 3))
Month.Start <- c("DEC", "JAN", "FEB", "JAN", "FEB", "MAR")
Year.Start <- c("15", "16", "16", "16", "16", "16")
Month.End <- c("JAN", "FEB", "MAR", "FEB", "MAR", "APR")
Year.End <- rep("16", 6)
Diff <- c(10, 15, -15, 19, -20, -5)

diffsOneMonth <- data.frame(
  Date,
  Month.Start, Year.Start,
  Month.End, Year.End,
  Diff
)

# Three-month price differences: same layout as the one-month table, but each
# row spans three months between the start and end contract.
Date <- as.Date(c("2015-12-31", "2016-01-01"))
Month.Start <- c("DEC", "MAR")
Year.Start <- c("15", "16")
Month.End <- c("MAR", "JUN")
Year.End <- c("16", "16")
Diff <- c(11, 25)

diffsThreeMonth <- data.frame(
  Date,
  Month.Start, Year.Start,
  Month.End, Year.End,
  Diff
)

这给了我这些表格:

One month price differences
      Date          Month.Start Year.Start Month.End Year.End Diff
    1 2015-12-31         DEC         15       JAN       16   10
    2 2015-12-31         JAN         16       FEB       16   15
    3 2015-12-31         FEB         16       MAR       16  -15
    4 2016-01-01         JAN         16       FEB       16   19
    5 2016-01-01         FEB         16       MAR       16  -20
    6 2016-01-01         MAR         16       APR       16   -5

Three month price differences
            Date Month.Start Year.Start Month.End Year.End Diff
    1 2015-12-31         DEC         15       MAR       16   11
    2 2016-01-01         MAR         16       JUN       16   25

现在,我必须使用差异表中的数据填充 dataSample 数据框。我检查那里有哪些开始/结束月/年,并且必须在 dataSample 中填写这些月/年。然后在 dataSample 中获取价格差异并设置计算价格。所以例如在 dataSample 中我们从DEC 15开始,然后在 diffsOneMonth 中我们有条目DEC 15 - JAN 16,差异为10所以我们将它添加到DEC 15价格并获得JAN 16价格110:

    Date      Month1 Year1 Price1 Month2 Year2 Price2 Month3 Year3 Price3 Month4 Year4 Price4
1 2015-12-31    DEC    15    100   JAN   16     110   <NA>  <NA>     NA   <NA>  <NA>     NA
2 2016-01-01    JAN    16    110   <NA>  <NA>     NA   <NA>  <NA>     NA   <NA>  <NA>     NA

接下来可以对下一个月重复同样的步骤,依此类推。如果我们只使用 diffsOneMonth ,我们会得到如下结果:

 Date             Month1 Year1 Price1 Month2 Year2 Price2 Month3 Year3 Price3 Month4 Year4 Price4
    1 2015-12-31    DEC    15    100   JAN   16     110     FEB  16     125      MAR  16     110
    2 2016-01-01    JAN    16    110   FEB   16     129     MAR  16     109      APR  16     104

然而,只要有可能,我还必须使用跨度更长的月份差价来计算价格。例如,对于2015-12-31,存在一个从DEC 15到MAR 16的三个月差价,它应当优先于由一个月差价推算出的价格。DEC 15的价格是100,而DEC 15 - MAR 16的差价是11,因此MAR 16的价格不是110而是111:

      Date             Month1 Year1 Price1 Month2 Year2 Price2 Month3 Year3 Price3 Month4 Year4 Price4
        1 2015-12-31    DEC    15    100   JAN   16     110     FEB  16     125      MAR  16     111
        2 2016-01-01    JAN    16    110   FEB   16     129     MAR  16     109      APR  16     104

因此,对于这个样本,这就是我最终想要的输出。  实际数据要复杂得多:每个日期还有6个月和12个月的差价,并且需要向前推算64个月,中间也可能缺少某些月份。我尝试过用循环来实现,但速度很慢,而我又不知道如何在不使用循环的情况下解决这类问题。我已经创建了一些辅助函数来计算下一个合约的年份/月份:

# Return the contract (month, year) that lies `length` months after the given
# contract.
#
# Arguments:
#   currentMonth  month label(s), looked up in `months`
#   currentYear   year label(s), looked up in `years`
#   length        number of months to move forward (default 1)
#   years         ordered vector of valid year labels
#   months        ordered vector of valid month labels (one calendar year)
#
# Returns a two-column data.frame: the target month label and the target year
# label. The column names are the ones data.frame() historically derived from
# the call expressions; they are set explicitly so existing code that refers
# to them keeps working.
nextContract <- function(currentMonth, currentYear, length = 1,
                         years = c("10", "11", "12", "13", "14", "15", "16", "17", "18"),
                         months = c("JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC")) {
  # The `length` argument shadows base::length as a value, so call the base
  # function explicitly.
  nMonths <- base::length(months)
  mIdx <- match(currentMonth, months) + length

  # Whole number of year boundaries crossed. Integer division on the zero-based
  # month index yields an integral offset (the previous `/` produced fractions
  # such as 13 / 12 that only worked because indexing truncates).
  yDiff <- (mIdx - 1) %/% nMonths

  contract <- data.frame(
    nextMonth(currentMonth, length, months),
    # Forward `years` so a caller-supplied year vector is actually used.
    nextYear(currentYear, length = yDiff, years = years)
  )
  names(contract) <- c(
    "nextMonth.currentMonth..length..months.",
    "nextYear.currentYear..length...yDiff."
  )
  contract
}

# Return the year label that sits `length` positions after `currentYear` in
# `years`. Labels not found in `years`, or positions past its end, give NA.
nextYear <- function(currentYear, length = 1, years = c("10", "11", "12", "13", "14", "15", "16", "17", "18")) {
  position <- match(currentYear, years)
  years[position + length]
}

# Return the month label that lies `length` months after `currentMonth`,
# wrapping around the end (or start) of `months`.
#
# Arguments:
#   currentMonth  month label(s), looked up in `months`
#   length        number of months to move; zero and negative values wrap
#                 backwards (e.g. "JAN" with -1 gives "DEC")
#   months        ordered vector of valid month labels
#
# Returns a character vector the same length as `currentMonth`; labels that
# are NA or not present in `months` give NA.
nextMonth <- function(currentMonth, length = 1, months = c("JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC")) {
  # The `length` argument shadows base::length as a value, so call the base
  # function explicitly.
  nMonths <- base::length(months)
  mIdx <- match(currentMonth, months) + length

  # Wrap on the zero-based index, then shift back to one-based. Unlike the
  # nested ifelse() this keeps the index numeric for NA input (a logical NA
  # index would be recycled and return one NA per month) and maps indices
  # <= 0 to the previous cycle instead of dropping elements.
  months[(mIdx - 1) %% nMonths + 1]
}

使用示例可以是:

> nextContract("DEC", "15")
  nextMonth.currentMonth..length..months. nextYear.currentYear..length...yDiff.
1                                     JAN                                    16

或:

> nextContract("DEC", "15", length = 3)
  nextMonth.currentMonth..length..months. nextYear.currentYear..length...yDiff.
1                                     MAR                                    16

这是一个很长的问题,但我希望有人会花时间来审查它:)

提前致谢!

修改:在建议的解决方案基础上稍作改进后,我得到了我需要的结果:

    # Build one long table with a computed Price for every (Date, end contract)
    # pair. When several difference horizons reach the same end contract, the
    # row from the longest horizon (largest time_step) wins.
    outrightAndForwardRows <- list("1" = diffsOneMonth, "3" = diffsThreeMonth) %>%
      # Stack the difference tables; the list names become the time_step column.
      bind_rows(.id = "time_step") %>%
      # Attach the known front contract of each date. The key is spelled out so
      # the join does not silently change if the tables gain other shared columns.
      left_join(
        dataSample %>% select(Date, Price1, Month1, Year1),
        by = "Date"
      ) %>%
      # Dummy day-of-month so day/month/year can be pasted into a parseable date.
      mutate(Day.Start = 1, Day.End = 1, Outright.Day = 1) %>%
      # unite() joins with "_" by default, giving strings such as "1_DEC_15".
      unite("Contract.Start", Day.Start, Month.Start, Year.Start) %>%
      unite("Contract.End", Day.End, Month.End, Year.End) %>%
      unite("Contract.Outright", Outright.Day, Month1, Year1) %>%
      mutate(
        time_step = as.numeric(time_step),
        Contract.Start = parse_date_time(Contract.Start, "%d_%b_%y"),
        Contract.End = parse_date_time(Contract.End, "%d_%b_%y"),
        Contract.Outright = parse_date_time(Contract.Outright, "%d_%b_%y")
      ) %>%
      # Within each horizon and date, walk the contracts in chronological order
      # and accumulate the differences on top of the front-contract price.
      # NOTE(review): this assumes every (time_step, Date) chain starts at the
      # Month1/Year1 contract and has no gaps; a chain starting later (e.g. the
      # 2016-01-01 MAR->JUN row) is still offset from Price1 - verify.
      group_by(time_step, Date) %>%
      arrange(Contract.End) %>%
      mutate(Price = cumsum(Diff) + Price1) %>%
      # One row per (Date, end contract): keep the longest horizon.
      group_by(Date, Contract.End) %>%
      slice(which.max(time_step)) %>%
      ungroup() %>%
      select(-time_step, -Diff, -Contract.Start)

#### Put the outright (front) contract and the forward contracts into the same
#### Price / Contract columns, then spread them out by position.

# One outright row per date: the front contract and its known price.
outright <- unique(
  select(outrightAndForwardRows, Date, Price = Price1, Contract = Contract.Outright)
)

# Forward rows: the end contract of each step with its computed price.
forwardMonths <- select(outrightAndForwardRows, Date, Contract = Contract.End, Price)

# Stack both sets and order them chronologically within each date. The two
# tables list their columns in a different order; rbind's data-frame method
# aligns them by name.
joined <- arrange(rbind(outright, forwardMonths), Date, Contract)

# Number the contracts 1..N within each date.
joined <- data.table(joined)
joined <- joined[, Contract.seq := seq(.N), by = Date]

# Wide result: one row per date, one Price_i / Contract_i pair per position.
dcast(joined, Date ~ Contract.seq, value.var = c("Price", "Contract"))

1 个答案:

答案 0 :(得分:1)

这样的事情:

library(dplyr)
library(tidyr)
library(lubridate)

# Compute a price for every (Date, end contract) pair from the stacked
# difference tables, preferring the longest horizon when several horizons
# reach the same end contract.
list(`1` = diffsOneMonth,
     `3` = diffsThreeMonth) %>%
  # The list names become the time_step column.
  bind_rows(.id = "time_step") %>%
  # Explicit key instead of an implicit natural join.
  left_join(
    dataSample %>% select(Date, Price1, Month1, Year1),
    by = "Date"
  ) %>%
  # Dummy day-of-month so day/month/year can be pasted into a parseable date.
  mutate(Day.Start = 1, Day.End = 1) %>%
  unite("Date.Start", Day.Start, Month.Start, Year.Start) %>%
  unite("Date.End", Day.End, Month.End, Year.End) %>%
  mutate(
    time_step = as.numeric(time_step),
    Date.Start = parse_date_time(Date.Start, "%d_%b_%y"),
    Date.End = parse_date_time(Date.End, "%d_%b_%y")
  ) %>%
  # Accumulate the differences chronologically within each horizon and date.
  # NOTE(review): assumes each (time_step, Date) chain starts at the
  # Month1/Year1 contract and has no gaps - verify for real data.
  group_by(time_step, Date) %>%
  arrange(Date.End) %>%
  mutate(Price = cumsum(Diff) + Price1) %>%
  # The computed Price belongs to the END contract, so de-duplicate on
  # Date.End. Keying on Date.Start would drop the DEC->JAN one-month row in
  # favour of the DEC->MAR three-month row and leave two MAR prices.
  group_by(Date, Date.End) %>%
  slice(which.max(time_step)) %>%
  ungroup()