Question

请耐心等待，因为这是我的第一个问题。我还在试图弄清楚如何发布数据和我已经拥有的代码，所以现在我只想解释一下。如果这不是提出问题的可接受方式，请忽略这个问题，下次我会尝试做正确的事。

我有一个数据框，我想做每日计算。在特定的一天，我已经拥有OpenUnits，BuyUnits，SellUnits，CloseUnits和Interest。这些值是由另一个系统计算的。我需要将每日利息比例与销售的单位数量进行比较。我可以进行计算，但我无法弄清楚如何获得OpenInterest（前一天的关闭），而不在数据框上使用for循环。 ClosingInterest应该是OpenInterest + Interest - SellUnits / OpenUnits * OpenInterest

我尝试使用mutate（OpenInterest = lag（ClosingInterest），ClosingInterest = OpenInterest + Interest - SellUnits / OpenUnits * OpenInterest），但这似乎不起作用。

我的代码使用for循环，但我希望可能有更好，更快的方法。

此致

library(tidyverse)
library(tibbletime)
library(lubridate)

sample <- list(OpenUnits = c(7500000, 7500000, 7500000, 7500000, 7500000, 
                             3300000, 3300000, 3300000, 3300000, 3300000), ClosingUnits = c(7500000, 
                                                                                            7500000, 7500000, 7500000, 3300000, 3300000, 3300000, 3300000, 
                                                                                            3300000, 3300000), AccrualDate = 16892:16901, AiaAdjustAmt = c(1844.70359677349, 
                                                                                                                                                           1845.18465061665, 1845.66582990696, 1846.14713467713, 812.516568582349, 
                                                                                                                                                           812.728453146696, 812.940392965385, 813.152388052826, 813.364438423431, 
                                                                                                                                                           813.576544091616), SellUnits = c(NA, NA, NA, NA, 4200000, NA, 
                                                                                                                                                                                            NA, NA, NA, NA))
sample <- sample %>%
  as_tibble() %>% 
  mutate(
    AccrualDate = lubridate::as_date(AccrualDate),
    SellUnits = if_else(is.na(SellUnits), 0, SellUnits)
  ) %>% 
  as_tbl_time(index = AccrualDate)

sample <- sample %>% 
  mutate(
    RealInterest = 0,
    OpenInterest = cumsum(AiaAdjustAmt) - cumsum(RealInterest) - AiaAdjustAmt - RealInterest,
    RealInterest = OpenInterest*SellUnits/OpenUnits 
  )

这不会产生正确答案。

# A time tibble: 10 x 7
# Index: AccrualDate
   OpenUnits ClosingUnits AccrualDate AiaAdjustAmt SellUnits s24j_real s24j_open
       <dbl>        <dbl> <date>             <dbl>     <dbl>     <dbl>     <dbl>
 1  7500000.     7500000. 2016-04-01         1845.        0.        0.        0.
 2  7500000.     7500000. 2016-04-02         1845.        0.        0.     1845.
 3  7500000.     7500000. 2016-04-03         1846.        0.        0.     3690.
 4  7500000.     7500000. 2016-04-04         1846.        0.        0.     5536.
 5  7500000.     3300000. 2016-04-05          813.  4200000.     4134.     7382.
 6  3300000.     3300000. 2016-04-06          813.        0.        0.     8194.
 7  3300000.     3300000. 2016-04-07          813.        0.        0.     9007.
 8  3300000.     3300000. 2016-04-08          813.        0.        0.     9820.
 9  3300000.     3300000. 2016-04-09          813.        0.        0.    10633.
10  3300000.     3300000. 2016-04-10          814.        0.        0.    11446.

正确答案应如下所示。这是我用for循环实现的，我试图避免，因为它在嵌套的较大数据集上感觉很慢。

# A time tibble: 10 x 7
# Index: AccrualDate
   OpenUnits ClosingUnits AccrualDate AiaAdjustAmt SellUnits s24j_real s24j_open
       <dbl>        <dbl> <date>             <dbl>     <dbl>     <dbl>     <dbl>
 1  7500000.     7500000. 2016-04-01         1845.        0.        0.        0.
 2  7500000.     7500000. 2016-04-02         1845.        0.        0.     1845.
 3  7500000.     7500000. 2016-04-03         1846.        0.        0.     3690.
 4  7500000.     7500000. 2016-04-04         1846.        0.        0.     5536.
 5  7500000.     3300000. 2016-04-05          813.  4200000.     4134.     7382.
 6  3300000.     3300000. 2016-04-06          813.        0.        0.     4060.
 7  3300000.     3300000. 2016-04-07          813.        0.        0.     4873.
 8  3300000.     3300000. 2016-04-08          813.        0.        0.     5686.
 9  3300000.     3300000. 2016-04-09          813.        0.        0.     6499.
10  3300000.     3300000. 2016-04-10          814.        0.        0.     7313.

生成正确答案的代码。

sample2 <- sample %>% 
  mutate(
    sell_ratio = if_else(!is.na(SellUnits), SellUnits/OpenUnits, 0),
    s24j_open = 0,
    s24j_close = 0,
    s24j_real = 0     
  )

open <- 0
close <- 0  

for (i in seq_along(sample2$AccrualDate)) {

  open <- close
  sellratio <- sample2[i, ]$sell_ratio
  int <- sample2[i, ]$AiaAdjustAmt
  real <- sellratio*open

  close <- open - real + int

  sample2[i, ]$s24j_open <- open
  sample2[i, ]$s24j_real <- real
  sample2[i, ]$s24j_close <- close
}

sample2 %>% 
  select(
    OpenUnits, ClosingUnits, AccrualDate, AiaAdjustAmt, SellUnits, s24j_real, s24j_open
  )

Answer 1

迟到总比没有好：

## defining the data frame
sample <- data.frame(OpenUnits = c(7500000, 7500000, 7500000, 7500000, 7500000, 
                               3300000, 3300000, 3300000, 3300000, 3300000), 
                 ClosingUnits = c(7500000, 7500000, 7500000, 7500000, 3300000, 3300000, 3300000, 3300000, 3300000, 3300000), 
                 AccrualDate = 16892:16901, 
                 AiaAdjustAmt = c(1844.70359677349, 1845.18465061665, 1845.66582990696, 1846.14713467713, 812.516568582349, 
                                  812.728453146696, 812.940392965385, 813.152388052826, 813.364438423431, 813.576544091616), 
                 SellUnits = c(NA, NA, NA, NA, 4200000, NA, NA, NA, NA, NA))

## defining a function to deliver the final output
## warning: the function is recursive
myfct <- function(n){
  ratio <- (sample$SellUnits/sample$OpenUnits)[n]
  ratio <- ifelse(is.na(ratio), 0, ratio)
  if(n > 1){
    vec <- myfct(n-1)
    val <- vec[length(vec)]
    newval <- sample$AiaAdjustAmt[n]+val*(1-ratio)
    newvec <- c(vec, newval)
    return(newvec)
  }
  if(n == 1){return(sample$AiaAdjustAmt[n]+0*(1-ratio))}
}

## finally applying the function and binding the output
## depending on dim(sample)[1], here one might have to add something like:
## options(expressions=10000) (to avoid "Error: evaluation nested too deeply...")
close <- myfct(dim(sample)[1])
open <- c(0, close[1:(length(close)-1)])
real <- open*ifelse(is.na(sample$SellUnits/sample$OpenUnits),
                    0,
                    sample$SellUnits/sample$OpenUnits)
output <- cbind.data.frame(sample, real, open)

结果如下：

> output
   OpenUnits ClosingUnits AccrualDate AiaAdjustAmt SellUnits     real     open
1    7500000      7500000       16892    1844.7036        NA    0.000    0.000
2    7500000      7500000       16893    1845.1847        NA    0.000 1844.704
3    7500000      7500000       16894    1845.6658        NA    0.000 3689.888
4    7500000      7500000       16895    1846.1471        NA    0.000 5535.554
5    7500000      3300000       16896     812.5166   4200000 4133.753 7381.701
6    3300000      3300000       16897     812.7285        NA    0.000 4060.465
7    3300000      3300000       16898     812.9404        NA    0.000 4873.194
8    3300000      3300000       16899     813.1524        NA    0.000 5686.134
9    3300000      3300000       16900     813.3644        NA    0.000 6499.286
10   3300000      3300000       16901     813.5765        NA    0.000 7312.651

然而，递归函数不会提高性能，而相反（see this post）。虽然在上面的代码中myfct仅用于计算close（real，open是从中派生的）。 ~~无论如何，我相信代码可以被修改以便失去递归和/或sapply - 我会尝试并更新代码。~~

修改

我的代码的第一个版本的问题是使用递归函数和sapply使得该过程非常冗长。实际上，myfct(n)已经为所有myfct(k)计算了k<n，因此使用sapply计算这些值是多余且低效的。

在此，我保留旧的function和sapply以获得完整性（已在上面的代码中编辑过新的function）：

myfct.old <- function(n){ ratio <- (sample$SellUnits/sample$OpenUnits)[n] ratio <- ifelse(is.na(ratio), 0, ratio) if(n > 1){return(sample$AiaAdjustAmt[n]+myfct.old(n-1)*(1-ratio))} if(n == 1){return(sample$AiaAdjustAmt[n]+0*(1-ratio))} } ## finally applying the function and binding the output close <- sapply(1:dim(sample)[1], myfct)

最后，这是不同方法的性能比较：

## Increasing the size of the data frame sample <- do.call("rbind", replicate(100, sample, simplify = FALSE)) ## (1) New method start.time <- Sys.time() close <- myfct(dim(sample)[1]) open <- c(0, close[1:(length(close)-1)]) real <- open*ifelse(is.na(sample$SellUnits/sample$OpenUnits), 0, sample$SellUnits/sample$OpenUnits) output <- cbind.data.frame(sample, real, open) end.time <- Sys.time() time.taken1 <- end.time - start.time time.taken1 "Time difference of 0.19402 secs" ## (2) A loop start.time <- Sys.time() close <- 0 s24j_open <- c() s24j_real <- c() s24j_close <- c() for(k in 1:dim(sample)[1]){ open <- close ratio <- (sample$SellUnits/sample$OpenUnits)[k] ratio <- ifelse(is.na(ratio), 0, ratio) real <- open*ratio close <- sample$AiaAdjustAmt[k]+open-real s24j_open <- c(s24j_open, open) s24j_real <- c(s24j_real, real) s24j_close <- c(s24j_close, close) } output <- cbind.data.frame(sample, s24j_real, s24j_open) end.time <- Sys.time() time.taken2 <- end.time - start.time time.taken2 "Time difference of 0.3530352 secs" ## (3) Old method start.time <- Sys.time() close <- sapply(1:dim(sample)[1], myfct.old) open <- c(0, close[1:(length(close)-1)]) real <- open*ifelse(is.na(sample$SellUnits/sample$OpenUnits), 0, sample$SellUnits/sample$OpenUnits) output <- cbind.data.frame(sample, real, open) end.time <- Sys.time() time.taken3 <- end.time - start.time time.taken3 "Time difference of 48.86089 secs"

tibbletime - 前一天的收盘

1 个答案: