Question

我有一个时间序列数据集，比如一个非常简化的版本，时间和价格列。

Time    Price
15:30:01    NA
15:30:02    NA
15:30:03    36
15:30:04    38
15:30:05    37.5
15:30:06    NA
15:30:07    NA
15:30:08    37
15:30:09    37.8
15:30:10    39
15:30:11    40
15:30:12    38.5
15:30:13    38
15:30:14    38

我希望编写一个能够返回最佳价格的函数：

Time    Price   Best Price
15:30:01    NA  36
15:30:02    NA  36
15:30:03    36  36
15:30:04    38  38
15:30:05    37.5    38
15:30:06    NA  38
15:30:07    NA  38
15:30:08    37  38
15:30:09    37.8    38
15:30:10    39  39
15:30:11    40  40
15:30:12    38.5    40
15:30:13    38  40
15:30:14    38  40

我试过

# Attempted "best price" helper from the question.
# price1: numeric vector of prices (may contain NA).
# price2: starting best price (the caller passes the first non-NA price).
# Returns the result of a single vectorised ifelse(), i.e. one value per
# element of price1.
bbo <- function(price1, price2) {
  currbestprice <- price2
  # ifelse() is evaluated elementwise against the unchanged currbestprice;
  # nothing is carried over from one element to the next, and an NA in price1
  # makes the comparison (and therefore the result) NA at that position.
  newbestprice <- ifelse(price1 >= currbestprice, price1, currbestprice)
  # This assignment happens once, after the whole vector has been processed.
  currbestprice <- newbestprice
  return(currbestprice)
}

我会用 na.omit(Price)[1] 来初始化 price2，即取第一个非 NA 值。然后我希望 currbestprice 不断更新，始终保存到目前为止的最优价格。price1 就是价格序列本身。

但是当我测试时：

# Sample price series from the table above, including the NA gaps
p1 <- c(NA,NA,36,38,37.5,NA,NA,37,37.8,39,40,38.5,38,38)
# Starting best price: the first non-NA value of p1
p2 <- 36

bbo(p1,p2)返回

NA   NA 36.0 38.0 37.5   NA   NA 37.0 37.8 39.0 40.0 38.5 38.0 38.0

它似乎没有更新我的 currbestprice。我卡住了，非常感谢任何帮助。

Answer 1

另一个使用 cummax 函数的 base R 方案：

# create a new column 'BestPrice'
df$BestPrice <- df$Price

# fill every NA with the first non-NA price: leading NA's then show the first
# observed price, and later NA's can never exceed the running maximum
# (unlike a 0 placeholder, this also works when prices are zero or negative,
# and an all-NA column stays NA instead of turning into zeros)
df$BestPrice[is.na(df$BestPrice)] <- df$BestPrice[!is.na(df$BestPrice)][1]

# use 'cummax' to replace the values with the best price until that point
df$BestPrice <- cummax(df$BestPrice)

给出：

> df
       Time Price BestPrice
1  15:30:01    NA        36
2  15:30:02    NA        36
3  15:30:03  36.0        36
4  15:30:04  38.0        38
5  15:30:05  37.5        38
6  15:30:06    NA        38
7  15:30:07    NA        38
8  15:30:08  37.0        38
9  15:30:09  37.8        38
10 15:30:10  39.0        39
11 15:30:11  40.0        40
12 15:30:12  38.5        40
13 15:30:13  38.0        40
14 15:30:14  38.0        40

另一种选择是将 zoo 包中的 na.locf（带 fromLast = TRUE 参数）与 cummax 结合使用：

library(zoo)
# carry the last observation forward first, so a gap never borrows a future
# price; na.rm = FALSE keeps the vector length (leading NA's are still NA)
df$BestPrice <- na.locf(df$Price, na.rm = FALSE)
# only the leading NA's remain: back-fill them with the first observed price
df$BestPrice <- na.locf(df$BestPrice, fromLast = TRUE, na.rm = FALSE)
# running maximum of the gap-free series
df$BestPrice <- cummax(df$BestPrice)

Answer 2

你可以试试这个：

# simulate some data
data <- data.frame(
  time = 1:100,
  price = rpois(100, lambda = 10)
)

# add some random NA values to it, as in your data
na_idxs <- sample(x = 1:100, size = 30)
data$price[na_idxs] <- NA

# start from the first non-NA price, so that leading NA rows report the first
# observed price (as in the desired output) instead of an arbitrary 0
c_max <- data$price[!is.na(data$price)][1]

# preallocate the vector containing the best prices
best_price <- numeric(nrow(data))

# for every price
for (i in seq_along(data$price)) {
  current <- data$price[i]

  # NA prices and prices at or below c_max leave c_max unchanged;
  # only a strictly higher price updates it
  if (!is.na(current) && current > c_max) {
    c_max <- current
  }

  # record the best price seen up to and including row i
  best_price[i] <- c_max
}

# add best_price to data as a new last column
data$best_price <- best_price

# output the data

    time price best_price
1      1     8          8
2      2    13         13
3      3     7         13
4      4     7         13
5      5     6         13
6      6    14         14
7      7     6         14
8      8    NA         14
9      9     7         14
10    10    NA         14
11    11     9         14
12    12    11         14
13    13    16         16
14    14    14         16
15    15    14         16
16    16    NA         16
17    17     4         16
18    18    11         16

Answer 3

你可以尝试这个（假设NA为0）

# Parse the sample series; the columns arrive unnamed (V1, V2)
df <- read.table(text = "
           15:30:01    NA
           15:30:02    NA
           15:30:03    36
           15:30:04    38
           15:30:05    37.5
           15:30:06    NA
           15:30:07    NA
           15:30:08    37
           15:30:09    37.8
           15:30:10    39
           15:30:11    40
           15:30:12    38.5
           15:30:13    38
           15:30:14    38")

names(df) <- c("Time", "Price")

# Missing prices are treated as 0 (this overwrites the Price column)
df$Price[is.na(df$Price)] <- 0

# Seed the running best with the first strictly positive price
Bestprice <- df$Price[which(df$Price > 0)[1]]

# Walk the series once, keeping the highest price seen so far
n_rows <- nrow(df)
best_so_far <- numeric(n_rows)
for (i in seq_len(n_rows)) {
  Bestprice <- max(Bestprice, df$Price[i])
  best_so_far[i] <- Bestprice
}

df$BestPrice <- best_so_far

> df
Time Price BestPrice
1  15:30:01   0.0        36
2  15:30:02   0.0        36
3  15:30:03  36.0        36
4  15:30:04  38.0        38
5  15:30:05  37.5        38
6  15:30:06   0.0        38
7  15:30:07   0.0        38
8  15:30:08  37.0        38
9  15:30:09  37.8        38
10 15:30:10  39.0        39
11 15:30:11  40.0        40
12 15:30:12  38.5        40
13 15:30:13  38.0        40
14 15:30:14  38.0        40

通过函数更新数据框以查找连续的最佳值

3 个答案: