Question

DUMMY DATA SET :(与我的数据集不同的是，在我的情况下，item_code是字符串）

Promise.all

我在R中有一个简单的函数，我所做的就是：

in_cluster <- data.frame(item_code = c(1:500))
in_cluster$cluster <-
        sample(5, size = nrow(in_cluster), replace = TRUE)
real_sales <- data.frame(item_code = numeric(0), sales = numeric(0))
real_sales <-
    data.frame(
            item_code = sample(500, size = 100000, replace = TRUE),
            sales = sample(500, size = 100000, replace = TRUE)
    )

mean_trajectory <- data.frame(sales = c(1:52))
mean_trajectory$sales <- sample(500, size = 52, replace = TRUE)
training_df <- data.frame(
        LTF_t_minus_1 = numeric(0),
        LTF_t = numeric(0),
        LTF_t_plus_1 = numeric(0),
        RS_t_minus_1 = numeric(0),
        RS_t = numeric(0),
        STF_t_plus_1 = numeric(0)
)
training_df[nrow(training_df) + 1, ] <-
        c(0, 0, mean_trajectory$sales[[1]], 0, 0, 19) # week 0

week = 2

我是R的新手，发现这真的很奇怪，看看数据实际上有多小，但循环数据帧需要多长时间（system.time({ for (r in 1:nrow(in_cluster)) { item <- in_cluster[r,] sale_row <- dplyr::filter(real_sales, item_code == item$item_code) if (nrow(sale_row) > 2) { new_df <- data.frame( LTF_t_minus_1 = mean_trajectory$sales[[week - 1]], LTF_t = mean_trajectory$sales[[week]], LTF_t_plus_1 = mean_trajectory$sales[[week + 1]], RS_t_minus_1 = sale_row$sales[[week - 1]], RS_t = sale_row$sales[[week]], STF_t_plus_1 = sale_row$sales[[week + 1]] ) training_df <- bind_rows(training_df, new_df) } } })循环500行）。

EDIT_IMPORTANT：但是对于上面给出的虚拟数据集，只需421.59 seconds即可获得输出＆gt;这可能是因为item_code有字符串？是否需要花费大量时间来处理字符串item_code。（我没有将字符串用于虚拟数据集，因为我不知道如何在1.10 seconds中为item_code提供500个唯一字符串，并且在in_cluster中具有与{{1}相同的字符串}}）

我阅读了其他一些文章，这些文章提出了优化R代码的方法并使用real_sales代替item_code或使用：

bind_rows

使用bind_rows似乎在循环500行数据帧rbind

时将性能提高了36秒

在这种情况下是否可以使用lapply？我尝试了下面的代码并得到了一个错误：

filter_impl（.data，dots）出错：$运算符无效原子载体

training_df[nrow(training_df) + 1,] <-
    c(mean_trajectory$sales[[week-1]], mean_trajectory$sales[[week]], mean_trajectory$sales[[week+1]], sale_row$sales[[week-1]], sale_row$sales[[week]], sale_row$sales[[week+1]])

对in_cluster的帮助将不胜感激，但我的主要目标是加快循环。

Answer 1

好的，你的代码中有很多不好的做法。

您正在按行操作
您每行创建2（！）个新数据帧（非常昂贵）
你正在循环中增长对象）training_df <- bind_rows(training_df, new_df)在每次迭代中不断增长，同时运行相当昂贵的操作（bind_rows））
当你只能运行一次时，你一遍又一遍地运行相同的操作（为什么你在mean_trajectory$sales[[week-1]]和mean_trajectory与循环无关的时候运行data.table？你可以之后再分配。）
这个名单继续......

我建议使用一种替代的简单in_cluster解决方案，它会表现得更好。我们的想法是首先在real_sales和mean_trajectory之间建立二进制连接（并在连接时运行所有操作，而不创建额外的数据帧然后绑定它们）。然后，仅运行一次所有training_df[nrow(training_df) + 1, ] <- c(0, 0, mean_trajectory$sales[[1]], 0, 0, 19)相关行。（我忽略了rbind初始化，因为它在此处无关紧要，您可以稍后使用和library(data.table) #v1.10.4 ## First step res <- setDT(real_sales)[setDT(in_cluster), # binary join if(.N > 2) .(RS_t_minus_1 = sales[week - 1], # The stuff you want to do RS_t = sales[week], # by condition STF_t_plus_1 = sales[week + 1]), on = "item_code", # The join key by = .EACHI] # Do the operations per each join ## Second step (run the `mean_trajectory` only once) res[, `:=`(LTF_t_minus_1 = mean_trajectory$sales[week - 1], LTF_t = mean_trajectory$sales[week], LTF_t_plus_1 = mean_trajectory$sales[week + 1])]添加

### Creating your data sets
set.seed(123)
N <- 1e5
N2 <- 5e7

in_cluster <- data.frame(item_code = c(1:N))

real_sales <-
  data.frame(
    item_code = sample(N, size = N2, replace = TRUE),
    sales = sample(N, size = N2, replace = TRUE)
  )

mean_trajectory <- data.frame(sales = sample(N, size = 25, replace = TRUE))

training_df <- data.frame(
  LTF_t_minus_1 = numeric(0),
  LTF_t = numeric(0),
  LTF_t_plus_1 = numeric(0),
  RS_t_minus_1 = numeric(0),
  RS_t = numeric(0),
  STF_t_plus_1 = numeric(0)
)
week = 2

###############################
################# Your solution
system.time({
  for (r in 1:nrow(in_cluster)) {
    item <- in_cluster[r,, drop = FALSE]
    sale_row <-
      dplyr::filter(real_sales, item_code == item$item_code)
    if (nrow(sale_row) > 2) {
      new_df <- data.frame(
        LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
        LTF_t = mean_trajectory$sales[[week]],
        LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
        RS_t_minus_1 = sale_row$sales[[week - 1]],
        RS_t = sale_row$sales[[week]],
        STF_t_plus_1 = sale_row$sales[[week + 1]]
      )
      training_df <-
        bind_rows(training_df, new_df)
    }
  }
}) 
### Ran forever- I've killed it after half an hour


######################
########## My solution
library(data.table)
system.time({
res <-
  setDT(real_sales)[setDT(in_cluster), 
                  if(.N > 2) .(RS_t_minus_1 = sales[week - 1],
                               RS_t = sales[week],
                               STF_t_plus_1 = sales[week + 1]), 
                  on = "item_code",
                  by = .EACHI]
res[, `:=`(LTF_t_minus_1 = mean_trajectory$sales[week - 1],
           LTF_t = mean_trajectory$sales[week],
           LTF_t_plus_1 = mean_trajectory$sales[week + 1])]
})

# user  system elapsed 
# 2.42    0.05    2.47

一些基准：

data.table

因此，对于50MM行，<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android" xmlns:tools="http://schemas.android.com/tools" android:layout_width="match_parent" android:layout_height="match_parent" tools:context="com.mediamer.metwally.saidkotb.activities.makalatActivity" android:layoutDirection="rtl" android:orientation="vertical" > <TabHost android:id="@android:id/tabhost" android:layout_width="match_parent" android:layout_height="match_parent" android:background="#FF0000" > <LinearLayout android:layout_width="match_parent" android:layout_height="match_parent" android:orientation="vertical" > <TabWidget android:id="@android:id/tabs" android:layout_width="match_parent" android:layout_height="wrap_content" > </TabWidget> <FrameLayout android:id="@android:id/tabcontent" android:layout_width="match_parent" android:layout_height="match_parent" > </FrameLayout> </LinearLayout> </TabHost> </LinearLayout>解决方案运行了大约2秒，而您的解决方案无休止地运行直到我将其杀死（半小时后）。

优化R中的for循环

1 个答案: