通过函数调用改善循环性能

时间:2017-11-22 12:21:36

标签: r performance loops data-science xirr

library(plyr);
library(sqldf);
library(data.table)
library(stringi);
library(RODBC);

dbhandle <- odbcDriverConnect('driver={SQL Server};server=.;database=TEST_DB;trusted_connection=true')
res <- sqlQuery(dbhandle, 'Select Company_ID,
       AsOfDate,
       CashFlow FROM dbo.Accounts')

resdatatable = as.data.table(res)

odbcCloseAll();


sppv <- function(i, n) {
    return((1 + i / 100) ^ (-n))
}


npv <- function(x, i) {
    npv = c()
    for (k in 1:length(i)) {
        pvs = x * sppv(i[k], 1:length(x))
        npv = c(npv, sum(pvs))
    }
    return(npv)
}


xirr <- function(cashflow, dates) {
    if (length(cashflow) != length(dates)) {
        stop("length(cashflow) != length(dates)")
    }

    cashflow_adj <- c(cashflow[1])
    for (i in 1:(length(cashflow) - 1)) {
        d1 <- as.Date(dates[i], "%d-%m-%Y", origin = "1970-01-01")
        d2 <- as.Date(dates[i + 1], "%d-%m-%Y", origin = "1970-01-01")

        # There are no checks about the monotone values of dates
        # put a check in here if the interval is negative

        interval <- as.integer(d2 - d1)

        if (length(interval) > 0 && !is.na(interval)) {
            cashflow_adj <- c(cashflow_adj, rep(0, interval - 1), cashflow[i + 1])
        }
   }

    left = -10
    right = 10
    epsilon = 1e-8
    while (abs(right - left) > 2 * epsilon) {
        midpoint = (right + left) / 2
        if (npv(cashflow_adj, left) * npv(cashflow_adj, midpoint) > 0) {
            left = midpoint
        } else {
            right = midpoint
        }
    }


    irr = (right + left) / 2 / 100
    irr <- irr * 365
    # Annualized yield (return) reflecting compounding effect of daily returns
    irr <- (1 + irr / 365) ^ 365 - 1

    irr
}




groupedCompanyNames <- unique(as.character(resdatatable$Company_ID));




groupedDatesPerCompany <- split(resdatatable$AsOfDate, resdatatable$Company_ID);




groupedCashFlowsPerCompany <- split(resdatatable$CashFlow, resdatatable$Company_ID);


resultsDataFrame <- data.table(Company_ID = character(length(groupedCompanyNames)), XIRR = numeric(length(groupedCompanyNames)));



datalist = result <- vector("list", length(groupedCompanyNames));



for (i in groupedCompanyNames) {


    datesForCompany <- groupedDatesPerCompany[i];
    dates <- datesForCompany[[i]];



    cashFlowsForCompany <- groupedCashFlowsPerCompany[i];
    cashFlows <- cashFlowsForCompany[[i]];


    xirrResult <- tryCatch(xirr(cashFlows, dates),
                           error = function(e) {

                              0
                           });

    newRow <- data.frame(Company_ID = i, XIRR = format(round(xirrResult, 2), nsmall = 2));
    datalist[[i]] <- newRow;

}

resultsDataFrame <- data.table::rbindlist(datalist)
finalDataFrame <- as.data.frame(resultsDataFrame);

print(finalDataFrame);

因此,为了提供上下文,我尝试执行以下操作:

  1. 使用RODBC连接从数据库中获取数据
  2. 获取唯一的公司名称
  3. 拆分每家公司的现金流量和日期
  4. 使用已知行数初始化数据表,以便它不需要 逐渐增长。
  5. 在列表中循环显示唯一的公司名称和调用函数get xirr 公司的现金流量和日期。
  6. 将每行包含公司名称和XIRR值添加到新数据表中。
  7. 使用rbindlist。
  8. 以下是我正在使用的源数据示例

    Company_ID  CashFlow    AsOfDate
    3F68D729-D69D-E711-9C98-5065F34B3E7D    368608.0000 2004-11-30 00:00:00.000
    3F68D729-D69D-E711-9C98-5065F34B3E7D    366999.0000 2004-12-31 00:00:00.000
    3F68D729-D69D-E711-9C98-5065F34B3E7D    326174.0000 2005-01-31 00:00:00.000
    3F68D729-D69D-E711-9C98-5065F34B3E7D    345666.0000 2005-02-28 00:00:00.000
    3F68D729-D69D-E711-9C98-5065F34B3E7D    -1529180.0000   2005-03-31 00:00:00.000
    3F68D729-D69D-E711-9C98-5065F34B3E7D    -65259.0000 2005-04-30 00:00:00.000
    3F68D729-D69D-E711-9C98-5065F34B3E7D    514005.0000 2005-05-31 00:00:00.000
    3F68D729-D69D-E711-9C98-5065F34B3E7D    512951.0000 2005-06-30 00:00:00.000
    9B64D729-D69D-E711-9C98-5065F34B3E7D    -6792.0000  2011-06-30 00:00:00.000
    9B64D729-D69D-E711-9C98-5065F34B3E7D    -6792.0000  2011-07-31 00:00:00.000
    9B64D729-D69D-E711-9C98-5065F34B3E7D    -6572.0000  2011-08-31 00:00:00.000
    9B64D729-D69D-E711-9C98-5065F34B3E7D    -6792.0000  2011-09-30 00:00:00.000
    9B64D729-D69D-E711-9C98-5065F34B3E7D    -6572.0000  2011-10-31 00:00:00.000
    9B64D729-D69D-E711-9C98-5065F34B3E7D    -6792.0000  2011-11-30 00:00:00.000
    9B64D729-D69D-E711-9C98-5065F34B3E7D    -6791.0000  2011-12-31 00:00:00.000
    9B64D729-D69D-E711-9C98-5065F34B3E7D    -187375.0000    2012-01-31 00:00:00.000
    9B64D729-D69D-E711-9C98-5065F34B3E7D    -215902.0000    2012-02-29 00:00:00.000
    9B64D729-D69D-E711-9C98-5065F34B3E7D    -6572.0000  2012-03-31 00:00:00.000
    9B64D729-D69D-E711-9C98-5065F34B3E7D    -217409.0000    2012-04-30 00:00:00.000
    9B64D729-D69D-E711-9C98-5065F34B3E7D    -191830.0000    2012-05-31 00:00:00.000
    

    我是R的新手 - 大约有2000个独特的公司名称,平均50个日期,每个现金流组合= 100000个记录,循环需要大约28秒来处理。

    我已经看过使用asParallel库并使用了foreach,但这似乎没有对速度产生任何影响。如果我取出函数xirr的调用,则循环被处理并立即完成。

    xirr需要异常处理,因为有时无法迭代计算xirr值。

    我知道循环并不是R中的最佳实践 - 有关如何将其矢量化以获得更好性能的任何建议吗?

1 个答案:

答案 0 :(得分:0)

为了提高性能,我使用了doParallel库。

library(doParallel)
cl <- makeCluster(detectCores() - 1, type = 'PSOCK')
registerDoParallel(cl)

而不是for循环,我将逻辑放入foreach

resultsDataFrame <- foreach(n = 1:length(groupedCompanyNames), .combine = rbind) %dopar% {


    company_id <- groupedCompanyNames[n];
    datesForCompany <- groupedDatesPerCompany[n];
    dates <- unsplit(datesForCompany, company_id);


    cashFlowsForCompany <- groupedCashFlowsPerCompany[n];
    cashFlows <- unsplit(cashFlowsForCompany, company_id);

    #now calculate the xirr for the values
    xirrResult <- tryCatch(xirr(cashFlows, dates),
    error = function(e) {

    0
    });



    data.frame(Company_ID = company_id, XIRR = format(round(xirrResult, 2), nsmall = 2));
}

registerDoSEQ();

当我将完整的数据集(4000家公司)与他们的日期和现金流一起运行时。原始循环总共需要40万行,大约需要10分钟。使用foreach循环并利用机器中的额外核心,操作需要60秒。

我希望有人能够在此基础上建议进一步的表现高峰,但我认为这是一个很好的改进。