library(plyr);
library(sqldf);
library(data.table)
library(stringi);
library(RODBC);
dbhandle <- odbcDriverConnect('driver={SQL Server};server=.;database=TEST_DB;trusted_connection=true')
res <- sqlQuery(dbhandle, 'Select Company_ID,
AsOfDate,
CashFlow FROM dbo.Accounts')
resdatatable = as.data.table(res)
odbcCloseAll();
sppv <- function(i, n) {
return((1 + i / 100) ^ (-n))
}
npv <- function(x, i) {
npv = c()
for (k in 1:length(i)) {
pvs = x * sppv(i[k], 1:length(x))
npv = c(npv, sum(pvs))
}
return(npv)
}
xirr <- function(cashflow, dates) {
if (length(cashflow) != length(dates)) {
stop("length(cashflow) != length(dates)")
}
cashflow_adj <- c(cashflow[1])
for (i in 1:(length(cashflow) - 1)) {
d1 <- as.Date(dates[i], "%d-%m-%Y", origin = "1970-01-01")
d2 <- as.Date(dates[i + 1], "%d-%m-%Y", origin = "1970-01-01")
# There are no checks about the monotone values of dates
# put a check in here if the interval is negative
interval <- as.integer(d2 - d1)
if (length(interval) > 0 && !is.na(interval)) {
cashflow_adj <- c(cashflow_adj, rep(0, interval - 1), cashflow[i + 1])
}
}
left = -10
right = 10
epsilon = 1e-8
while (abs(right - left) > 2 * epsilon) {
midpoint = (right + left) / 2
if (npv(cashflow_adj, left) * npv(cashflow_adj, midpoint) > 0) {
left = midpoint
} else {
right = midpoint
}
}
irr = (right + left) / 2 / 100
irr <- irr * 365
# Annualized yield (return) reflecting compounding effect of daily returns
irr <- (1 + irr / 365) ^ 365 - 1
irr
}
groupedCompanyNames <- unique(as.character(resdatatable$Company_ID));
groupedDatesPerCompany <- split(resdatatable$AsOfDate, resdatatable$Company_ID);
groupedCashFlowsPerCompany <- split(resdatatable$CashFlow, resdatatable$Company_ID);
resultsDataFrame <- data.table(Company_ID = character(length(groupedCompanyNames)), XIRR = numeric(length(groupedCompanyNames)));
datalist = result <- vector("list", length(groupedCompanyNames));
for (i in groupedCompanyNames) {
datesForCompany <- groupedDatesPerCompany[i];
dates <- datesForCompany[[i]];
cashFlowsForCompany <- groupedCashFlowsPerCompany[i];
cashFlows <- cashFlowsForCompany[[i]];
xirrResult <- tryCatch(xirr(cashFlows, dates),
error = function(e) {
0
});
newRow <- data.frame(Company_ID = i, XIRR = format(round(xirrResult, 2), nsmall = 2));
datalist[[i]] <- newRow;
}
resultsDataFrame <- data.table::rbindlist(datalist)
finalDataFrame <- as.data.frame(resultsDataFrame);
print(finalDataFrame);
因此,为了提供上下文,我尝试执行以下操作:
以下是我正在使用的源数据示例
Company_ID CashFlow AsOfDate
3F68D729-D69D-E711-9C98-5065F34B3E7D 368608.0000 2004-11-30 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 366999.0000 2004-12-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 326174.0000 2005-01-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 345666.0000 2005-02-28 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D -1529180.0000 2005-03-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D -65259.0000 2005-04-30 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 514005.0000 2005-05-31 00:00:00.000
3F68D729-D69D-E711-9C98-5065F34B3E7D 512951.0000 2005-06-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-06-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-07-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6572.0000 2011-08-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-09-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6572.0000 2011-10-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6792.0000 2011-11-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6791.0000 2011-12-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -187375.0000 2012-01-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -215902.0000 2012-02-29 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -6572.0000 2012-03-31 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -217409.0000 2012-04-30 00:00:00.000
9B64D729-D69D-E711-9C98-5065F34B3E7D -191830.0000 2012-05-31 00:00:00.000
我是R的新手 - 大约有2000个独特的公司名称,平均50个日期,每个现金流组合= 100000个记录,循环需要大约28秒来处理。
我已经看过使用asParallel库并使用了foreach,但这似乎没有对速度产生任何影响。如果我取出函数xirr的调用,则循环被处理并立即完成。
xirr需要异常处理,因为有时无法迭代计算xirr值。
我知道循环并不是R中的最佳实践 - 有关如何将其矢量化以获得更好性能的任何建议吗?
答案 0 :(得分:0)
为了提高性能,我使用了doParallel库。
library(doParallel)
cl <- makeCluster(detectCores() - 1, type = 'PSOCK')
registerDoParallel(cl)
而不是for循环,我将逻辑放入foreach
resultsDataFrame <- foreach(n = 1:length(groupedCompanyNames), .combine = rbind) %dopar% {
company_id <- groupedCompanyNames[n];
datesForCompany <- groupedDatesPerCompany[n];
dates <- unsplit(datesForCompany, company_id);
cashFlowsForCompany <- groupedCashFlowsPerCompany[n];
cashFlows <- unsplit(cashFlowsForCompany, company_id);
#now calculate the xirr for the values
xirrResult <- tryCatch(xirr(cashFlows, dates),
error = function(e) {
0
});
data.frame(Company_ID = company_id, XIRR = format(round(xirrResult, 2), nsmall = 2));
}
registerDoSEQ();
当我将完整的数据集(4000家公司)与他们的日期和现金流一起运行时。原始循环总共需要40万行,大约需要10分钟。使用foreach循环并利用机器中的额外核心,操作需要60秒。
我希望有人能够在此基础上建议进一步的表现高峰,但我认为这是一个很好的改进。