以下两个数据框摘自我目前正在使用的数据。df1包含各投资者(以id区分)的持仓历史,即他们所持有的各产品的份额数量;每当份额数量发生变化时,就会新增一条记录。df2包含各产品对应的价格。
我正在尝试计算所有客户在投资期间的每月投资组合价值。
这里是df1和df2的可重现示例:
library(dplyr)
library(lubridate)
library(tidyr)
library(timeDate)
# Example inputs ----

# df1: customer portfolio history. One row per holding period of a product
# for an investor (`id`); a new row is created whenever the number of shares
# (`amount`) changes. `start` and `end` bound the period.
df1 <- data.frame(
  id = c("1", "1", "1", "1", "2", "2", "2", "3", "3", "3", "3", "3"),
  start = as.Date(c(
    "2012-03-11", "2012-04-17", "2012-05-09", "2012-05-11",
    "2012-11-17", "2012-12-09", "2013-01-21", "2011-06-27",
    "2012-07-02", "2012-07-21", "2012-09-03", "2012-09-16"
  )),
  end = as.Date(c(
    "2012-05-08", "2012-05-21", "2012-06-11", "2012-11-16",
    "2012-12-08", "2013-01-20", "2013-02-03", "2011-07-01",
    "2012-09-15", "2012-09-02", "2012-09-20", "2012-09-16"
  )),
  product = c("a", "b", "a", "b", "b", "b", "b", "c", "c", "a", "a", "c"),
  amount = c(5, 12, 7, 11, 3, 8, 6, 4, 1, 16, 17, 9),
  stringsAsFactors = FALSE
)

# df2: one price per product and month, dated at the last day of the month.

# Returns one row per month between `from` and `to` (first-of-month dates)
# for a single product.
monthly_rows <- function(product, from, to) {
  data.frame(
    date = seq.Date(from = as.Date(from), to = as.Date(to), by = "month"),
    product = product,
    stringsAsFactors = FALSE
  )
}

df2 <- rbind(
  monthly_rows("a", "2011-05-01", "2013-02-01"),
  monthly_rows("b", "2012-04-01", "2013-02-01"),
  monthly_rows("c", "2011-06-01", "2012-09-01")
)

# Prices are random placeholders; fix the seed so every run of this example
# draws the same prices and yields the same portfolio values.
set.seed(42)
df2$price <- as.numeric(sample(100, size = nrow(df2), replace = TRUE))

# Move each first-of-month date to the last day of that month.
df2$date <- as.Date(timeLastDayInMonth(df2$date))
我最终的做法是:将投资者数据展开(spread)为宽格式,并为每个月末人为添加一行日期。然后,我对价格数据做同样的处理,将两者连接起来,最后使用rowSums计算投资组合价值。
这是我上面数据框的代码:
# Monthly portfolio value per investor ----
#
# Input:  df1 (long: id, start, end, product, amount) and
#         df2 (long: date, product, price; `date` is a month-end).
# Output: df1 with one row per id and month-end, one `xxx_<product>` holdings
#         column and one `yyy_<product>` price column per product, plus
#         `portfoliovalue`. df2 is replaced by its wide form.

# Holdings to wide format: one `xxx_<product>` column per product. Columns are
# addressed by prefix from here on, so the code does not depend on how many
# products there are or where their columns sit.
df1 <- df1 %>%
  pivot_wider(
    names_from = product,
    values_from = amount,
    names_prefix = "xxx_",
    names_sort = TRUE
  )

# Month-end grid per investor, running from the month before the first `start`
# to the month of the last `end`. Stepping one day back from the first of the
# month and flooring again lands on the first day of the previous month.
month_ends <- df1 %>%
  group_by(id) %>%
  summarise(
    first_month = floor_date(floor_date(min(start), "month") - 1, "month"),
    last_day = max(end)
  ) %>%
  reframe(
    start = seq(first_month, last_day, by = "1 month"),
    .by = id
  ) %>%
  mutate(start = as.Date(timeLastDayInMonth(start)))

# Add the month-end rows, carry the latest known holdings forward within each
# investor, then keep only the month-end rows. Holdings that were never set
# (before the first purchase of a product) become 0.
# NOTE(review): `end` is only used to bound the grid above. Each product's
# amount is carried forward until a later row for the same id supplies a new
# amount for that product, so a holding is not reset to 0 once its `end` has
# passed - confirm this is the intended valuation rule.
df1 <- df1 %>%
  full_join(month_ends, by = c("id", "start")) %>%
  arrange(id, start) %>%
  group_by(id) %>%
  fill(starts_with("xxx_"), .direction = "down") %>%
  ungroup() %>%
  filter(start == as.Date(timeLastDayInMonth(start))) %>%
  select(-end) %>%
  mutate(across(starts_with("xxx_"), function(x) replace_na(x, 0)))

# Prices to wide format: one `yyy_<product>` column per product, keyed by the
# same `start` (month-end) column as the holdings.
df2 <- df2 %>%
  pivot_wider(
    names_from = product,
    values_from = price,
    names_prefix = "yyy_",
    names_sort = TRUE
  ) %>%
  rename(start = date)

df1 <- left_join(df1, df2, by = "start")

# Pair every holdings column with the price column of the same product by
# name rather than by position, so differing product sets cannot misalign.
holdings <- select(df1, starts_with("xxx_"))
price_cols <- sub("^xxx_", "yyy_", names(holdings))
unpriced <- setdiff(price_cols, names(df1))
if (length(unpriced) > 0) {
  stop(
    "No price column for held product(s): ",
    paste(unpriced, collapse = ", "),
    call. = FALSE
  )
}

# Months without a price for a product contribute 0 (na.rm = TRUE).
df1$portfoliovalue <- rowSums(
  as.matrix(holdings) * as.matrix(df1[price_cols]),
  na.rm = TRUE
)
该代码能得到期望的结果,即每位投资者每月的投资组合价值。如前所述,这只是全部数据的一小部分。不幸的是,我遇到了麻烦,尤其是宽数据框的大小(随着产品数量的增加,列数会变得非常多)。这使得代码无法在更大的数据集上运行。是否可以让数据保持长格式来完成计算?有没有提供此类计算功能的软件包?
答案 0 :(得分:0)
也许软件包PMwR中的某些功能可以提供帮助。(利益声明:我是该软件包的维护者。)使用PMwR,您可以创建交易日志(journal,即资金流水、持仓变动),然后据此计算持仓(position)。例如:
library("PMwR")
library("datetimeutils")
## `df1` must be the original long-format data here (columns id, product,
## amount, start, end), not a reshaped version.
##
## Each row of df1 becomes two journal entries: +amount booked at `start`
## and -amount booked at `end`, both for the same account and instrument.
j <- journal(timestamp  = c(df1$start, df1$end),
             amount     = c(df1$amount, -df1$amount),
             instrument = rep(df1$product, times = 2),
             account    = rep(df1$id, times = 2))

## Last calendar day of every month in 2012.
month.ends <- nth_day(period = "month",
                      n      = "last",
                      start  = as.Date("2012-01-01"),
                      end    = as.Date("2012-12-31"))

## Positions at each month end, one column per account:instrument pair.
position(j, use.account = TRUE, when = month.ends)
## 1:a 1:b 2:b 3:a 3:c
## 2012-01-31 0 0 0 0 0
## 2012-02-29 0 0 0 0 0
## 2012-03-31 5 0 0 0 0
## 2012-04-30 5 12 0 0 0
## 2012-05-31 7 11 0 0 0
## 2012-06-30 0 11 0 0 0
## 2012-07-31 0 11 0 16 1
## 2012-08-31 0 11 0 16 1
## 2012-09-30 0 11 0 0 0
## 2012-10-31 0 11 0 0 0
## 2012-11-30 0 0 3 0 0
## 2012-12-31 0 0 8 0 0
更多详细信息在manual中。
更新:当您带when参数调用position时,您将为when中的每个值得到一个持仓。
要加上id,一个简单的方法是遍历各个id,把每个id的持仓放入一个数据框,然后合并这些数据框。
(不能直接调用rbind,因为每个id的产品可能不同。)
## Month-end positions per account in long-ish form: one row per
## (timestamp, id), one column per instrument.
accounts <- unique(j$account)

## One data frame per account: its positions at `month.ends`, tagged with
## the account id.
per_account <- lapply(accounts, function(a) {
  data.frame(timestamp = month.ends,
             id = a,
             position(j[j$account == a],
                      when = month.ends))
})

## Fold the pieces together with merge(all = TRUE) rather than rbind():
## accounts may hold different instruments, so their columns can differ.
## Instruments an account never held come out as NA.
result <- Reduce(function(merged, piece) merge(merged, piece, all = TRUE),
                 per_account)

## NA means "no position in that instrument"; report it as 0.
## With no accounts at all, Reduce() returns NULL and there is nothing to fill.
if (!is.null(result)) {
  result[is.na(result)] <- 0
}
result
## timestamp id a b c
## 1 2012-01-31 1 0 0 0
## 2 2012-01-31 2 0 0 0
## 3 2012-01-31 3 0 0 0
## 4 2012-02-29 1 0 0 0
## 5 2012-02-29 2 0 0 0
## 6 2012-02-29 3 0 0 0
## 7 2012-03-31 1 5 0 0
## 8 2012-03-31 2 0 0 0
## 9 2012-03-31 3 0 0 0
## 10 2012-04-30 1 5 12 0
## 11 2012-04-30 2 0 0 0
## 12 2012-04-30 3 0 0 0
## 13 2012-05-31 1 7 11 0
## 14 2012-05-31 2 0 0 0
## 15 2012-05-31 3 0 0 0
## 16 2012-06-30 1 0 11 0
## 17 2012-06-30 2 0 0 0
## 18 2012-06-30 3 0 0 0
## 19 2012-07-31 1 0 11 0
## 20 2012-07-31 2 0 0 0
## 21 2012-07-31 3 16 0 1
## 22 2012-08-31 1 0 11 0
## 23 2012-08-31 2 0 0 0
## 24 2012-08-31 3 16 0 1
## 25 2012-09-30 1 0 11 0
## 26 2012-09-30 2 0 0 0
## 27 2012-09-30 3 0 0 0
## 28 2012-10-31 1 0 11 0
## 29 2012-10-31 2 0 0 0
## 30 2012-10-31 3 0 0 0
## 31 2012-11-30 1 0 0 0
## 32 2012-11-30 2 0 3 0
## 33 2012-11-30 3 0 0 0
## 34 2012-12-31 1 0 0 0
## 35 2012-12-31 2 0 8 0
## 36 2012-12-31 3 0 0 0