对一个数据帧中的数据求和,并按另一个数据帧中的键进行分组

时间:2014-12-22 07:58:24

标签: r

我是 R 的新手。我想对下面的数据进行汇总:第一个数据帧给出每个 Id 对应的 Symbol,第二个数据帧按日期给出每个 Id 的数值。

Id  Symbol
T1  MSFT
T2  MSFT
T3  AAPL
T4  GOOG
T5  AAPL


   Date     T1      T2      T3      T4      T5
12/3/2014   100     150     100     150     100
12/2/2014   200     200     300     250     330
12/1/2014   300     250     400     250     420
11/28/2014  200     650     150     500     150

我想要的输出是

   Date     MSFT        AAPL        GOOG
12/3/2014   100+150     100+100     150
12/2/2014   200+200     300+330     250
12/1/2014   300+250     400+420     250
11/28/2014  200+650     150+150     500

更新

# Updated example data (dput output): six rows, one per date, with one
# numeric column per Id (T2, T1, T3). Date is stored as a factor.
df2 <- structure(list(Date = structure(c(6L, 5L, 4L, 3L, 2L, 1L), .Label = c("11/26/2014", 
"11/27/2014", "11/28/2014", "12/1/2014", "12/2/2014", "12/3/2014"
), class = "factor"), T2 = c(482544, 475968, 470304, 468000, 
466752, 460656), T1 = c(457878.7879, 455656.5657, 452424.2424, 
447171.7172, 445252.5253, 439595.9596), T3 = c(8155612.90284, 
8066709.67752, 7982838.70956, 7926924.73092, 7901763.441, 7799440.86012
)), .Names = c("Date", "T2", "T1", "T3"), row.names = c(NA, 6L
), class = "data.frame")

        Date     T2       T1      T3
1  12/3/2014 482544 457878.8 8155613
2  12/2/2014 475968 455656.6 8066710
3  12/1/2014 470304 452424.2 7982839
4 11/28/2014 468000 447171.7 7926925
5 11/27/2014 466752 445252.5 7901763
6 11/26/2014 460656 439596.0 7799441

# Updated lookup table (dput output): one row per Id (T1..T3) with its Fund,
# Symbol and Quantity. Id, Fund and Symbol are factors; Quantity is integer.
df1 <- structure(list(Id = structure(1:3, .Label = c("T1", "T2", 
"T3"), class = "factor"), Fund = structure(c(1L, 1L, 1L), .Label = "F1", class = "factor"), 
    Symbol = structure(c(3L, 1L, 2L), .Label = c("AAPL", 
    "GOOG", "MSFT"), class = "factor"), Quantity= c(5000L, 4800L, 
    7800L)), .Names = c("Id", "Fund",  "Symbol", 
"Quantity"), row.names = c(NA, 3L), class = "data.frame")

        Id Fund Symbol Quantity
1       T1     F1     MSFT     5000
2       T2     F1     AAPL     4800
3       T3     F1     GOOG     7800

2 个答案:

答案 0 :(得分:3)

你可以尝试

# Start the result with the Date column only (kept as a one-column data frame).
df3 <- df2[1]
# Group the Ids by Symbol and sum the matching df2 columns row by row.
# as.character() guards against df1$Id being a factor: indexing df2 with a
# factor would use its integer codes instead of the column names.
lst <- lapply(split(as.character(df1$Id), df1$Symbol),
              function(ids) rowSums(df2[ids]))
# Add one summed column per Symbol.
df3[names(lst)] <- lst
df3
#       Date AAPL GOOG MSFT
#1  12/3/2014  200  150  250
#2  12/2/2014  630  250  400
#3  12/1/2014  820  250  550
#4 11/28/2014  300  500  850

或者

 # Same result via Map(): pair the single data frame with each Symbol's Ids.
 # as.character() keeps the column lookup by name even if df1$Id is a factor.
 lst <- split(as.character(df1$Id), df1$Symbol)
 df3[names(lst)] <- Map(function(dat, ids) rowSums(dat[ids]), list(df2), lst)

更新

基于更新的数据集

 # Start from the Date column, then add one summed column per Symbol.
 df3 <- df2[1]
 # The key column of df1 is `Id`. df1 has no `TaxlotId` column, so
 # df1$TaxlotId would be NULL and no df2 columns would be selected.
 lst <- split(as.character(df1$Id), df1$Symbol)
 # For each Symbol, sum its Id columns of df2 row by row.
 lst1 <- lapply(lst, function(ids) rowSums(df2[ids]))
 df3[names(lst1)] <- lst1
 df3
 #    Date   AAPL    GOOG     MSFT
 #1  12/3/2014 482544 8155613 457878.8
 #2  12/2/2014 475968 8066710 455656.6
 #3  12/1/2014 470304 7982839 452424.2
 #4 11/28/2014 468000 7926925 447171.7
 #5 11/27/2014 466752 7901763 445252.5
 #6 11/26/2014 460656 7799441 439596.0

或者

  # Alternative to lapply(): for each Symbol's Ids in lst, sum those df2
  # columns row by row and store the result under the Symbol's name.
  df3[names(lst)] <- mapply(function(ids) rowSums(df2[ids]), lst,
                            SIMPLIFY = FALSE, USE.NAMES = FALSE)

另一个选项(对于大数据集)是使用gather将数据转换为long形式,然后使用spread重新转换为wide形式(与@David Arenburg的方法相同)

library(dplyr)
library(tidyr)
# Reshape df2 to long form, attach each Id's Symbol, sum the values per
# Date/Symbol pair, then spread back to one column per Symbol.
# The explicit sum is needed when several Ids share a Symbol: without it
# spread() stops with an error on the duplicated Date/Symbol rows instead of
# adding them up.
left_join(gather(df2, Var, Val, -Date), df1[, c("Id", "Symbol")],
          by = c("Var" = "Id")) %>%
  group_by(Date, Symbol) %>%
  summarise(Val = sum(Val)) %>%
  ungroup() %>%
  spread(Symbol, Val) %>%
  as.data.frame()  # print as a plain data frame

 #        Date   AAPL    GOOG     MSFT
 #1 11/26/2014 460656 7799441 439596.0
 #2 11/27/2014 466752 7901763 445252.5
 #3 11/28/2014 468000 7926925 447171.7
 #4  12/1/2014 470304 7982839 452424.2
 #5  12/2/2014 475968 8066710 455656.6
 #6  12/3/2014 482544 8155613 457878.8

数据

# Example lookup table: each Id (T1..T5) and the Symbol it belongs to.
# Both columns are kept as character vectors.
df1 <- data.frame(
  Id = c("T1", "T2", "T3", "T4", "T5"),
  Symbol = c("MSFT", "MSFT", "AAPL", "GOOG", "AAPL"),
  stringsAsFactors = FALSE
)

# Example values: one row per date, one integer column per Id (T1..T5).
# Date is kept as a character vector.
df2 <- data.frame(
  Date = c("12/3/2014", "12/2/2014", "12/1/2014", "11/28/2014"),
  T1 = c(100L, 200L, 300L, 200L),
  T2 = c(150L, 200L, 250L, 650L),
  T3 = c(100L, 300L, 400L, 150L),
  T4 = c(150L, 250L, 250L, 500L),
  T5 = c(100L, 330L, 420L, 150L),
  stringsAsFactors = FALSE
)

答案 1 :(得分:2)

这是一个使用data.table包的选项,通过"熔化"(melt)和"铸造"(dcast)数据集来实现。这种方法对于大型数据集的连接应该非常高效,因为它通过键连接(代码中的 df3[df1])完成匹配,而不像@akrun的答案那样使用循环

library(data.table)
# Convert df2 to a data.table by reference, then melt it to long form with
# Date as the id column.
setDT(df2)
df3 <- melt(df2, id.vars = "Date")
# Key the long table on `variable` so that df3[df1] joins on that key.
setkey(df3, variable)
# Join with df1, then cast back to wide form, summing `value` per Date/Symbol.
dcast.data.table(df3[df1], Date ~ Symbol, fun.aggregate = sum,
                 value.var = "value")
#          Date   AAPL    GOOG     MSFT
# 1: 11/26/2014 460656 7799441 439596.0
# 2: 11/27/2014 466752 7901763 445252.5
# 3: 11/28/2014 468000 7926925 447171.7
# 4:  12/1/2014 470304 7982839 452424.2
# 5:  12/2/2014 475968 8066710 455656.6
# 6:  12/3/2014 482544 8155613 457878.8