根据第二个数据帧中的值计算第一个数据帧中某列的总和

时间:2017-12-08 11:24:40

标签: r dplyr

我有2个数据帧。第一个有一个ID列,其中有很多ID。第二个只包含第一个数据帧中的特定行,这些行是我的标记。我需要根据第二个数据帧中的id值,得到特定列中值的总和。第一个数据帧可能是

id   goals   cards   group
1      2       2       1
2      3       2       1
3      4       2       1
4      5       1       1      
5      1       2       1
1      2       2       2
2      3       2       2
3      4       2       2
4      5       1       3      
5      1       2       3

第二个:

 id   goals   cards   group
  2      3       2      1
  5      1       2      1
  2      3       2      2
  3      4       2      2
  5      1       2      3

我需要得到的东西:

 id   goals   cards   group   points
 1      2       2       1      2-(2+2)
 2      3       2       1      0 cause in second list
 3      4       2       1      4-(2+1+2)
 4      5       1       1      5-(1+2)
 5      1       2       1      0 cause in second list
 1      2       2       2      2-(2+2)
 2      3       2       2      0
 3      4       2       2      0
 4      5       1       3      5-(1+2)  
 5      1       2       3      0

类似于:??

# Asker's attempted approach: for each row, subtract a sum of cards from goals.
# NOTE(review): `df1$...` inside mutate() refers to the whole columns of the
# outer df1, not to the current row produced by rowwise().
# NOTE(review): `df1$id > df1$id` compares the column with itself, so it is
# never TRUE; the subset passed to sum() is therefore empty.
# NOTE(review): `df1$id <= df2$id` compares two vectors element by element;
# in the example data df1 and df2 have different numbers of rows.
df1<- df1%>%
rowwise() %>% 
mutate(points= 
       goals
       -(sum( df1$cards[df1$id <= df2$id & df1$id>df1$id])))

1 个答案:

答案 0 :(得分:1)

# Example data: one row per id with its goals and cards.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved word.
df1 <- read.table(text = "
id   goals   cards
1      2       2
2      3       2
3      4       2
4      5       1
5      1       2
", header = TRUE)

# Marker rows: rows of df1 whose ids act as markers for the card sums.
df2 <- read.table(text = "
id   goals   cards  
2      3       2
5      1       2
", header = TRUE)

library(dplyr)

# Sum the cards charged to each id, based on the marker ids in df2.
#
# Reads the global data frames `df1` (columns `id`, `cards`) and `df2`
# (column `id`).
#
# x: a vector of ids.
# Returns a numeric vector with one element per element of `x`:
#   * 0 when the id is itself a marker (present in df2$id);
#   * otherwise the sum of df1$cards for ids from this id up to and including
#     the next marker id;
#   * when no marker id follows, the sum of df1$cards for all ids >= this id.
# Zero-length input gives numeric(0).
GetSumOfCards <- function(x) {
  sum_for_one_id <- function(id) {
    # Marker rows are never charged any cards.
    if (id %in% df2$id) {
      return(0)
    }

    # Marker ids at or after this id; since the id is not a marker itself,
    # these are the markers strictly after it.
    later_markers <- df2$id[df2$id >= id]

    # min() of an empty vector warns and returns Inf; choose Inf explicitly so
    # ids past the last marker give the same result without the warning.
    upper <- if (length(later_markers) > 0) min(later_markers) else Inf

    sum(df1$cards[df1$id >= id & df1$id <= upper])
  }

  # vapply fixes the result type, so empty input yields numeric(0).
  vapply(x, sum_for_one_id, numeric(1))
}


# Per row: look up the cards charged to the id, then subtract them from goals.
df1 %>%
  mutate(sum_cards = GetSumOfCards(id)) %>%
  mutate(points = goals - sum_cards)

#   id goals cards sum_cards points
# 1  1     2     2         4     -2
# 2  2     3     2         0      3
# 3  3     4     2         5     -1
# 4  4     5     1         3      2
# 5  5     1     2         0      1

根据您更新后的问题,对每一行应用类似的函数会使处理过程变得非常慢。因此,此解决方案先对数据进行分组,这样就可以按数据块(多行)一次性计算卡片数:

# Example data with a `group` column: ids repeat across groups.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved word.
df1 <- read.table(text = "
id   goals   cards   group
                 1      2       2       1
                 2      3       2       1
                 3      4       2       1
                 4      5       1       1      
                 5      1       2       1
                 1      2       2       2
                 2      3       2       2
                 3      4       2       2
                 4      5       1       3      
                 5      1       2       3
                 ", header = TRUE)

# Marker rows: (id, group) pairs of df1 that act as markers.
df2 <- read.table(text = "
                 id   goals   cards   group
                 2      3       2      1
                 5      1       2      1
                 2      3       2      2
                 3      4       2      2
                 5      1       2      3
                 ", header = TRUE)

library(dplyr)

# Compute, for every row of df1, the cards charged to it (sum_cards) and the
# resulting points, working on chunks of rows instead of row by row.
df1 %>%
  # Walk each group from the highest id down, so a running total that starts
  # at a marker row also covers the lower-id rows that follow it.
  arrange(group, desc(id)) %>%
  # Flag the rows that also occur in df2. distinct() keeps one row per
  # (id, group) key so a repeated marker cannot duplicate rows of df1.
  left_join(
    df2 %>%
      distinct(id, group) %>%
      mutate(flag = 1),
    by = c("id", "group")
  ) %>%
  mutate(
    flag = coalesce(flag, 0),   # rows absent from df2 get flag 0
    flag2 = cumsum(flag)        # increases by one at every marker row
  ) %>%
  # A chunk starts at a marker row; `group` is part of the key so a chunk
  # never spans two groups.
  group_by(group, flag2) %>%
  # Cumulative sum of cards within the chunk, except marker rows, which get 0.
  mutate(sum_cards = ifelse(flag == 1, 0, cumsum(cards))) %>%
  ungroup() %>%
  # Sort by group and ascending id for the final output.
  arrange(group, id) %>%
  mutate(points = goals - sum_cards) %>%
  # Drop the helper columns.
  select(-flag, -flag2)

# # A tibble: 10 x 6
#      id goals cards group sum_cards points
#   <int> <int> <int> <int>     <dbl>  <dbl>
# 1     1     2     2     1         4     -2
# 2     2     3     2     1         0      3
# 3     3     4     2     1         5     -1
# 4     4     5     1     1         3      2
# 5     5     1     2     1         0      1
# 6     1     2     2     2         4     -2
# 7     2     3     2     2         0      3
# 8     3     4     2     2         0      4
# 9     4     5     1     3         3      2
# 10    5     1     2     3         0      1