如何按组创建顺序计数,以排除上面各行中的值

时间:2018-10-11 02:05:18

标签: r

我有这样的df:

# Example data: entry_id is the grouping key (three groups: 222, 223, 224);
# id_1 and id_2 are the two value columns that are compared within each group.
entry_id <- c(222,222,222,222,222,223,223,223,223,224,224,224,224,224,224,224)
id_1 <- c(2,4,3,5,1,3,1,4,2,6,3,7,2,1,9,5)
id_2 <- c(1,3,5,2,8,2,7,3,1,2,4,9,5,3,2,8)
df <- data.frame(entry_id,id_1,id_2)

对于每个 entry_id,我想对 id_1 的值做连续计数,但只统计那些没有出现在上方各行 id_2 中的值。如果某个 id_1 值已经在(同一 entry_id 内)上方某行的 id_2 中出现过,我希望把它标记为 NA。我的示例数据的期望结果如下:

df$result <- c(1,2,NA,NA,NA,1,2,3,NA,1,2,3,NA,4,NA,NA)

我这样做的尝试看起来像这样。首先,我添加一个row_index

df$row_index <- seq.int(nrow(df))

然后,我尝试用字符串标记要计数的变量,并用NA标记不想计数的变量。不幸的是,这行不通。

df$result <- apply(df,1,function(x) ifelse(x["id_1"] %in% x["id_2"][1:x["row_index"] - 1],NA,"count_this"))

如果我可以使以上代码正常工作,那么我接下来要做的就是这样:

df <- transform(df,result = ave(result, entry_id, FUN = function(x) cumsum(!is.na(x))))

执行此操作的最佳方法是什么?

3 个答案:

答案 0 :(得分:0)

使用 dplyr:

说明,让我们看看最后一组:

df %>%
  group_by(entry_id) %>%
  # Within each group: position of the first id_2 equal to each id_1
  # (NA when the value never occurs in id_2).
  mutate(m = match(id_1, id_2)) %>%
  # A row is counted when its id_1 never occurs in id_2, or first occurs at
  # the current row or later (i.e. not in any row above).
  mutate(m = is.na(m) | m >= row_number()) %>%
  # Running count over the counted rows; all other rows get NA.
  mutate(r = if_else(m, cumsum(m), NA_integer_)) %>%
  ungroup() %>%
  select(-m)
# # A tibble: 16 x 4
#    entry_id  id_1  id_2     r
#       <dbl> <dbl> <dbl> <int>
#  1      222     2     1     1
#  2      222     4     3     2
#  3      222     3     5    NA
#  4      222     5     2    NA
#  5      222     1     8    NA
#  6      223     3     2     1
#  7      223     1     7     2
#  8      223     4     3     3
#  9      223     2     1    NA
# 10      224     6     2     1
# 11      224     3     4     2
# 12      224     7     9     3
# 13      224     2     5    NA
# 14      224     1     3     4
# 15      224     9     2    NA
# 16      224     5     8    NA

以最后一组为例:

x <- df[10:16,]
match(x$id_1, x$id_2)
# [1] NA  5 NA  1 NA  3  4

match 返回第一个匹配位置的索引(如果找不到则为 NA)。如果结果是 NA,说明该值在 id_2 中根本不存在,因此应当计数。如果该索引大于或等于这个值自身在向量中的位置,说明它最早也是在当前行或之后才出现,因此同样应当计数。如果该索引小于自身在向量中的位置,说明它已经在上方的行中出现过,结果应为 NA。

在此基础上,我创建了(临时的)逻辑条件 m,用来指示哪些行应当被计数:

df %>%
  group_by(entry_id) %>%
  mutate(
    m = match(id_1, id_2),
    m = (is.na(m) | m >= row_number()),
    r = if_else(m, cumsum(m), NA_integer_)
  ) %>%
  ungroup()
# # A tibble: 16 x 5
#    entry_id  id_1  id_2 m         r
#       <dbl> <dbl> <dbl> <lgl> <int>
#  1      222     2     1 TRUE      1
#  2      222     4     3 TRUE      2
#  3      222     3     5 FALSE    NA
#  4      222     5     2 FALSE    NA
#  5      222     1     8 FALSE    NA
#  6      223     3     2 TRUE      1
#  7      223     1     7 TRUE      2
#  8      223     4     3 TRUE      3
#  9      223     2     1 FALSE    NA
# 10      224     6     2 TRUE      1
# 11      224     3     4 TRUE      2
# 12      224     7     9 TRUE      3
# 13      224     2     5 FALSE    NA
# 14      224     1     3 TRUE      4
# 15      224     9     2 FALSE    NA
# 16      224     5     8 FALSE    NA

最后,由 cumsum(m) 保持计数。

答案 1 :(得分:0)

我很可能把问题复杂化了,不过这里给出一种使用 base R 的方法:

# For every entry_id group, number the rows whose id_1 has not appeared in
# id_2 of any row above it; rows whose id_1 has already appeared get NA.
# unsplit() writes each group's values back to that group's original row
# positions, so the result stays aligned even when df is not sorted by
# entry_id (unlist() would return the values in group order instead).
df$result1 <- unsplit(
  lapply(split(df, df$entry_id), function(grp) {
    # is_new[i] is TRUE when id_1[i] is absent from id_2 of rows 1..(i - 1).
    # seq_len(i - 1) is empty for the first row, so that row is always counted.
    is_new <- vapply(
      seq_len(nrow(grp)),
      function(i) !(grp$id_1[i] %in% grp$id_2[seq_len(i - 1)]),
      logical(1)
    )
    # cumsum() over the logical flags gives the running count of counted rows.
    ifelse(is_new, cumsum(is_new), NA_integer_)
  }),
  df$entry_id
)

df
#   entry_id id_1 id_2 result result1
#1       222    2    1      1       1
#2       222    4    3      2       2
#3       222    3    5     NA      NA
#4       222    5    2     NA      NA
#5       222    1    8     NA      NA
#6       223    3    2      1       1
#7       223    1    7      2       2
#8       223    4    3      3       3
#9       223    2    1     NA      NA
#10      224    6    2      1       1
#11      224    3    4      2       2
#12      224    7    9      3       3
#13      224    2    5     NA      NA
#14      224    1    3      4       4
#15      224    9    2     NA      NA
#16      224    5    8     NA      NA

我们按 entry_id 对数据帧进行 split,这样每个 entry_id 都对应一个单独的数据帧。然后,对于每个数据帧,我们遍历每一行,检查该行的 id_1 值是否已经出现在上方各行的 id_2 值中。如果在 id_2 中找不到该 id_1 值,就用 cumsum 递增计数器,否则返回 NA。

答案 2 :(得分:0)

您可以定义一个函数,然后使用split。某些循环可能无法避免。

# Rebuild the example data: entry_id is the grouping key; id_1 and id_2 are
# the two value columns compared within each group.
entry_id <- c(222,222,222,222,222,223,223,223,223,224,224,224,224,224,224,224)
id_1 <- c(2,4,3,5,1,3,1,4,2,6,3,7,2,1,9,5)
id_2 <- c(1,3,5,2,8,2,7,3,1,2,4,9,5,3,2,8)
df <- data.frame(entry_id,id_1,id_2)
# Hand-written reference column used to compare against the computed result.
df$result <- c(1,2,NA,NA,NA,1,2,3,NA,1,2,3,NA,4,NA,NA)

# Sequentially number the elements of `a` that have not appeared earlier in `b`.
#
# Arguments:
#   a  Vector of values to be numbered.
#   b  Vector of the same length as `a`; element i of `a` is looked up in
#      b[1], ..., b[i - 1] only (strictly earlier positions).
#
# Returns a double vector of length(a): the running count (1, 2, 3, ...) for
# every element of `a` not found in the earlier part of `b`, and NA for every
# element that was found there.
my_check <- function(a, b) {
  n <- length(a)
  res <- rep(NA_real_, n)
  counter <- 0
  for (i in seq_len(n)) {
    # seq_len(i - 1) is empty when i == 1, so the first element is never
    # compared against its own row (1:max(1, i - 1) used to include b[1]).
    if (!(a[i] %in% b[seq_len(i - 1)])) {
      # Keep a running counter instead of recomputing a cumsum() every pass.
      counter <- counter + 1
      res[i] <- counter
    }
  }
  res
}

# Run my_check() separately on each entry_id group. unsplit() writes every
# group's values back to that group's original row positions, so the new
# column stays aligned even when df is not sorted by entry_id (unlist() would
# return the values in group order instead).
df$result2 <- unsplit(
  lapply(
    split(df[, c("id_1", "id_2")], df$entry_id),
    function(grp) my_check(grp$id_1, grp$id_2)
  ),
  df$entry_id
)
df


#   entry_id id_1 id_2 result result2
#1       222    2    1      1       1
#2       222    4    3      2       2
#3       222    3    5     NA      NA
#4       222    5    2     NA      NA
#5       222    1    8     NA      NA
#6       223    3    2      1       1
#7       223    1    7      2       2
#8       223    4    3      3       3
#9       223    2    1     NA      NA
#10      224    6    2      1       1
#11      224    3    4      2       2
#12      224    7    9      3       3
#13      224    2    5     NA      NA
#14      224    1    3      4       4
#15      224    9    2     NA      NA
#16      224    5    8     NA      NA