有条件地删除基于另一个id代码的行

时间:2015-02-09 00:39:01

标签: r dataframe

在包含许多ID的数据集中,我只是试图操纵id为7或9的行,并保持其他所有内容不受影响。

我试图在没有与之对应的变量的所有实例中有条件地从7或9中删除一行。所以,如果在下面的dput示例的情况下,我想从id = 9中删除第九行,因为id = 7没有itemcode = 2。反之亦然id = 7,我试图删除它的itemcode = 9,因为id = 9没有它。

id  client    item itemcode unit X2001 X2002 X2003 X2004 X2005 X2006 X2007
...
7   7     Bob  eighth        8  100    13    18    15    NA    NA    NA    NA
8   7     Bob   ninth        9  100    11    21    10    NA    NA    NA    NA
9   9 Bob_new   first        1  100    NA    NA    NA    23    18    25    18

代码:

structure(list(id = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 10L), client = structure(c(1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L), .Label = c("Bob", 
"Bob_new", "Mark"), class = "factor"), item = structure(c(3L, 
9L, 4L, 2L, 8L, 7L, 1L, 5L, 3L, 6L, 9L, 4L, 2L, 8L, 7L, 1L, 3L
), .Label = c("eighth", "fifth", "first", "fourth", "ninth", 
"second", "seventh", "sixth", "third"), class = "factor"), itemcode = c(1L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L
), unit = c(100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L), X2001 = structure(c(5L, 
6L, 1L, 4L, 2L, 5L, 3L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L
), .Label = c("11", "12", "13", "22", "24", "25", "NA"), class = "factor"), 
X2002 = structure(c(4L, 8L, 1L, 3L, 7L, 2L, 5L, 6L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L), .Label = c("13", "14", "15", 
"17", "18", "21", "22", "24", "NA"), class = "factor"), X2003 = structure(c(5L, 
1L, 4L, 2L, 6L, 1L, 3L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L), .Label = c("10", "11", "15", "19", "23", "24", "NA"), class = "factor"), 
X2004 = structure(c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 5L, 4L, 
2L, 6L, 1L, 3L, 4L, 3L, 4L), .Label = c("11", "14", "15", 
"20", "23", "25", "NA"), class = "factor"), X2005 = structure(c(6L, 
6L, 6L, 6L, 6L, 6L, 6L, 6L, 3L, 2L, 4L, 3L, 5L, 3L, 1L, 4L, 
3L), .Label = c("11", "13", "18", "19", "25", "NA"), class = "factor"), 
X2006 = structure(c(9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 8L, 6L, 
1L, 2L, 5L, 3L, 7L, 8L, 4L), .Label = c("10", "15", "18", 
"19", "20", "22", "23", "25", "NA"), class = "factor"), X2007 =     structure(c(8L, 
8L, 8L, 8L, 8L, 8L, 8L, 8L, 4L, 7L, 6L, 2L, 4L, 1L, 5L, 5L, 
3L), .Label = c("12", "13", "16", "18", "19", "21", "24", 
"NA"), class = "factor")), .Names = c("id", "client", "item", 
"itemcode", "unit", "X2001", "X2002", "X2003", "X2004", "X2005", 
"X2006", "X2007"), class = "data.frame", row.names = c(NA, -17L
))

---------------------------------------- 另一个情景:

之前:

structure(list(id = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L), client = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 3L), .Label = c("Bob", "Bob_new", "Mark"), class = "factor"), 
item = structure(c(3L, 9L, 10L, 9L, 4L, 2L, 8L, 7L, 7L, 1L, 
5L, 3L, 6L, 9L, 4L, 2L, 8L, 7L, 1L, 3L), .Label = c("eighth", 
"fifth", "first", "fourth", "ninth", "second", "seventh", 
"sixth", "third", "third "), class = "factor"), itemcode = c(1L, 
3L, 3L, 3L, 4L, 5L, 6L, 7L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 
6L, 7L, 8L, 1L), type = structure(c(1L, 1L, 2L, 3L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A", 
"B", "C"), class = "factor"), unit = c(100L, 100L, 100L, 
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 
100L, 100L, 100L, 100L, 100L, 100L, 100L), X2001 = c(24L, 
25L, 30L, 26L, 11L, 22L, 12L, 25L, 24L, 13L, 11L, NA, NA, 
NA, NA, NA, NA, NA, NA, NA), X2002 = c(17L, 24L, 12L, 96L, 
13L, 15L, 22L, 21L, 14L, 18L, 21L, NA, NA, NA, NA, NA, NA, 
NA, NA, NA), X2003 = c(23L, 10L, 46L, 94L, 19L, 11L, 24L, 
19L, 10L, 15L, 10L, NA, NA, NA, NA, NA, NA, NA, NA, NA), 
X2004 = c(NA, NA, 43L, 83L, NA, NA, NA, 6L, NA, NA, NA, 23L, 
20L, 14L, 25L, 11L, 15L, 20L, 15L, 20L), X2005 = c(NA, NA, 
97L, 86L, NA, NA, NA, 17L, NA, NA, NA, 18L, 13L, 19L, 18L, 
25L, 18L, 11L, 19L, 18L), X2006 = c(NA, NA, 11L, 91L, NA, 
NA, NA, 11L, NA, NA, NA, 25L, 22L, 10L, 15L, 20L, 18L, 23L, 
25L, 19L), X2007 = c(NA, NA, 19L, 27L, NA, NA, NA, 15L, NA, 
NA, NA, 18L, 24L, 21L, 13L, 18L, 12L, 19L, 19L, 16L)), .Names = c("id", 
"client", "item", "itemcode", "type", "unit", "X2001", "X2002", 
"X2003", "X2004", "X2005", "X2006", "X2007"), class = "data.frame", row.names = c(NA, 
-20L))

后:

structure(list(id = c(7L, 7L, 7L, 7L, 7L, 7L, 9L, 9L, 9L, 9L, 
9L, 9L, 10L), client = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 3L), .Label = c("Bob", "Bob_new", "Mark"), class = "factor"), 
item = structure(c(2L, 7L, 3L, 1L, 5L, 4L, 2L, 6L, 3L, 1L, 
5L, 4L, 2L), .Label = c("fifth", "first", "fourth", "seventh", 
"sixth", "third", "third "), class = "factor"), itemcode = c(1L, 
3L, 4L, 5L, 6L, 7L, 1L, 3L, 4L, 5L, 6L, 7L, 1L), type = structure(c(1L, 
2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("A", 
"B"), class = "factor"), unit = c(100L, 100L, 100L, 100L, 
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L), X2001 = c(24L, 
10L, 11L, 22L, 12L, 17L, NA, NA, NA, NA, NA, NA, NA), X2002 = c(17L, 
87L, 13L, 15L, 22L, 19L, NA, NA, NA, NA, NA, NA, NA), X2003 = c(23L, 
47L, 19L, 11L, 24L, 17L, NA, NA, NA, NA, NA, NA, NA), X2004 = c(NA, 
28L, NA, NA, NA, 28L, 23L, 14L, 25L, 11L, 15L, 20L, 20L), 
X2005 = c(NA, 43L, NA, NA, NA, 16L, 18L, 19L, 18L, 25L, 18L, 
11L, 18L), X2006 = c(NA, 69L, NA, NA, NA, 5L, 25L, 10L, 15L, 
20L, 18L, 23L, 19L), X2007 = c(NA, 72L, NA, NA, NA, 20L, 
18L, 21L, 13L, 18L, 12L, 19L, 16L)), .Names = c("id", "client", 
"item", "itemcode", "type", "unit", "X2001", "X2002", "X2003", 
"X2004", "X2005", "X2006", "X2007"), class = "data.frame", row.names = c(NA, 
-13L))

我可以实现上述过滤器代码来删除相应位置(id 7和9)中不存在的项目。

但是,如果项目还有子级别(例如项目类型),我也想删除那些在对应的id中没有相同类型的项目。

4 个答案:

答案 0 :(得分:1)

如果我对问题的理解正确: id = 9子集中的每个itemcode都必须在 id = 7子集中有相同的itemcode(反之亦然)。如果不满足这一点,我们就过滤掉商品代码未配对的行,但保留id不是7或9的所有行。这是一种方法:

首先获取两者共有的商品代码:

# Item codes recorded under each of the two ids being reconciled.
items_7 <- df_all$itemcode[df_all$id == 7]
items_9 <- df_all$itemcode[df_all$id == 9]
# Entries of the id-9 codes that also occur among the id-7 codes
# (order and any repeats follow the id-9 subset).
items_common <- items_9[is.element(items_9, items_7)]

选择id为7和9且项目代码为两者共有的行,再加上其余所有行:

# Keep a row when its id is outside {7, 9}, or when its item code is one of
# the codes shared by both ids. `%in%` never yields NA, so the logical mask
# can index the rows directly.
df_new <- df_all[
  !(df_all$id %in% c(7, 9)) | df_all$itemcode %in% items_common,
]

答案 1 :(得分:1)

您可以使用dplyr中的filter:
library(dplyr)
# Rows for ids other than 7 and 9 pass through untouched; rows for 7 or 9
# survive only when their item code occurs under both ids.
df_all %>%
  filter(
    !id %in% c(7, 9) |
      itemcode %in% intersect(itemcode[id == 7], itemcode[id == 9])
  )
#    id  client    item itemcode unit X2001 X2002 X2003 X2004 X2005 X2006 X2007
#1   7     Bob   first        1  100    24    17    23    NA    NA    NA    NA
#2   7     Bob   third        3  100    25    24    10    NA    NA    NA    NA
#3   7     Bob  fourth        4  100    11    13    19    NA    NA    NA    NA
#4   7     Bob   fifth        5  100    22    15    11    NA    NA    NA    NA
#5   7     Bob   sixth        6  100    12    22    24    NA    NA    NA    NA
#6   7     Bob seventh        7  100    24    14    10    NA    NA    NA    NA
#7   7     Bob  eighth        8  100    13    18    15    NA    NA    NA    NA
#8   9 Bob_new   first        1  100    NA    NA    NA    23    18    25    18
#9   9 Bob_new   third        3  100    NA    NA    NA    14    19    10    21
#10  9 Bob_new  fourth        4  100    NA    NA    NA    25    18    15    13
#11  9 Bob_new   fifth        5  100    NA    NA    NA    11    25    20    18
#12  9 Bob_new   sixth        6  100    NA    NA    NA    15    18    18    12
#13  9 Bob_new seventh        7  100    NA    NA    NA    20    11    23    19
#14  9 Bob_new  eighth        8  100    NA    NA    NA    15    19    25    19
#15 10    Mark   first        1  100    NA    NA    NA    20    18    19    16

更新

基于新数据集,或许下面的代码会有帮助:

library(dplyr)
library(tidyr)
# Match rows of id 7 and id 9 on the (itemcode, type) pair rather than on
# itemcode alone. The combined key is added as a temporary column
# (remove = FALSE) and dropped after filtering, so itemcode and type keep
# their original column types instead of coming back as character from a
# unite()/separate() round trip.
dfnew %>%
  unite(itemtype, itemcode, type, remove = FALSE) %>%
  filter(
    itemtype %in% intersect(itemtype[id == 7], itemtype[id == 9]) |
      !id %in% c(7, 9)
  ) %>%
  select(-itemtype)
#    id  client    item itemcode type unit X2001 X2002 X2003 X2004 X2005 X2006
# 1   7     Bob   first        1    A  100    24    17    23    NA    NA    NA
# 2   7     Bob  third         3    B  100    30    12    46    43    97    11
# 3   7     Bob  fourth        4    A  100    11    13    19    NA    NA    NA
# 4   7     Bob   fifth        5    A  100    22    15    11    NA    NA    NA
# 5   7     Bob   sixth        6    A  100    12    22    24    NA    NA    NA
# 6   7     Bob seventh        7    A  100    25    21    19     6    17    11
# 7   9 Bob_new   first        1    A  100    NA    NA    NA    23    18    25
# 8   9 Bob_new   third        3    B  100    NA    NA    NA    14    19    10
# 9   9 Bob_new  fourth        4    A  100    NA    NA    NA    25    18    15
# 10  9 Bob_new   fifth        5    A  100    NA    NA    NA    11    25    20
# 11  9 Bob_new   sixth        6    A  100    NA    NA    NA    15    18    18
# 12  9 Bob_new seventh        7    A  100    NA    NA    NA    20    11    23
# 13 10    Mark   first        1    A  100    NA    NA    NA    20    18    19
 #   X2007
 #1     NA
 #2     19
 #3     NA
 #4     NA
 #5     NA
 #6     15
 #7     18
 #8     21
 #9     13
 #10    18
 #11    12
 #12    19
 #13    16

答案 2 :(得分:1)

library(dplyr)
# Temporary key pairing each item code with its type.
df$remove <- paste(df$itemcode, df$type)
# Keep rows of id 7 / id 9 only when their (itemcode, type) pair occurs
# under both ids; rows of every other id are kept as they are. The
# comparison must be made on `id`: in the example data `type` holds the
# item type (A/B/C), so testing it against 7 and 9 would leave the data
# unfiltered.
df <- filter(
  df,
  remove %in% intersect(remove[id == 7], remove[id == 9]) |
    !id %in% c(7, 9)
)
# Remove the additional column after filter
df$remove <- NULL

答案 3 :(得分:0)

您可以执行以下操作,即在两个方向上运行setdiff()。cl()函数并不是必需的,但我真的不喜欢一遍又一遍地写同一个表达式。

# Values of `y` that are absent from `x`: pool both vectors, then discard
# whatever `x` contributed.
f <- function(x, y) {
  pooled <- union(x, y)
  setdiff(pooled, x)
}
# Build (without evaluating) the call `df$itemcode[df$id == <var>]`,
# with the value of `var` spliced in place of the placeholder.
cl <- function(var) {
  bquote(df$itemcode[df$id == .(var)])
}

现在,您可以对c(id7, id9)调用f(),然后将参数顺序反转,得到c(id9, id7)的结果。

# Item codes found under id 9 but not under id 7.
x <- list(cl(7), cl(9))
do.call(f, x)
# [1] 2
# Swapping the two arguments gives the codes found under id 7 but not id 9.
do.call(f, rev(x))
# [1] 9