Question

我有一个有趣的过滤问题。对于每个TEI，我需要检查它是否存在于任何一行的CHILDREN_LIST中；如果存在，就删除包含它的那个父行。

例如：TEI 611100存在于TEI 611000的CHILDREN_LIST中，因此我需要删除611000行。

这是表的dput（）。谢谢！

# dput() output of the example data frame (12 rows, 8 columns).
# The value is not assigned here; the answers below refer to it as `df`
# (Answer 1) and `a` (Answer 2), so assign it to that name before running them.
# CHILDREN_LIST is a factor whose labels are space-separated child codes;
# it is NA for rows that list no children.
structure(list(TEI = c(611000L, 611100L, 238000L, 452000L, 561000L, 
    621000L, 622000L, 622100L, 623000L, 722000L, 722500L, 722510L
    ), OWNERSHIP = c(30L, 30L, 50L, 50L, 50L, 50L, 50L, 50L, 50L, 
    50L, 50L, 50L), RESULT = c(266.9, 259.5, 138, 103.3, 105.8, 130, 
    230, 214.1, 171.9, 204, 185.2, 185.2), CODE = c(3L, 4L, 3L, 3L, 
    3L, 3L, 3L, 4L, 3L, 3L, 4L, 5L), CHILDREN_LIST = structure(c(4L, 
    NA, 1L, 2L, 3L, 5L, 6L, NA, 7L, 8L, 9L, 10L), .Label = c("238100 238200 238300 238900", 
    "452100 452900", "561100 561200 561300 561400 561500 561600 561700 561900", 
    "611100 611200", "621100 621200 621300 621400 621500 621600 621900", 
    "622100 622200 622300", "623100 623200 623300 623900", "722300 722400 722500", 
    "722510", "722511 722513 722514 722515"), class = "factor"), 
        ESTIMATE_TYPE = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
        TRUE, TRUE, TRUE, TRUE, TRUE), NAICS_LABEL = c(611, 6111, 
        238, 452, 561, 621, 622, 6221, 623, 722, 7225, 72251), NAICS_TITLE = structure(c(3L, 
        4L, 11L, 7L, 1L, 2L, 8L, 6L, 9L, 5L, 10L, 10L), .Label = c("Administrative and support services", 
        "Ambulatory health care services", "Educational services", 
        "Elementary and secondary schools", "Food services and drinking places", 
        "General medical and surgical hospitals", "General merchandise stores", 
        "Hospitals", "Nursing and residential care facilities", "Restaurants", 
        "Specialty trade contractors"), class = "factor")), .Names = c("TEI", 
    "OWNERSHIP", "RESULT", "CODE", "CHILDREN_LIST", "ESTIMATE_TYPE", 
    "NAICS_LABEL", "NAICS_TITLE"), row.names = c(NA, 12L), class = "data.frame")

Answer 1

library(dplyr)

# Construct a numeric vector of child codes for each row.
# lapply() always returns a list with one element per row; sapply() would
# silently collapse to a matrix whenever every row lists the same number of
# children, and the per-row test below would then run per cell instead.
child_list <- df$CHILDREN_LIST %>%
  as.character() %>%
  strsplit("\\W+") %>%
  lapply(as.numeric)

# Test whether any child listed on a row is itself present as a TEI.
# NA children (rows without a CHILDREN_LIST) are dropped first so that they
# can never match an NA in df$TEI. vapply() guarantees a logical vector.
has_child <- vapply(
  child_list,
  function(ch) any(ch[!is.na(ch)] %in% df$TEI),
  logical(1)
)

# Keep only the rows none of whose children appear in the table
df[!has_child, , drop = FALSE]

Answer 2

假设您所说的“任何CHILDREN_LIST”是指该特定行的列表中的任何元素，下面是我的做法。我知道在R中使用for循环并不受欢迎，但在这里它使代码更清晰。

# One logical flag per row: TRUE when the row's own TEI appears in its own
# CHILDREN_LIST. Preallocated instead of being grown inside the loop.
which_rows_to_delete <- logical(nrow(a))

# Iterate over rows. length(a) is the number of columns of a data frame, and
# seq_len() is also safe when the data frame has zero rows.
for (i in seq_len(nrow(a))) {
  # first create a vector of all the TEI in the children list
  children <- unlist(strsplit(as.character(a$CHILDREN_LIST[i]), split = " "))
  # check if the TEI of the row matches any element of the vector;
  # a row with a missing CHILDREN_LIST never matches
  check <- !is.na(a$CHILDREN_LIST[i]) && a$TEI[i] %in% children
  # store that information in another vector
  which_rows_to_delete[i] <- check
}
# Drop every flagged row using the full per-row mask, not the value `check`
# happened to hold after the last iteration.
a <- a[!which_rows_to_delete, , drop = FALSE]

以上代码假设“任何CHILDREN_LIST”是指该特定行的列表中的任何元素。如果不是这样，而是需要检查TEI是否与CHILDREN_LIST列中任意条目的任意元素匹配，请将上述代码中的children替换为：

# Pool the child codes from every CHILDREN_LIST entry into one character
# vector. as.character() handles factor and character columns alike, whereas
# levels() is NULL for a character column and makes strsplit() fail.
# Missing entries are dropped so no NA ends up among the codes.
children_all <- unlist(strsplit(
  unique(as.character(a$CHILDREN_LIST[!is.na(a$CHILDREN_LIST)])),
  split = " "
))

您提供的输入没有任何此类重叠，因此此数据框的输出是相同的。但是这个代码应该可以正常工作。：）

如果子行存在，则过滤掉父行

2 个答案: