Question

我想基于某一列（下面数据中的 B 列）来筛选我的数据。换句话说，如果某一行的值以字母 'c' 结尾，并且 'c' 之前的编号与其他行的编号相同，我就想删除这一行（字母 a 和 b 表示父母，c 表示孩子；我想从同一家庭中移除孩子）。

输入：

# Example input: column B holds "<family>-<number><role>" codes, where the
# trailing letter is the role (a/b = parent, c = child); one code is missing.
# stringsAsFactors = FALSE keeps A and B as character vectors on every R
# version.
df <- data.frame(
  A = rep("L", 9),
  B = c("l-7b", "l-7a", "l-7c", NA, "l-15a", "l-15c", "l-16c", "l-20b",
        "l-20c"),
  C = c(6002, 6003, 6006, 6007, 6010, 6011, 6015, 6087, 6098),
  stringsAsFactors = FALSE
)

输出：

    A      B         C

1   L    l-7b      6002                
2   L    l-7a      6003                            
4   L       NA     6007                 
5   L    l-15a     6010
7   L    l-16c     6015
8   L    l-20b     6087

Answer 1

使用 dplyr 的一种思路：

library(dplyr)

# Make sure A and B are character vectors (they may be factors, depending on
# how df was built); substr()/nchar() below need character input.
df[1:2] <- lapply(df[1:2], as.character)

df %>%
  mutate(
    # family id: the code in B without its trailing role letter
    family = substr(B, 1, nchar(B) - 1),
    # a child is a code that ENDS with "c"; anchoring the pattern avoids
    # matching a "c" elsewhere in the code. grepl() returns FALSE for NA.
    is_child = grepl("c$", B)
  ) %>%
  group_by(family) %>%
  # always keep parents; keep a child only when its family has no parent row
  filter(!is_child | all(is_child)) %>%
  ungroup() %>%
  select(A, B, C)

# A tibble: 6 × 3
#      A     B     C
#  <chr> <chr> <int>
#1     L  l-7b  6002
#2     L  l-7a  6003
#3     L  <NA>  6007
#4     L l-15a  6010
#5     L l-16c  6015
#6     L l-20b  6087

Answer 2

替代dplyr + tidyr方法：

# Rebuild the example data from whitespace-separated text; the literal "NA"
# in column B is read as a missing value.
df <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "A      B         C
L    l-7b      6002
L    l-7a      6003
L    l-7c      6006
L       NA     6007
L    l-15a     6010
L    l-15c     6011
L    l-16c     6015
L    l-20b     6087
L    l-20c     6098"
)

library(dplyr)
library(tidyr)

# Drop the child rows (code2 == "c") from one family.
#
# x: data frame holding the rows of a single family; must have a `code2`
#    column with the role letter of each row.
# Returns a data frame: `x` unchanged when it has at most one row, otherwise
# `x` without its child rows. Rows whose code2 is NA are kept.
filter_kids <- function(x) {
  # A lone row has no relatives to compare against, so it is returned as-is
  # (a bare `if` without `else` would return NULL here).
  if (nrow(x) <= 1) {
    return(x)
  }
  # `code2 != "c"` is NA for missing codes; treat those rows as "not a child"
  keep <- is.na(x$code2) | x$code2 != "c"
  x[keep, , drop = FALSE]
}

# Split B into the family code (code1) and the role letter (code2), apply
# filter_kids() to every family, then convert the result to a plain data
# frame. Because the data is still grouped when select() runs, the grouping
# column code1 is kept next to A, B and C.
extract(df, B, c("code1", "code2"), "([[:alpha:]]-[[:digit:]]+)([[:alpha:]])", remove=FALSE) %>%
  group_by(code1) %>%
  do(filter_kids(.)) %>%
  select(A, B, C) %>%
  data.frame()
##   code1 A     B    C
## 1  l-15 L l-15a 6010
## 2  l-16 L l-16c 6015
## 3  l-20 L l-20b 6087
## 4   l-7 L  l-7b 6002
## 5   l-7 L  l-7a 6003
## 6  <NA> L  <NA> 6007

在最后加上了 data.frame()，以便按要求把结果转换回普通的数据框（data.frame）。

Answer 3

这是一个使用基础 R（base R）的可能解决方案：

### Recreate the example data frame.
### The header has one field fewer than the data rows, so read.csv() uses the
### first field of every row as the row names.
DF <- read.csv(text=
"A,B,C
1,L,l-7b,6002                
2,L,l-7a,6003                
3,L,l-7c,6006                
4,L,NA,6007                 
5,L,l-15a,6010
6,L,l-15c,6011
7,L,l-16c,6015")

### Create a copy of DF with helper columns used to build the subset
DFExt <- DF
# family code: B without its trailing role letter "a", "b" or "c"
DFExt$BCode <- gsub("^(.+)[abc]$", "\\1", DF$B)
# TRUE when B ends with "c", i.e. the row is a child (FALSE for NA)
DFExt$IsChild <- grepl("c$", DF$B)
# number of parent (non-child) rows in each family.
# Summing a 0/1 indicator per BCode avoids indexing by row number; ave()
# leaves rows with a missing BCode untouched, so such a row simply counts
# itself.
DFExt$NumParents <- ave(as.integer(!DFExt$IsChild), DFExt$BCode, FUN = sum)

### Subset: remove the rows where IsChild is TRUE and NumParents > 0
DFSubset <- DF[!(DFExt$IsChild & DFExt$NumParents > 0), ]

> DFSubset
  A     B    C
1 L  l-7b 6002
2 L  l-7a 6003
4 L  <NA> 6007
5 L l-15a 6010
7 L l-16c 6015

Answer 4

这是使用基本R的另一种方式：

# Example data: A is a row id, B the family code with its role letter
# (a/b = parent, c = child); one code is missing.
A <- seq_len(7)
B <- c("1-1a", "1-1c", "1-2c", "1-3b", "1-2b", NA, "1-4c")
df <- data.frame(A = A, B = B)

> df
  A    B
1 1 1-1a
2 2 1-1c
3 3 1-2c
4 4 1-3b
5 5 1-2b
6 6 <NA>
7 7 1-4c

执行：

rowToRemove <- integer(0)

# Work on character values so the code also behaves when B is a factor
codes <- as.character(df$B)

# Family id of every row: the code without its trailing role letter
family <- sub("[[:alpha:]]$", "", codes)

# Find the rows whose code ends with "c" (the children); NA never matches
rw <- which(grepl("c$", codes))

for (k in rw) {
  # Count the rows that belong to exactly the same family as child k.
  # An exact comparison is used instead of a regex search so that family
  # "1-1" does not also match codes such as "1-10a".
  family_size <- sum(family == family[k], na.rm = TRUE)

  # If the child is not alone in its family, tag it for removal
  if (family_size > 1) {
    rowToRemove <- c(rowToRemove, k)
  }
}

# Drop the tagged rows with a logical mask: a negative index built from an
# empty rowToRemove would select zero rows instead of keeping all of them.
df <- df[!(seq_len(nrow(df)) %in% rowToRemove), , drop = FALSE]

结果：

> df
  A    B
1 1 1-1a
4 4 1-3b
5 5 1-2b
6 6 <NA>
7 7 1-4c

我确信这个 for 循环可以用 *apply 系列函数改写成向量化的形式，不过这个练习就留给你了。

如何分离和删除具有某些特定值的行？

4 个答案: