我想基于某一列(下面数据中的B列)筛选我的数据。换句话说,如果某一行的B列以字母'c'结尾,并且字母之前的编号与其他行的编号相同,我想删除这一行(字母a和b表示父母,c表示孩子;我想把与父母属于同一家庭的孩子删除)。
输入:
# Example data for the question: nine rows, where B holds a family code
# followed by a member letter (one value is missing) and C is a numeric id.
# The stray closing brace that followed the call was removed; it made the
# snippet a syntax error.
df <- structure(list(
  A = c("L", "L", "L", "L", "L", "L", "L", "L", "L"),
  B = c("l-7b", "l-7a", "l-7c", NA, "l-15a", "l-15c", "l-16c", "l-20b", "l-20c"),
  C = c(6002, 6003, 6006, 6007, 6010, 6011, 6015, 6087, 6098)),
  .Names = c("A", "B", "C"),
  row.names = c(NA, 9L),
  class = "data.frame")
输出:
A B C
1 L l-7b 6002
2 L l-7a 6003
4 L NA 6007
5 L l-15a 6010
7 L l-16c 6015
8 L l-20b 6087
答案 0 :(得分:1)
使用dplyr:
library(dplyr)

# Make sure A and B are character (they may arrive as factors) so the string
# functions below work on the codes themselves.
df[1:2] <- lapply(df[1:2], as.character)

df %>%
  mutate(
    # family code = B without its trailing letter, e.g. "l-7b" -> "l-7"
    family = sub("[[:alpha:]]$", "", B),
    # a child is a row whose code ENDS in "c"; grepl() is FALSE for NA,
    # so rows with a missing code are never treated as children
    is_child = grepl("c$", B)
  ) %>%
  group_by(family) %>%
  # keep every non-child; keep a child only when its family has no parent row
  filter(!is_child | all(is_child)) %>%
  ungroup() %>%
  select(A, B, C)
# A tibble: 6 × 3
# A B C
# <chr> <chr> <int>
#1 L l-7b 6002
#2 L l-7a 6003
#3 L <NA> 6007
#4 L l-15a 6010
#5 L l-16c 6015
#6 L l-20b 6087
答案 1 :(得分:1)
另一种使用dplyr + tidyr的方法:
# Recreate the example data from whitespace-separated text. The first line is
# the header, strings stay character, and the literal NA in column B is read
# as a missing value.
df <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "A B C
L l-7b 6002
L l-7a 6003
L l-7c 6006
L NA 6007
L l-15a 6010
L l-15c 6011
L l-16c 6015
L l-20b 6087
L l-20c 6098"
)
library(dplyr)
library(tidyr)
# Remove the children of one family when that family also has a parent row.
#
# x: a data frame holding the rows of a single family; it must have a
#    character column `code2` with the member letter ("c" marks a child).
#
# Returns a data frame with the same columns as `x`. The previous version
# returned NULL for single-row groups because the `if` had no fall-through
# value; the data is now always returned.
filter_kids <- function(x) {
  # Rows with a missing letter are never treated as children.
  is_child <- !is.na(x$code2) & x$code2 == "c"
  # Only children in this family (no parent row): nothing to remove.
  if (all(is_child)) {
    return(x)
  }
  x[!is_child, , drop = FALSE]
}
# Split B into a family code (code1) and a member letter (code2), filter each
# family with filter_kids(), and return a plain data.frame with A, B and C.
extract(df, B, c("code1", "code2"),
        "([[:alpha:]]-[[:digit:]]+)([[:alpha:]])", remove = FALSE) %>%
  group_by(code1) %>%
  do(filter_kids(.)) %>%
  # drop the grouping, otherwise select() keeps the grouping column code1
  ungroup() %>%
  select(A, B, C) %>%
  # the pipe into data.frame() was missing, so the result was never converted
  data.frame()
##   A     B    C
## 1 L l-15a 6010
## 2 L l-16c 6015
## 3 L l-20b 6087
## 4 L  l-7b 6002
## 5 L  l-7a 6003
## 6 L  <NA> 6007
在最后添加了data.frame(),以便按要求把结果转换回普通的data.frame。
答案 2 :(得分:1)
这是使用基础R(base R)的一种可能的解决方案:
### recreate your example data frame
### recreate your example data frame
# The header line has one field fewer than the data lines, so read.csv()
# uses the first field of every row (1..7) as the row names.
DF <- read.csv(text=
"A,B,C
1,L,l-7b,6002
2,L,l-7a,6003
3,L,l-7c,6006
4,L,NA,6007
5,L,l-15a,6010
6,L,l-15c,6011
7,L,l-16c,6015")

### create a new DF copy with extra columns useful for our subset
DFExt <- DF
# family code: B without its trailing "a", "b" or "c" (a single anchored
# replacement, so sub() is enough)
DFExt$BCode <- sub("^(.+)[abc]$", "\\1", DF$B)
# TRUE when B ends with "c"; grepl() returns FALSE for a missing B
DFExt$IsChild <- grepl("c$", DF$B)
# for each BCode, how many non-child rows (parents) share it; summing the
# 0/1 parent flag per group avoids indexing back into DFExt via 1:nrow()
DFExt$NumParents <- ave(as.integer(!DFExt$IsChild), DFExt$BCode, FUN = sum)

### let's subset removing the rows where IsChild=TRUE and NumParents > 0
DFSubset <- DF[!(DFExt$IsChild & DFExt$NumParents > 0), , drop = FALSE]
> DFSubset
A B C
1 L l-7b 6002
2 L l-7a 6003
4 L <NA> 6007
5 L l-15a 6010
7 L l-16c 6015
答案 3 :(得分:1)
这是使用基本R的另一种方式:
# Small example: A is a row id, B a family code plus member letter
# (one value is missing).
A <- seq_len(7L)
B <- c("1-1a", "1-1c", "1-2c", "1-3b", "1-2b", NA, "1-4c")
df <- data.frame(A = A, B = B)
> df
A B
1 1 1-1a
2 2 1-1c
3 3 1-2c
4 4 1-3b
5 5 1-2b
6 6 <NA>
7 7 1-4c
执行:
# Family code of every row: B without its trailing member letter
family <- sub("[[:alpha:]]$", "", df$B)
# Children are the rows whose code ENDS in "c" (FALSE for a missing B)
is_child <- grepl("c$", df$B)
# One removal flag per row, preallocated instead of growing a vector
drop_row <- logical(nrow(df))
for (k in which(is_child)) {
  # Tag the child when a non-child row has exactly the same family code.
  # The comparison is exact, so "1-1" no longer also matches "1-10a".
  drop_row[k] <- any(!is_child & family %in% family[k])
}
rowToRemove <- which(drop_row)
# Subset with a logical mask: df[-rowToRemove, ] would drop EVERY row
# whenever nothing had been tagged for removal.
df <- df[!drop_row, , drop = FALSE]
结果:
> df
A B
1 1 1-1a
4 4 1-3b
5 5 1-2b
6 6 <NA>
7 7 1-4c
我确定这个for循环可以使用*apply系列函数进行向量化,但我把这个练习留给你。