我有这样的数据
# Example input (dput output): a data.frame with 16 rows and three columns,
# all stored as factors.
#   string - 12 distinct sequence labels
#   key    - 3 levels: "Mys: G52: ru1", "Mys: G52: ru2", "Mys: G52: ru3"
#   val    - numeric-looking text stored as factor labels; convert with
#            as.numeric(as.character(val)) before using it as a number
df <- structure(list(string = structure(c(6L, 12L, 8L, 7L, 2L, 1L,
6L, 12L, 9L, 5L, 11L, 6L, 10L, 3L, 4L, 4L), .Label = c("CGSKDNIKHVPGGGSVQIVYKPVDLSK",
"ESPLQTPTEDGSEEPGSETSDAK", "HVPGGGSVQIVYKPVDLSKVTSK", "KDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK",
"QEFEVMEDHAGTYGLGDR", "SKDGTGSDDKK", "SPSSAKSRLQTAPVPMPDLKNVK",
"SRLQTAPVPMPDLK", "SRLQTAPVPMPDLKNVKSK", "SRLQTAPVPMPDLKNVKSKIGSTENLK",
"STPTAEDVTAPLVDEGAPGK", "VQIINKKLDLSNVQSK"), class = "factor"),
key = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 3L), .Label = c("Mys: G52: ru1", "Mys: G52: ru2",
"Mys: G52: ru3"), class = "factor"), val = structure(c(3L,
15L, 2L, 11L, 9L, 5L, 13L, 6L, 1L, 7L, 8L, 16L, 12L, 4L,
10L, 14L), .Label = c("1442983324", "1451319531", "1512864.443",
"1612410048", "16349475.63", "1784901841", "30553282.01",
"317403612.9", "3612004.547", "3686081.063", "39135868.44",
"43701608", "64223793.8", "64959501.42", "775987137.8", "9767666215"
), class = "factor")), .Names = c("string", "key", "val"), class = "data.frame", row.names = c(NA,
-16L))
我想只保留那些在第二列(key)的 2 个或更多不同取值中出现的 string。
例如,在上述数据中我们只能保留以下内容
SKDGTGSDDKK is in 3 of them (ru1, ru2 and ru3)
VQIINKKLDLSNVQSK is in 2 of them (ru1, ru2)
其余的字符串都只在一个 key 中出现。
所以输出将是
string key val
SKDGTGSDDKK Mys: G52: ru1 1512864.443
SKDGTGSDDKK Mys: G52: ru2 64223793.8
SKDGTGSDDKK Mys: G52: ru3 9767666215
VQIINKKLDLSNVQSK Mys: G52: ru1 775987137.8
VQIINKKLDLSNVQSK Mys: G52: ru2 1784901841
答案 0 :(得分:1)
您可以基于 string 和 key 构建一个列联表(table)来实现:
# Logical string-by-key matrix: TRUE where that (string, key) pair occurs
seen <- table(df$string, df$key) > 0
# Number of distinct keys each string occurs under
keys_per_string <- rowSums(seen)
# Names of the strings found under more than one key
multi_key <- names(keys_per_string)[keys_per_string > 1]
# Keep only the rows of df that belong to those strings
df[df$string %in% multi_key, ]
string key val
1 SKDGTGSDDKK Mys: G52: ru1 1512864.443
2 VQIINKKLDLSNVQSK Mys: G52: ru1 775987137.8
7 SKDGTGSDDKK Mys: G52: ru2 64223793.8
8 VQIINKKLDLSNVQSK Mys: G52: ru2 1784901841
12 SKDGTGSDDKK Mys: G52: ru3 9767666215
答案 1 :(得分:1)
我先给出一个基础 R 的解决方案:使用 length + unique 的组合,确保每个 string 至少对应 2 个不同的 key:
# Group the rows of df by string: one data frame per string level
by_string <- split(df, df$string)
# TRUE for every group that covers at least two distinct keys
keep <- vapply(
  by_string,
  function(grp) length(unique(grp$key)),
  integer(1)
) >= 2
# Stack the surviving groups back into a single data frame and print it
df.sel <- do.call(rbind.data.frame, by_string[keep])
df.sel
#                              string           key         val
#SKDGTGSDDKK.1            SKDGTGSDDKK Mys: G52: ru1 1512864.443
#SKDGTGSDDKK.7            SKDGTGSDDKK Mys: G52: ru2  64223793.8
#SKDGTGSDDKK.12           SKDGTGSDDKK Mys: G52: ru3  9767666215
#VQIINKKLDLSNVQSK.2  VQIINKKLDLSNVQSK Mys: G52: ru1 775987137.8
#VQIINKKLDLSNVQSK.8  VQIINKKLDLSNVQSK Mys: G52: ru2  1784901841
# Sort by string, then by val read as a number.
# val is a factor, so it must pass through as.character() before
# as.numeric(); as.numeric() alone would return the integer level codes.
df.sel[
  with(df.sel, order(string, as.numeric(as.character(val)))),
]
#                              string           key         val
#SKDGTGSDDKK.1            SKDGTGSDDKK Mys: G52: ru1 1512864.443
#SKDGTGSDDKK.7            SKDGTGSDDKK Mys: G52: ru2  64223793.8
#SKDGTGSDDKK.12           SKDGTGSDDKK Mys: G52: ru3  9767666215
#VQIINKKLDLSNVQSK.2  VQIINKKLDLSNVQSK Mys: G52: ru1 775987137.8
#VQIINKKLDLSNVQSK.8  VQIINKKLDLSNVQSK Mys: G52: ru2  1784901841
对 df2 使用同样的方法:
# Group the rows of df2 by string: one data frame per string level
by_string <- split(df2, df2$string)
# TRUE for every group that spans two or more distinct keys
keep <- vapply(
  by_string,
  function(grp) length(unique(grp$key)),
  integer(1)
) >= 2
# Print numbers with up to 9 significant digits (session-wide setting)
options(digits = 9)
# Recombine the kept groups, turn val (a factor) into a real number,
# then sort by string and numeric val
df2.sel <- do.call(rbind.data.frame, by_string[keep])
df2.sel$val <- as.numeric(as.character(df2.sel$val))
df2.sel <- df2.sel[order(df2.sel$string, df2.sel$val), ]
df2.sel
#SKDGTGSDDKK.1 SKDGTGSDDKK Mys: G52: ru1
#SKDGTGSDDKK.2 SKDGTGSDDKK Mys: G52: ru2
#SKDGTGSDDKK.3 SKDGTGSDDKK Mys: G52: ru3
#SRLQTAPVPMPDLKNVKSK.12 SRLQTAPVPMPDLKNVKSK Mys: G52: ru3
#SRLQTAPVPMPDLKNVKSK.13 SRLQTAPVPMPDLKNVKSK Mys: G52: ru1
#SRLQTAPVPMPDLKNVKSK.11 SRLQTAPVPMPDLKNVKSK Mys: G52: ru2
#SRLQTAPVPMPDLKNVKSKIGSTENLK.14 SRLQTAPVPMPDLKNVKSKIGSTENLK Mys: G52: ru3
#SRLQTAPVPMPDLKNVKSKIGSTENLK.15 SRLQTAPVPMPDLKNVKSKIGSTENLK Mys: G52: ru2
#VQIINKKLDLSNVQSK.4 VQIINKKLDLSNVQSK Mys: G52: ru1
#VQIINKKLDLSNVQSK.5 VQIINKKLDLSNVQSK Mys: G52: ru2
# val
#SKDGTGSDDKK.1 1512864.44
#SKDGTGSDDKK.2 64223793.80
#SKDGTGSDDKK.3 9767666215.00
#SRLQTAPVPMPDLKNVKSK.12 30553282.01
#SRLQTAPVPMPDLKNVKSK.13 317403612.90
#SRLQTAPVPMPDLKNVKSK.11 1442983324.00
#SRLQTAPVPMPDLKNVKSKIGSTENLK.14 43701608.00
#SRLQTAPVPMPDLKNVKSKIGSTENLK.15 1612410048.00
#VQIINKKLDLSNVQSK.4 775987137.80
#VQIINKKLDLSNVQSK.5 1784901841.00
这里的关键是:as.numeric(as.character(...)) 的转换本身不受影响,但只有设置了 options(digits = 9) 之后,转换得到的数值才会以足够的有效位数打印出来(请参阅 here)。
答案 2 :(得分:1)
如果您假设字符串没有重复的键值(在您的情况下不是真的),则以下内容将起作用:
# For every string, count how many DISTINCT keys it occurs under.
# Counting rows (FUN = length) would over-count a string that is repeated
# under the same key and wrongly keep it in the result.
pre_repeated <- with(
  df,
  aggregate(
    x = list(key = key),
    by = list(string = string),
    FUN = function(x) length(unique(x))
  )
)
# Strings seen under more than one key (the key column now holds the count)
repeated <- pre_repeated[pre_repeated$key > 1, ]
# Rows of df that belong to those strings
df[df$string %in% repeated$string, ]
如果您能告诉我在同一个字符串有重复 key 的情况下希望如何处理,我可以给出更有用的答案。如果想忽略重复项,可以把 FUN 参数替换为 function(x) length(unique(x))。
谢谢!
答案 3 :(得分:1)
根据列a中每个值的列b中唯一值的数量过滤数据框:
# Keep every string that occurs under more than one distinct key, then
# sort the remaining rows by string and val. val is sorted as stored
# (a factor in the example data), i.e. by level order, not numerically.
df %>%
  group_by(string) %>%
  filter(n_distinct(key) > 1) %>%
  arrange(string, val)
输出:
# A tibble: 5 x 3
# Groups: string [2]
string key val
<fctr> <fctr> <fctr>
1 SKDGTGSDDKK Mys: G52: ru1 1512864.443
2 SKDGTGSDDKK Mys: G52: ru2 64223793.8
3 SKDGTGSDDKK Mys: G52: ru3 9767666215
4 VQIINKKLDLSNVQSK Mys: G52: ru2 1784901841
5 VQIINKKLDLSNVQSK Mys: G52: ru1 775987137.8
只获取上述唯一的string
值:
# List each string that occurs under more than one distinct key, once
df %>%
  group_by(string) %>%
  filter(n_distinct(key) > 1) %>%
  distinct(string)
输出:
# A tibble: 2 x 1
# Groups: string [2]
string
<fctr>
1 SKDGTGSDDKK
2 VQIINKKLDLSNVQSK
汇总说明每个字符串被保留的原因:
# One line per retained string explaining why it was kept.
# The count and the listed runs are based on DISTINCT keys, so a string
# repeated under the same key is not reported as being in extra runs.
df %>%
  group_by(string) %>%
  filter(n_distinct(key) > 1) %>%
  summarize(reason = paste0(
    "is in ", n_distinct(key), " of them (",
    # Strip the common "Mys: G52: " prefix, leaving only the run name
    paste(sub("Mys: G52: ", "", unique(key)), collapse = ", "), ")"
  ))
输出:
# A tibble: 2 x 2
string reason
<fctr> <chr>
1 SKDGTGSDDKK is in 3 of them (ru1, ru2, ru3)
2 VQIINKKLDLSNVQSK is in 2 of them (ru1, ru2)