我使用两个相互错开的窗口框架(一个从0到100000依次类推,另一个从50000到150000依次类推),以100000为间隔对数据进行分箱。然后我把两个数据框合并在一起,并用一列(即“x100Kb”列)来标识每行所属的窗口。
就我的目的而言,如果两行(编辑:它们不需要彼此相邻,因为数据目前没有按“chr”和“x100Kb”排序)在“x100Kb”上相差0.5(最好是将整数与其加0.5后的值进行比较,例如:60与60.5、65与65.5,等等),但在“chr”、“occurrences_norm”和“occurrences_tum”上的值都相同,那么我就认为这两行是重复的,并希望删除其中一行。目前我唯一能想到的办法就是循环,但这显然效率不高……
数据示例:
chr x100Kb occurrences_norm occurrences_tum fold
19064 chr17 61.5 17 0 14.05333
38799 chr5 526.0 16 0 13.96587
38800 chr5 526.5 16 0 13.96587
39946 chr5 1113.5 16 0 13.96587
2377 chr1 1426.0 15 0 13.87277
21859 chr18 733.5 15 0 13.87277
20538 chr18 24.0 14 0 13.77324
21863 chr18 735.5 14 0 13.77324
37699 chr4 1835.5 14 0 13.77324
39924 chr5 1102.5 14 0 13.77324
21506 chr18 550.5 13 0 13.66633
21862 chr18 735.0 13 0 13.66633
22258 chr19 151.5 13 0 13.66633
38972 chr5 613.0 13 0 13.66633
41707 chr6 194.5 13 0 13.66633
2380 chr1 1427.5 12 0 13.55087
20541 chr18 25.5 12 0 13.55087
21252 chr18 421.0 12 0 13.55087
27384 chr2 2243.0 12 0 13.55087
39990 chr5 1135.5 12 0 13.55087
在示例中,第3行将被删除。
答案 0 :(得分:2)
我对这个问题的理解有所不同。我认为需要比较的是任意两个相邻的行,例如检查第1行和第2行、第2行和第3行,依此类推。我还认为条件是x100Kb的差值等于0.5,而不是大于0.5。我认为使用shift()进行四次逻辑检查是实现目标的一种方法。
# Convert df1 to a data.table by reference, then keep every row that is NOT a
# near-duplicate of the row directly above it. A row counts as a duplicate only
# when all four checks hold at once: x100Kb differs from the previous row by
# exactly 0.5, and chr, occurrences_norm and occurrences_tum are all unchanged.
# The condition below is the De Morgan form of that test: a row is kept as soon
# as any one of the four checks fails.
setDT(df1)
df1[
  # fill = -Inf makes the first row's distance infinite, so it is always kept
  # even though the other three lagged comparisons are NA for that row.
  abs(x100Kb - shift(x100Kb, n = 1L, fill = -Inf, type = "lag")) != 0.5 |
    chr != shift(chr, n = 1L, type = "lag") |
    occurrences_norm != shift(occurrences_norm, n = 1L, type = "lag") |
    occurrences_tum != shift(occurrences_tum, n = 1L, type = "lag")
]
# chr x100Kb occurrences_norm occurrences_tum fold
# 1: chr17 61.5 17 0 14.05333
# 2: chr5 526.0 16 0 13.96587
# 3: chr5 1113.5 16 0 13.96587
# 4: chr1 1426.0 15 0 13.87277
# 5: chr18 733.5 15 0 13.87277
# 6: chr18 24.0 14 0 13.77324
# 7: chr18 735.5 14 0 13.77324
# 8: chr4 1835.5 14 0 13.77324
# 9: chr5 1102.5 14 0 13.77324
#10: chr18 550.5 13 0 13.66633
#11: chr18 735.0 13 0 13.66633
#12: chr19 151.5 13 0 13.66633
#13: chr5 613.0 13 0 13.66633
#14: chr6 194.5 13 0 13.66633
#15: chr1 1427.5 12 0 13.55087
#16: chr18 25.5 12 0 13.55087
#17: chr18 421.0 12 0 13.55087
#18: chr2 2243.0 12 0 13.55087
#19: chr5 1135.5 12 0 13.55087
答案 1 :(得分:1)
我们也可以使用data.table
library(data.table)
# Convert df1 to a data.table by reference, then subset it by row number.
# The inner query groups rows sharing chr, occurrences_norm and
# occurrences_tum and, within each group, collects the row indices (.I) whose
# x100Kb lies more than 0.5 away from the previous row of the same group.
# fill = -Inf gives the first row of every group an infinite distance, so it
# is always retained. Result rows come back in group order.
setDT(df1)
df1[
  df1[
    ,
    .I[abs(x100Kb - shift(x100Kb, n = 1L, fill = -Inf, type = "lag")) > 0.5],
    by = c("chr", "occurrences_norm", "occurrences_tum")
  ]$V1
]
# chr x100Kb occurrences_norm occurrences_tum fold
# 1: chr17 61.5 17 0 14.05333
# 2: chr5 526.0 16 0 13.96587
# 3: chr5 1113.5 16 0 13.96587
# 4: chr1 1426.0 15 0 13.87277
# 5: chr18 733.5 15 0 13.87277
# 6: chr18 24.0 14 0 13.77324
# 7: chr18 735.5 14 0 13.77324
# 8: chr4 1835.5 14 0 13.77324
# 9: chr5 1102.5 14 0 13.77324
#10: chr18 550.5 13 0 13.66633
#11: chr18 735.0 13 0 13.66633
#12: chr19 151.5 13 0 13.66633
#13: chr5 613.0 13 0 13.66633
#14: chr6 194.5 13 0 13.66633
#15: chr1 1427.5 12 0 13.55087
#16: chr18 25.5 12 0 13.55087
#17: chr18 421.0 12 0 13.55087
#18: chr2 2243.0 12 0 13.55087
#19: chr5 1135.5 12 0 13.55087
答案 2 :(得分:0)
使用dplyr包
library(dplyr)
# Within each (chr, occurrences_norm, occurrences_tum) group, keep a row only
# when its x100Kb is more than 0.5 away from the previous row of that group.
#
# Two fixes over the earlier diff(c(0, x100Kb)) form:
#   * the sentinel is -Inf instead of 0, so the first row of a group is always
#     kept, even when its x100Kb is 0 or 0.5;
#   * the distance is taken with abs(), so a row whose x100Kb is lower than the
#     previous row's (the data are not sorted) is no longer dropped just
#     because the signed difference is negative.
#
# Rows are compared with their immediate predecessor in the group's current
# order; the input is not re-sorted.
df1 %>%
  group_by(chr, occurrences_norm, occurrences_tum) %>%
  filter(abs(diff(c(-Inf, x100Kb))) > 0.5) %>%
  # Drop the grouping so later verbs do not silently operate per group.
  ungroup()
# chr x100Kb occurrences_norm occurrences_tum fold
# (fctr) (dbl) (int) (int) (dbl)
# 1 chr17 61.5 17 0 14.05333
# 2 chr5 526.0 16 0 13.96587
# 3 chr5 1113.5 16 0 13.96587
# 4 chr1 1426.0 15 0 13.87277
# 5 chr18 733.5 15 0 13.87277
# 6 chr18 24.0 14 0 13.77324
# 7 chr18 735.5 14 0 13.77324
# 8 chr4 1835.5 14 0 13.77324
# 9 chr5 1102.5 14 0 13.77324
# 10 chr18 550.5 13 0 13.66633
# 11 chr18 735.0 13 0 13.66633
# 12 chr19 151.5 13 0 13.66633
# 13 chr5 613.0 13 0 13.66633
# 14 chr6 194.5 13 0 13.66633
# 15 chr1 1427.5 12 0 13.55087
# 16 chr18 25.5 12 0 13.55087
# 17 chr18 421.0 12 0 13.55087
# 18 chr2 2243.0 12 0 13.55087
# 19 chr5 1135.5 12 0 13.55087