我有一个数据框:
# Example data: 14 positions (pos) spread over four chromosomes (chrom).
# chrom is a factor with levels "1".."4"; pos is an integer vector.
df <- data.frame(
  chrom = factor(
    rep(c("1", "2", "3", "4"), times = c(6L, 2L, 2L, 4L)),
    levels = c("1", "2", "3", "4")
  ),
  pos = c(
    10L, 200L, 134L, 400L, 600L, 1000L, # chrom 1
    20L, 33L,                           # chrom 2
    40L, 45L,                           # chrom 3
    50L, 55L, 100L, 123L                # chrom 4
  )
)
> head(df)
chrom pos
1 1 10
2 1 200
3 1 134
4 1 400
5 1 600
6 1 1000
我想计算同一条染色体(chrom)上相邻样本位置之间的距离,即 pos[i+1] - pos[i]。
通过对 chrom 的每个水平使用一个 for 循环,再对每一行使用另一个循环,我得到了预期的结果:
# For each chromosome level, sort that chromosome's rows by position and
# print: chromosome, row index, position, distance to the next position,
# and log10 of that distance. The last row of a chromosome has no successor,
# so pos[i + 1] is NA and both dist and logdist print as NA.
# Requires dplyr (filter, arrange) to be attached.
for (chrom_level in levels(df$chrom)) {  # not named `c`: avoids shadowing c()
  df_chrom <- filter(df, chrom == chrom_level)
  df_chrom <- arrange(df_chrom, pos)
  # seq_len() gives an empty sequence when a level has no rows, whereas
  # 1:nrow() would iterate over c(1, 0) and print spurious lines.
  for (i in seq_len(nrow(df_chrom))) {
    dist <- df_chrom$pos[i + 1] - df_chrom$pos[i]
    logdist <- log10(dist)
    cat(chrom_level, i, df_chrom$pos[i], dist, logdist, "\n")
  }
}
但是,我想将结果保存到数据框中,并认为 lapply 或 apply 是正确的方法。我无法弄清楚如何进行 pos[i+1] - pos[i] 的计算(因为 lapply 是对每一行/每一列分别起作用的)。
任何提示都将不胜感激。
以下是我的解决方案的输出:
chrom index pos dist log10dist
1 1 10 124 2.093422
1 2 134 66 1.819544
1 3 200 200 2.30103
1 4 400 200 2.30103
1 5 600 400 2.60206
1 6 1000 NA NA
2 1 20 13 1.113943
2 2 33 NA NA
3 1 40 5 0.69897
3 2 45 NA NA
4 1 50 5 0.69897
4 2 55 45 1.653213
4 3 100 23 1.361728
4 4 123 NA NA
答案 0 :(得分:1)
我们可以按组计算差值来做到这一点。先将 'data.frame' 转换为 'data.table'(setDT(df)),按 'pos' 排序(order),按 'chrom' 分组,然后计算 'pos' 的差值(diff)以及该差值的 log10:
library(data.table)
# Convert df to a data.table by reference.
setDT(df)
# With rows ordered by pos, build for every chrom group: the row index,
# the position, the gap to the next position and log10 of that gap.
# diff() returns one element fewer than its input, so each column is
# padded with a trailing NA.
df[order(pos),
   {
     gaps <- diff(pos)
     list(index = seq_len(.N), pos = pos,
          dist = c(gaps, NA), logdiff = c(log10(gaps), NA))
   },
   by = chrom]
# chrom index pos dist logdiff
# 1: 1 1 10 124 2.093422
# 2: 1 2 134 66 1.819544
# 3: 1 3 200 200 2.301030
# 4: 1 4 400 200 2.301030
# 5: 1 5 600 400 2.602060
# 6: 1 6 1000 NA NA
# 7: 2 1 20 13 1.113943
# 8: 2 2 33 NA NA
# 9: 3 1 40 5 0.698970
#10: 3 2 45 NA NA
#11: 4 1 50 5 0.698970
#12: 4 2 55 45 1.653213
#13: 4 3 100 23 1.361728
#14: 4 4 123 NA NA
运行OP代码后,打印的输出为
#1 1 10 124 2.093422
#1 2 134 66 1.819544
#1 3 200 200 2.30103
#1 4 400 200 2.30103
#1 5 600 400 2.60206
#1 6 1000 NA NA
#2 1 20 13 1.113943
#2 2 33 NA NA
#3 1 40 5 0.69897
#3 2 45 NA NA
#4 1 50 5 0.69897
#4 2 55 45 1.653213
#4 3 100 23 1.361728
#4 4 123 NA NA
答案 1 :(得分:1)
我们按 df$chrom 将 df 拆分(split)为若干子组(请注意,拆分之前我们先对 df 和 df$chrom 重新排序)。然后我们使用 lapply 遍历每个子组(在此示例中,子组称为 a)。在每个子组的 pos 列上,我们计算连续元素的差值(diff)并取 log10。由于 diff 会使元素数量减少 1,因此我们在末尾添加一个 NA。最后,我们使用 do.call 和 rbind
# Combine all per-chromosome subgroups into one data frame.
local({
  # Order rows by chromosome, then by position; reuse the same ordering for
  # the data and for the grouping factor so the two stay aligned.
  row_order <- order(df$chrom, df$pos)
  by_chrom <- split(df[row_order, ], df$chrom[row_order])
  with_dist <- lapply(by_chrom, function(a) {
    # diff() returns one element fewer than its input, so pad with NA.
    data.frame(a, dist = c(log10(diff(a$pos)), NA))
  })
  do.call(rbind, with_dist)
})
# chrom pos dist
#1.1 1 10 2.093422
#1.3 1 134 1.819544
#1.2 1 200 2.301030
#1.4 1 400 2.301030
#1.5 1 600 2.602060
#1.6 1 1000 NA
#2.7 2 20 1.113943
#2.8 2 33 NA
#3.9 3 40 0.698970
#3.10 3 45 NA
#4.11 4 50 0.698970
#4.12 4 55 1.653213
#4.13 4 100 1.361728
#4.14 4 123 NA
组合在一起。
-- For every (bucket_index, user_type, medium) combination, report the
-- average, maximum and minimum daily productivity.
--   bucket_index: the day's total docs_read divided by 100, floored.
--   productivity: docs_read summed per day/user_type/medium, divided by
--                 the number of distinct users in that group.
SELECT
    bucket_index, user_type, medium, AVG(productivity) as avg_prod,
    MAX(productivity) as max_prod, MIN(productivity) as min_prod
FROM
    (SELECT
        floor(t1.total_docs_read/100) as bucket_index,
        t2.user_type as user_type, t2.medium as medium,
        t2.productivity as productivity
    FROM
        -- t1: total documents read per day, across all users.
        (SELECT
            on_date::date as day, sum(docs_read) as total_docs_read
        FROM work
        GROUP BY day) as t1
    JOIN
        -- t2: per-day productivity for each user_type / medium.
        (SELECT
            on_date::date as day, user_type, medium,
            -- "user" must be double-quoted: unquoted, PostgreSQL treats
            -- user as the reserved word equivalent to current_user, so the
            -- distinct count would always be 1.
            -- NOTE(review): if docs_read is an integer type this is integer
            -- division and truncates; confirm whether that is intended.
            sum(docs_read)/count(distinct "user") as productivity
        FROM work
        GROUP BY day, user_type, medium) as t2
    ON t1.day = t2.day) as t3
GROUP BY bucket_index, user_type, medium