我有一个包含6000个基因的数据集。数据中有6种不同的特征(feature)A、B、C、D、E、F,每种特征下的各行都有唯一的位置编号(pos)。我想把特征为A的各行的基因列除以87,把特征为B的各行的基因列除以54。
最后,我希望在新列中得到每行的总和与平均值。我该如何在R中做到这一点?
/**
 * Appends a day name to {@code meetDays} for each day digit in {@code input},
 * then trims surrounding whitespace from the result.
 *
 * Visible mappings: '1' appends "Sunday " and '2' appends "Monday "; the
 * remaining days are elided in this snippet. {@code meetDays} is not reset
 * here, so names are appended to whatever value it already holds.
 *
 * @param input string whose characters are read one by one as day codes
 */
private void setDays(String input) {
// Examine every character of the input in order.
for (int i=0; i<input.length(); i++) {
switch (input.charAt(i)) {
case '1':
meetDays += "Sunday ";
break;
case '2':
meetDays += "Monday ";
break;
// and so on for the remaining days
default:
// NOTE(review): this branch is empty as written, so unrecognized
// characters are silently ignored until the exception is added.
//throw the exception here
}
}
// Drop the trailing space left by the last appended day name.
meetDays = meetDays.trim();
}
输入文件
feature_A=87
feature_B=54
输入数据(原始数值,尚未做除法;期望的输出是在此基础上做除法并加上 sum_all 和 average_all 两列)
feature pos gene_1 gene_2 gene_3 gene_n
A 1 6 2 51 0
A 2 4 5 8 2
A 3 1 74 5 0
B 1 11 2 41 89
B 2 4 5 3 5
答案 0 :(得分:4)
通过将除数合并到主数据集中可以更容易实现这一点:
# Lookup table: one divisor per feature.
feat_div <- data.frame(feature = c("A", "B"), value = c(87, 54))
#  feature value
#1       A    87
#2       B    54
# Select the gene columns by NAME. A logical mask computed on the original
# column layout stops lining up as soon as columns are added or reordered.
cols <- grep("^gene_", names(dat), value = TRUE)
# Look the divisor up with match() rather than merge(): this keeps every row
# in its original order. Features without a divisor get NA (and therefore NA
# results) instead of being silently dropped by an inner join; to leave such
# rows undivided instead, replace the NA divisors with 1.
dat$value <- feat_div$value[match(dat$feature, feat_div$feature)]
# Divide each gene column by the per-row divisor (vectorized over rows).
dat[cols] <- lapply(dat[cols], `/`, dat$value)
# Row-wise sum and mean over the gene columns only.
dat$sum_all <- rowSums(dat[cols])
dat$average_all <- rowMeans(dat[cols])
# feature pos gene_1 gene_2 gene_3 gene_n value sum_all average_all
#1 A 1 0.06896552 0.02298851 0.58620690 0.00000000 87 0.6781609 0.1695402
#2 A 2 0.04597701 0.05747126 0.09195402 0.02298851 87 0.2183908 0.0545977
#3 A 3 0.01149425 0.85057471 0.05747126 0.00000000 87 0.9195402 0.2298851
#4 B 1 0.20370370 0.03703704 0.75925926 1.64814815 54 2.6481481 0.6620370
#5 B 2 0.07407407 0.09259259 0.05555556 0.09259259 54 0.3148148 0.0787037
答案 1 :(得分:1)
如果用一个查找表(lookup table)来索引每个特征对应的除数,dplyr 也可以实现你想要的效果:
library(dplyr)
# make a lookup vector: names are the features, values are the divisors
feat_num <- c(A = 87, B = 54)
feat_num
##
##  A  B
## 87 54
# pick the gene columns by name so feature and pos are never divided
gene_cols <- grep("^gene_", names(df), value = TRUE)
df %>%
  # divide each gene column by the number looked up from feat_num;
  # as.character() matters: indexing with a factor would use its integer
  # codes, not its labels, and only works by luck when the level order
  # happens to match the order of feat_num
  mutate(across(all_of(gene_cols),
                function(x) x / unname(feat_num[as.character(feature)]))) %>%
  # add row sum and mean columns, computed over the gene columns only
  mutate(sum_all = rowSums(.[gene_cols]),
         average_all = rowMeans(.[gene_cols]))
##
## Source: local data frame [5 x 8]
##
## feature pos gene_1 gene_2 gene_3 gene_n sum_all average_all
## (fctr) (int) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 A 1 0.06896552 0.02298851 0.58620690 0.00000000 0.6781609 0.1695402
## 2 A 2 0.04597701 0.05747126 0.09195402 0.02298851 0.2183908 0.0545977
## 3 A 3 0.01149425 0.85057471 0.05747126 0.00000000 0.9195402 0.2298851
## 4 B 1 0.20370370 0.03703704 0.75925926 1.64814815 2.6481481 0.6620370
## 5 B 2 0.07407407 0.09259259 0.05555556 0.09259259 0.3148148 0.0787037
答案 2 :(得分:0)
首先想到的是将数据拆分为两个数据框,分别计算数值,然后再重新组合。假设您的原始数据框为 df:
# Columns to rescale, selected by name so feature and pos are left untouched.
gene_cols <- grep("^gene_", names(df), value = TRUE)
# Split by feature and divide only the gene columns. Working on the data
# frames directly (instead of apply(), which returns a matrix) keeps the
# result a data.frame, so the `$<-` assignments below work.
df_A <- df[df$feature == "A", ]
df_A[gene_cols] <- df_A[gene_cols] / 87
df_B <- df[df$feature == "B", ]
df_B[gene_cols] <- df_B[gene_cols] / 54
# Recombine; the feature column travels with its rows, so no cbind() of the
# original (possibly differently ordered) df$feature is needed afterwards.
df2 <- rbind(df_A, df_B)
# Sum and mean over the gene columns only, so avg_all does not include sum_all.
df2$sum_all <- rowSums(df2[gene_cols])
df2$avg_all <- rowMeans(df2[gene_cols])
这应该会把特征为 'A' 的各行的每一列除以 87,把特征为 'B' 的各行的每一列除以 54,然后重新合并这些行,并求出每行的总和与平均值。如果有任何问题请告诉我,因为我现在无法测试。
编辑:看起来其他答案更好,imo。
答案 3 :(得分:0)
以下是使用 data.table 的方法。先把各个 feature 的除数向量放进一个 list,用 stack 转成 data.frame('d2'),然后修改列名,并用 transform 去掉 'feature' 列中的前缀 feature_。接着用 setDT(d1) 把原始数据集('d1')转换为 data.table,并把各 'gene' 列的类型从 integer 改为 numeric。然后以 'feature' 列为键(on)把 'd1' 与 'd2' 连接,按 by = .EACHI 分组,遍历 .SDcols 中指定的列,将它们除以 'value',用 Reduce(...) 得到按行的总和('x2'),再算出平均值('x3'),最后用 := 把结果赋给指定的列名。
library(data.table)
# Build the divisor lookup table 'd2' with columns feature and value.
# mget() collects the variables named feature_A and feature_B into a named
# list (assumes both already exist in the calling environment, e.g.
# feature_A <- 87; feature_B <- 54 -- TODO confirm), stack() turns that list
# into a two-column data.frame, [2:1] swaps the column order, setNames()
# renames the columns, and sub() strips everything up to the underscore so
# that only the feature letter remains.
d2 <- transform(setNames(stack(mget(paste("feature", c("A", "B"),
sep="_")))[2:1], c("feature", "value")),
feature = sub(".*_", "", feature))
# Names of the columns to rescale: every column whose name starts with "gene".
nm1 <- grep("^gene", names(d1), value = TRUE)
# Convert d1 to a data.table by reference and cast the gene columns to
# numeric so the fractional results of the division can be stored in them.
setDT(d1)[, (nm1):= lapply(.SD, as.numeric), .SDcols = nm1]
# Update join: for each row of d2 (by = .EACHI), the rows of d1 with the same
# feature are modified in place; the trailing [] prints the updated table.
d1[d2, c(nm1, "sum_all", "average_all") :={
# x1: each gene column divided by this feature's divisor
x1 <- lapply(.SD, `/`, value)
# x2: row-wise sum, obtained by adding the divided columns together
x2 <- Reduce(`+`, x1)
# x3: row-wise mean = sum / number of gene columns
x3 <- x2/length(nm1)
# Return the values in the same order as the names on the left of `:=`
c(x1, list(x2, x3))} ,
on = "feature", by = .EACHI,.SDcols = nm1][]
# feature pos gene_1 gene_2 gene_3 gene_n sum_all average_all
#1: A 1 0.06896552 0.02298851 0.58620690 0.00000000 0.6781609 0.1695402
#2: A 2 0.04597701 0.05747126 0.09195402 0.02298851 0.2183908 0.0545977
#3: A 3 0.01149425 0.85057471 0.05747126 0.00000000 0.9195402 0.2298851
#4: B 1 0.20370370 0.03703704 0.75925926 1.64814815 2.6481481 0.6620370
#5: B 2 0.07407407 0.09259259 0.05555556 0.09259259 0.3148148 0.0787037
# Reproducible example data: five rows (features A and B) with an integer
# position column and four integer gene columns.
d1 <- data.frame(
  feature = c("A", "A", "A", "B", "B"),
  pos = c(1L, 2L, 3L, 1L, 2L),
  gene_1 = c(6L, 4L, 1L, 11L, 4L),
  gene_2 = c(2L, 5L, 74L, 2L, 5L),
  gene_3 = c(51L, 8L, 5L, 41L, 3L),
  gene_n = c(0L, 2L, 0L, 89L, 5L),
  stringsAsFactors = FALSE
)
答案 4 :(得分:0)
您可以将每列除以它的平均值;如果想除以某个特定的数字,请把 mean(x) 替换为该数字,或者替换为 median(x)、sd(x) 等。
# Load the built-in example data and look at the first rows.
data(mtcars)
head(mtcars)
# Divide every column by its own mean. Assigning into `mtcars[]` keeps the
# object a data.frame with its original row and column names.
col_means <- lapply(mtcars, mean)
mtcars[] <- Map(function(column, centre) column / centre, mtcars, col_means)
同样地,您也可以像下面这样只对单个列执行此操作:
# Load the built-in example data and look at the first rows.
data(mtcars)
head(mtcars)
# Divide the column as a whole vector. lapply() over mtcars$mpg would visit
# each value on its own, so mean(x) would equal x, every result would be 1,
# and mpg would become a list column instead of a numeric one.
mtcars$mpg <- mtcars$mpg / mean(mtcars$mpg)