我有一个从 The Human Protein Atlas 下载的数据集,其中包含 12,004 种蛋白质的亚细胞定位注释。我手头的这个文件子集只包括“基因名称”一列,其后是 4 列,用于表示该定位的可靠性(基于细胞的免疫荧光染色)。论文中的可靠性排序为:Validated(已验证)> Supported(支持)> Approved(已批准)> Uncertain(不确定)。
我想出了一个评分系统,想把它应用到 LC-MS 谱图计数(spectral count)数据集上:1)按注释质量加权;2)对定位于多个位置的蛋白质进行惩罚。
TLDR是我需要计算以下数据集的每列中有多少项,并获取此信息的数据帧。
# Read the Protein Atlas subset from CSV into a data frame.
df <- read.csv("proteinAtlas.csv")
# Print a text representation of df that can be pasted to recreate it.
dput(df)
structure(list(Gene_symbol = structure(1:49, .Label = c("AAAS",
"AAMP", "AAR2", "AARD", "AARS", "AARS2", "AARSD1", "ABCA13",
"ABCB6", "ABCB7", "ABCB8", "ABCC1", "ABCC4", "ABCD3", "ABCE1",
"ABCF1", "ABCF2", "ABCF3", "ABHD10", "ABHD14B", "ABHD6", "ABI1",
"ABI2", "ABL2", "ACAA1", "ACAA2", "ACACA", "ACAD9", "ACADM",
"ACADS", "ACADVL", "ACAP1", "ACAP2", "ACAT1", "ACAT2", "ACBD3",
"ACBD5", "ACIN1", "ACLY", "ACO2", "ACOT1", "ACOT13", "ACOT2",
"ACOT7", "ACOT8", "ACOT9", "ACOX1", "ACP1", "ACP5"), class = "factor"),
Validated = structure(c(1L, 2L, 1L, 1L, 2L, 4L, 1L, 1L, 3L,
1L, 1L, 1L, 1L, 5L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
5L, 1L, 1L, 4L, 4L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 5L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 6L, 1L, 1L), .Label = c("", "Cytosol",
"Golgi apparatus", "Mitochondria", "Peroxisomes", "Vesicles"
), class = "factor"), Supported = structure(c(1L, 9L, 1L,
1L, 1L, 1L, 1L, 1L, 5L, 10L, 10L, 12L, 1L, 1L, 1L, 1L, 4L,
1L, 1L, 6L, 1L, 3L, 1L, 11L, 1L, 10L, 2L, 1L, 1L, 10L, 10L,
1L, 1L, 1L, 4L, 8L, 1L, 11L, 7L, 10L, 1L, 1L, 1L, 4L, 13L,
1L, 1L, 1L, 1L), .Label = c("", "Actin filaments;Cytosol",
"Cell Junctions;Plasma membrane", "Cytosol", "Cytosol;Mitochondria;Nucleoplasm;Plasma membrane",
"Cytosol;Nucleoli;Nucleus", "Cytosol;Nucleoplasm;Plasma membrane",
"Golgi apparatus", "Microtubules", "Mitochondria", "Nucleoplasm",
"Plasma membrane", "Vesicles"), class = "factor"), Approved = structure(c(3L,
1L, 5L, 12L, 1L, 1L, 6L, 4L, 1L, 1L, 17L, 1L, 8L, 1L, 1L,
1L, 1L, 7L, 13L, 1L, 16L, 1L, 15L, 1L, 1L, 1L, 14L, 1L, 1L,
15L, 17L, 18L, 11L, 1L, 17L, 1L, 1L, 1L, 1L, 1L, 13L, 2L,
13L, 15L, 13L, 9L, 17L, 10L, 5L), .Label = c("", "Cell Junctions",
"Centrosome;Cytosol;Nuclear membrane", "Centrosome;Cytosol;Vesicles",
"Cytosol", "Cytosol;Nuclear membrane", "Cytosol;Nucleoli",
"Cytosol;Nucleoli;Plasma membrane", "Cytosol;Nucleoplasm;Plasma membrane",
"Cytosol;Nucleus", "Endosomes", "Lipid droplets", "Mitochondria",
"Nucleoli fibrillar center", "Nucleoplasm", "Nucleoplasm;Vesicles",
"Nucleus", "Vesicles"), class = "factor"), Uncertain = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("", "Cytosol;Plasma membrane", "Nucleoli"
), class = "factor")), .Names = c("Gene_symbol", "Validated",
"Supported", "Approved", "Uncertain"), class = "data.frame", row.names = c(NA,
-49L))
所以理想的输出看起来像这个image of proposed scoring system,或者,如果你愿意,那么dput():
structure(list(Gene_symbol = structure(1:29, .Label = c("AAAS",
"AAMP", "AAR2", "AARD", "AARS", "AARS2", "AARSD1", "ABCA13",
"ABCB6", "ABCB7", "ABCB8", "ABCC1", "ABCC4", "ABCD3", "ABCE1",
"ABCF1", "ABCF2", "ABCF3", "ABHD10", "ABHD14B", "ABHD6", "ABI1",
"ABI2", "ABL2", "ACAA1", "ACAA2", "ACACA", "ACAD9", "ACADM"), class = "factor"),
Validated = c(NA, 1L, NA, NA, 1L, 1L, NA, NA, 1L, NA, NA,
NA, NA, 1L, 1L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA,
NA, 1L, 1L), Supported = c(NA, 1L, NA, NA, NA, NA, NA, NA,
4L, 1L, 1L, 1L, NA, NA, NA, NA, 1L, NA, NA, 3L, NA, 2L, NA,
1L, NA, 1L, 2L, NA, NA), Approved = c(3L, NA, 1L, 1L, NA,
NA, 2L, 3L, NA, NA, 1L, NA, 3L, NA, NA, NA, NA, 2L, 1L, NA,
2L, NA, 1L, NA, NA, NA, 1L, NA, NA), Uncertain = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("Gene_symbol",
"Validated", "Supported", "Approved", "Uncertain"), class = "data.frame", row.names = c(NA,
-29L))
在每一列的大部分内容中,各术语是由“;”分隔的字符串。然而,在某些情况下,术语本身包含空格,例如“Nucleoli fibrillar center”或“Lipid droplets”,这类术语应计为一个单词/术语。
我找到了figure的例子:
# Example: count the comma-separated terms in a single character string.
d <- "foo,bar,fun"
# strsplit() returns a list with one element per input string; take the first
# element and count its pieces (3 here).
length(strsplit(d, ",")[[1]])
# The input is a plain character vector, not a data.frame column.
class(d)
但这仅适用于“character”而不是“data.frame”。
有人可以建议如何在R中执行此操作吗? 非常感谢!
答案 0 :(得分:0)
我们可以使用 `str_count`。遍历除第一列之外的各列(`lapply(df[-1], ..)`),得到 `;` 的个数加 1,再检查空字符串的情况,并将这些元素替换为 `NA`。
library(stringr)
# For every column except the first, count the ";"-separated terms per cell
# (number of ";" plus one) and blank out cells that hold an empty string.
df[-1] <- lapply(df[-1], function(column) {
  n_terms <- str_count(column, ";") + 1
  is_blank <- as.character(column) == ""
  # Empty cells carry no annotation, so they become NA instead of 1.
  replace(n_terms, is_blank, NA)
})
答案 1 :(得分:0)
使用 base R 的解决方案:
# Base-R solution: count the ";"-separated terms in each annotation column.
#
# Works column-wise rather than with apply() over rows, because apply()
# coerces the data frame to a character matrix and would return the counts
# as strings. Empty (or missing) cells become NA instead of 0, since
# strsplit("", ";") yields a zero-length vector.
term_counts <- lapply(df[-1], function(column) {
  values <- as.character(column)
  n_terms <- lengths(strsplit(values, ";", fixed = TRUE))
  n_terms[is.na(values) | values == ""] <- NA_integer_
  n_terms
})
result_df <- data.frame(
  Gene_symbol = as.character(df[[1]]),
  term_counts,
  stringsAsFactors = FALSE
)
names(result_df) <- c(
  "Gene_symbol", "Validated", "Supported", "Approved", "Uncertain"
)