我的朋友今天给了我一个文件,看起来像这样:
ID genotype snp.id
1 PT86 CA 192902098
2 PT8 CA 192902098
3 PT33 TC 191571437
4 PT27 GA 191026838
5 PT2 TG 188482874
6 PT1 GC 186443061
7 PT70 GC 186443061
8 PT59 GA 185444226
9 PT48 GA 185152161
10 PT54 GA 185152161
11 PT18 GA 185152161
12 PT27 GA 185152161
实际上数据有近1000行,我在这里只展示了12行的样本。
他问我是否可以将此文件转换为如下格式:
id rs185152161 rs185444226 rs186443061 rs188482874 rs191026838 rs191571437 rs192902098
1 PT1 <NA> <NA> GC <NA> <NA> <NA> <NA>
2 PT18 GA <NA> <NA> <NA> <NA> <NA> <NA>
3 PT2 <NA> <NA> <NA> TG <NA> <NA> <NA>
依此类推……
这就是我这样做的方式。
首先,我获取了需要提取的 snp.id 列表:
# Distinct SNP ids present in `raw`: the first column of the frequency table
# of `raw$snp.id` (the counts in the second column are not used).
snp.ids <- data.frame(table(raw$snp.id))[[1L]]
然后我用每个snp.id提取数据的子集并放在列表矩阵中。
# Build a one-column list-matrix with one cell per SNP id.  Each cell holds
# the first two columns of `raw` (ID and genotype in the sample file) for the
# rows observed at that SNP, renamed to `id` and `rs<snp.id>` so the pieces
# can later be merged on `id`.
mat <- matrix(list(), ncol = 1, nrow = length(snp.ids))
for (i in seq_along(snp.ids)) {
  # Use `snp.ids`, the vector created above; the previous `snpids` was never
  # defined and made the loop fail on its first iteration.
  mat[[i, 1]] <- subset(raw, snp.id == snp.ids[[i]])[, 1:2]
  names(mat[[i, 1]]) <- c("id", paste0("rs", snp.ids[[i]]))
}
然后我合并了我提取的所有数据帧。
# Outer-join every per-SNP data frame stored in `mat` on the shared `id`
# column.  Empty (NULL) cells are skipped, so the number of SNPs no longer
# has to be exactly seven.
df1 <- Reduce(
  function(x, y) merge(x, y, all = TRUE),
  Filter(Negate(is.null), mat[, 1])
)
# Keep only the first row for each id.
df2 <- df1[!duplicated(df1$id), ]
所以数据看起来像
id rs185152161 rs185444226 rs186443061 rs188482874 rs191026838 rs191571437 rs192902098
1 PT1 <NA> <NA> GC <NA> <NA> <NA> <NA>
2 PT18 GA <NA> <NA> <NA> <NA> <NA> <NA>
3 PT2 <NA> <NA> <NA> TG <NA> <NA> <NA>
4 PT27 GA <NA> <NA> <NA> GA <NA> <NA>
5 PT33 <NA> <NA> <NA> <NA> <NA> TC <NA>
6 PT48 GA <NA> <NA> <NA> <NA> <NA> <NA>
7 PT54 GA <NA> <NA> <NA> <NA> <NA> <NA>
8 PT59 <NA> GA <NA> <NA> <NA> <NA> <NA>
9 PT70 <NA> <NA> GC <NA> <NA> <NA> <NA>
10 PT8 <NA> <NA> <NA> <NA> <NA> <NA> CA
11 PT86 <NA> <NA> <NA> <NA> <NA> <NA> CA
我想知道在不使用这些循环函数的情况下是否有更好的方法可以做到这一点?
答案 0 :(得分:2)
尝试以下方法(`dat` 是数据集):
library(reshape2)
# Cast long -> wide: one row per ID, one column per snp.id, with the genotype
# as the cell value.
res <- dcast(data = dat, formula = ID ~ snp.id, value.var = "genotype")
# Prefix every column except the leading ID column with "rs".
names(res)[-1] <- sprintf("rs%s", names(res)[-1])
# Show the first three rows of the result.
head(res, n = 3)
# ID rs185152161 rs185444226 rs186443061 rs188482874 rs191026838 rs191571437
#1 PT1 <NA> <NA> GC <NA> <NA> <NA>
#2 PT18 GA <NA> <NA> <NA> <NA> <NA>
#3 PT2 <NA> <NA> <NA> TG <NA> <NA>
# rs192902098
#1 <NA>
#2 <NA>
#3 <NA>
# Example data in long format: one row per (ID, genotype, snp.id) observation.
# Row names are the character strings "1" to "12".
dat <- data.frame(
  ID = c(
    "PT86", "PT8", "PT33", "PT27", "PT2", "PT1",
    "PT70", "PT59", "PT48", "PT54", "PT18", "PT27"
  ),
  genotype = c(
    "CA", "CA", "TC", "GA", "TG", "GC",
    "GC", "GA", "GA", "GA", "GA", "GA"
  ),
  snp.id = c(
    192902098L, 192902098L, 191571437L, 191026838L,
    188482874L, 186443061L, 186443061L, 185444226L,
    185152161L, 185152161L, 185152161L, 185152161L
  ),
  row.names = as.character(1:12),
  stringsAsFactors = FALSE
)
如果您的数据中同一个 ID 与 snp.id 的组合有多个条目:
# Example data with a repeated ID/snp.id pair: PT27 occurs twice at
# snp.id 185152161 (rows 12 and 13).  Row names are "1" to "13".
dat <- data.frame(
  ID = c(
    "PT86", "PT8", "PT33", "PT27", "PT2", "PT1", "PT70",
    "PT59", "PT48", "PT54", "PT18", "PT27", "PT27"
  ),
  genotype = c(
    "CA", "CA", "TC", "GA", "TG", "GC", "GC",
    "GA", "GA", "GA", "GA", "GA", "GC"
  ),
  snp.id = c(
    192902098L, 192902098L, 191571437L, 191026838L,
    188482874L, 186443061L, 186443061L, 185444226L,
    185152161L, 185152161L, 185152161L, 185152161L, 185152161L
  ),
  row.names = as.character(1:13),
  stringsAsFactors = FALSE
)
# Reproduces the problem: with a repeated ID/snp.id pair and no aggregation
# function supplied, dcast() prints the message
#   Aggregation function missing: defaulting to length
dcast(data = dat, formula = ID ~ snp.id, value.var = "genotype")
#----------------------------------
为每个 ID 内的行创建一个序号 `indx`,使每个组合唯一:
# Running counter within each ID (1, 2, ... in row order), so every row of
# `dat` has its own ID/indx pair.
dat$indx <- ave(seq_along(dat$ID), dat$ID, FUN = seq_along)
# Cast on ID + indx, then drop the helper `indx` column, which is the second
# column of the cast result.
dcast(
  data = dat,
  formula = ID + indx ~ snp.id,
  value.var = "genotype"
)[, -2]
答案 1 :(得分:0)
从长格式转换为宽格式,始终可以使用基础 R 自带的 `reshape()` 函数。
# reshape() ships with base R (package stats), so no add-on package has to be
# loaded for it.  `dat` is the long-format data frame with the columns ID,
# genotype and snp.id.  The result has one row per ID and one
# `genotype.<snp.id>` column per distinct snp.id.
stats::reshape(dat, idvar = "ID", timevar = "snp.id", direction = "wide")
导致:
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| ID | genotype.192902098 | genotype.191571437 | genotype.191026838 | genotype.188482874 | genotype.186443061 | genotype.185444226 | genotype.185152161 |
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| PT86 | CA | NA | NA | NA | NA | NA | NA |
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| PT8 | CA | NA | NA | NA | NA | NA | NA |
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| PT33 | NA | TC | NA | NA | NA | NA | NA |
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| PT27 | NA | NA | GA | NA | NA | NA | GA |
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| PT2 | NA | NA | NA | TG | NA | NA | NA |
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| PT1 | NA | NA | NA | NA | GC | NA | NA |
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| PT70 | NA | NA | NA | NA | GC | NA | NA |
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| PT59 | NA | NA | NA | NA | NA | GA | NA |
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| PT48 | NA | NA | NA | NA | NA | NA | GA |
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| PT54 | NA | NA | NA | NA | NA | NA | GA |
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| PT18 | NA | NA | NA | NA | NA | NA | GA |
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+