我的任务是:先按第 3 列(V3)的因子水平分组,再提取第 2 列(V2)中的所有值(背景:我想按生物分类对 fasta 序列进行归类)。我正在使用下面这段非常简单的代码来得到我需要的结果。
# Read the output file; fill = TRUE pads short rows because the original
# output file includes many empty cells.
# (df is available at the bottom of this post.)
df <- read.table("outfile.txt", fill = TRUE)

# Group only the column 2 values by the column 3 key. Splitting the single
# vector avoids copying every column of df into per-group data frames.
ids_by_group <- split(df$V2, df$V3)

# Write one file per group, one value per line. paste0() is used because
# paste() separates its arguments with a space by default, which produced
# file names of the form "<key> .txt".
for (key in names(ids_by_group)) {
  write.table(
    ids_by_group[[key]],
    file = paste0(key, ".txt"),
    quote = FALSE, row.names = FALSE, col.names = FALSE
  )
}
对于小的 df,这段代码运行良好。但是,实际的输出文件包含数 GB 的数据。我在本地机器(8 GB 内存)上用一个 500,000 行的子集试过,但第二条命令非常慢(或者 R 干脆卡死)。所以想请教社区,是否有更好的方法来解决这个问题。谢谢。
这是df:
dput(df)
structure(list(V1 = structure(c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L), .Label = c("C", "U"), class = "factor"),
V2 = structure(c(10L, 2L, 27L, 29L, 25L, 32L, 28L, 39L, 40L,
22L, 8L, 7L, 19L, 38L, 15L, 3L, 16L, 26L, 34L, 13L, 17L,
18L, 14L, 41L, 44L, 12L, 45L, 46L, 5L, 1L, 31L, 4L, 37L,
11L, 43L, 20L, 21L, 30L, 23L, 35L, 24L, 42L, 9L, 33L, 36L,
6L), .Label = c("M02978:20:000000000-B8C4P:1:1101:11008:4137",
"M02978:20:000000000-B8C4P:1:1101:14389:3444", "M02978:20:000000000-B8C4P:1:1101:14986:3769",
"M02978:20:000000000-B8C4P:1:1101:15333:4161", "M02978:20:000000000-B8C4P:1:1101:15438:4092",
"M02978:20:000000000-B8C4P:1:1101:15516:4514", "M02978:20:000000000-B8C4P:1:1101:16313:3660",
"M02978:20:000000000-B8C4P:1:1101:16433:3650", "M02978:20:000000000-B8C4P:1:1101:16663:4462",
"M02978:20:000000000-B8C4P:1:1101:17179:3407", "M02978:20:000000000-B8C4P:1:1101:17779:4225",
"M02978:20:000000000-B8C4P:1:1101:18008:3981", "M02978:20:000000000-B8C4P:1:1101:18047:3851",
"M02978:20:000000000-B8C4P:1:1101:18920:3936", "M02978:20:000000000-B8C4P:1:1101:19086:3737",
"M02978:20:000000000-B8C4P:1:1101:19203:3783", "M02978:20:000000000-B8C4P:1:1101:19335:3908",
"M02978:20:000000000-B8C4P:1:1101:19520:3921", "M02978:20:000000000-B8C4P:1:1101:19612:3701",
"M02978:20:000000000-B8C4P:1:1101:19655:4289", "M02978:20:000000000-B8C4P:1:1101:19918:4313",
"M02978:20:000000000-B8C4P:1:1101:20321:3602", "M02978:20:000000000-B8C4P:1:1101:21089:4350",
"M02978:20:000000000-B8C4P:1:1101:22293:4406", "M02978:20:000000000-B8C4P:1:1101:22453:3490",
"M02978:20:000000000-B8C4P:1:1101:23026:3811", "M02978:20:000000000-B8C4P:1:1101:23065:3472",
"M02978:20:000000000-B8C4P:1:1101:23770:3507", "M02978:20:000000000-B8C4P:1:1101:23991:3472",
"M02978:20:000000000-B8C4P:1:1101:24290:4332", "M02978:20:000000000-B8C4P:1:1101:24415:4142",
"M02978:20:000000000-B8C4P:1:1101:25066:3498", "M02978:20:000000000-B8C4P:1:1101:25678:4466",
"M02978:20:000000000-B8C4P:1:1101:25992:3830", "M02978:20:000000000-B8C4P:1:1101:26431:4388",
"M02978:20:000000000-B8C4P:1:1101:26573:4479", "M02978:20:000000000-B8C4P:1:1101:5567:4179",
"M02978:20:000000000-B8C4P:1:1101:6323:3723", "M02978:20:000000000-B8C4P:1:1101:6675:3536",
"M02978:20:000000000-B8C4P:1:1101:6868:3559", "M02978:20:000000000-B8C4P:1:1101:7078:3965",
"M02978:20:000000000-B8C4P:1:1101:8145:4431", "M02978:20:000000000-B8C4P:1:1101:8449:4257",
"M02978:20:000000000-B8C4P:1:1101:8592:3966", "M02978:20:000000000-B8C4P:1:1101:9468:4026",
"M02978:20:000000000-B8C4P:1:1101:9970:4051"), class = "factor"),
V3 = c(926550L, 0L, 1121396L, 406818L, 1265505L, 1167006L,
1121399L, 0L, 177437L, 0L, 1536652L, 0L, 1196029L, 0L, 1178540L,
138119L, 0L, 1536652L, 186802L, 0L, 1322246L, 1232437L, 1196029L,
1121396L, 452637L, 0L, 0L, 0L, 1541959L, 1121403L, 96561L,
1167006L, 767528L, 0L, 0L, 653733L, 1423815L, 857293L, 0L,
0L, 0L, 468059L, 1167006L, 1232437L, 880073L, 761193L), V4 = c(171L,
NA, 264L, 88L, 356L, 257L, 128L, NA, 97L, NA, 243L, NA, 96L,
NA, 80L, 93L, NA, 138L, 155L, NA, 243L, 262L, 77L, 470L,
135L, NA, NA, NA, 124L, 161L, 211L, 202L, 91L, NA, NA, 146L,
98L, 93L, NA, NA, NA, 107L, 382L, 247L, 130L, 157L), V5 = structure(c(25L,
1L, 2L, 17L, 9L, 5L, 3L, 1L, 16L, 1L, 14L, 1L, 7L, 1L, 6L,
11L, 1L, 14L, 24L, 1L, 10L, 8L, 7L, 2L, 18L, 1L, 1L, 1L,
15L, 4L, 26L, 5L, 13L, 1L, 1L, 20L, 12L, 22L, 1L, 1L, 1L,
19L, 5L, 8L, 23L, 21L), .Label = c("", "1121396,", "1121399,",
"1121403,", "1167006,", "1178540,", "1196029,", "1232437,",
"1265505,", "1322246,", "138119,", "1423815,", "1460634,1460635,",
"1536652,", "1541959,", "177437,", "406818,", "452637,",
"468059,", "653733,", "761193,", "857293,", "880073,", "883109,888727,1161902,1230734,1392487,",
"926550,", "96561,"), class = "factor")), .Names = c("V1",
"V2", "V3", "V4", "V5"), class = "data.frame", row.names = c(NA,
-46L))
答案 0 :(得分:2)
将 data.table 包与 write.table 结合使用。
先按 V3 排序,然后为 V3 中的每个组分别写出 V2 列。
library(data.table)

# Convert df to a data.table, order the rows by V3, then for each V3 group
# write that group's V2 values to "<V3>.txt". Inside j the grouping column V3
# is the single group key, so paste0() yields one file name per group.
# quote/row.names/col.names are switched off so each file holds only the bare
# V2 values, one per line; with the defaults write.table() would also emit a
# header line, row names and quotes around the values.
setDT(df)[
  order(V3),
  write.table(
    V2,
    file = paste0(V3, ".txt"),
    quote = FALSE, row.names = FALSE, col.names = FALSE
  ),
  by = V3
]
答案 1 :(得分:1)
这对我有效,但我无法判断它在你的机器上会有多快。
# Write the V2 values of every V3 group to "<V3>.txt", one value per line.
# split() partitions V2 in a single pass instead of rescanning V3 and
# row-subsetting the whole data frame once per distinct key.
# drop = TRUE skips unused factor levels so no empty files are created.
# NOTE(review): split() drops rows whose V3 is NA, so no "NA.txt" is written.
v2_by_v3 <- split(df$V2, df$V3, drop = TRUE)

# A plain loop is used because only the file-writing side effect matters;
# lapply() would build and print a list of NULLs.
for (key in names(v2_by_v3)) {
  write.table(
    v2_by_v3[[key]],
    file = paste0(key, ".txt"),
    quote = FALSE, row.names = FALSE, col.names = FALSE
  )
}