Question

我的任务是在按另一列“3”中的因子水平排序之后，提取列“2”中的所有值（具体来说，我是按生物分类对 fasta 序列进行归类）。我正在使用下面这段非常简单的代码来获得我需要的结果。

# Read the raw output; fill = TRUE pads short rows because the original
# output file includes many empty cells.
# df is available at the bottom of this post
df <- read.table("outfile.txt", fill = TRUE)

# splitting by factors: one data frame per distinct value of V3
list1 <- split(df, df$V3)

# extract all values in column 2 (V2) of each group as a one-column data frame
list2 <- lapply(list1, function(x) as.data.frame(x$V2))

# writing results to file: one "<V3 value>.txt" per group, values only.
# paste0() is used so the file name has no space before ".txt"
# (paste() with its default separator would produce names like "0 .txt").
for (x in names(list2)) {
  write.table(list2[[x]], file = paste0(x, ".txt"),
              quote = FALSE, row.names = FALSE, col.names = FALSE)
}

对于小的 df，这段代码效果很好。但是，实际的输出文件包含数 GB 的数据。我在本地机器（内存为 8GB）上尝试了一个 500,000 行的子集，但是第二个命令非常慢（或者 R 直接挂起）。所以我想请教社区，是否有更好的方法来解决这个问题。谢谢。

这是df：

dput(df)
structure(list(V1 = structure(c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 
1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 
1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 
2L, 1L, 1L, 1L, 1L, 1L), .Label = c("C", "U"), class = "factor"), 
    V2 = structure(c(10L, 2L, 27L, 29L, 25L, 32L, 28L, 39L, 40L, 
    22L, 8L, 7L, 19L, 38L, 15L, 3L, 16L, 26L, 34L, 13L, 17L, 
    18L, 14L, 41L, 44L, 12L, 45L, 46L, 5L, 1L, 31L, 4L, 37L, 
    11L, 43L, 20L, 21L, 30L, 23L, 35L, 24L, 42L, 9L, 33L, 36L, 
    6L), .Label = c("M02978:20:000000000-B8C4P:1:1101:11008:4137", 
    "M02978:20:000000000-B8C4P:1:1101:14389:3444", "M02978:20:000000000-B8C4P:1:1101:14986:3769", 
    "M02978:20:000000000-B8C4P:1:1101:15333:4161", "M02978:20:000000000-B8C4P:1:1101:15438:4092", 
    "M02978:20:000000000-B8C4P:1:1101:15516:4514", "M02978:20:000000000-B8C4P:1:1101:16313:3660", 
    "M02978:20:000000000-B8C4P:1:1101:16433:3650", "M02978:20:000000000-B8C4P:1:1101:16663:4462", 
    "M02978:20:000000000-B8C4P:1:1101:17179:3407", "M02978:20:000000000-B8C4P:1:1101:17779:4225", 
    "M02978:20:000000000-B8C4P:1:1101:18008:3981", "M02978:20:000000000-B8C4P:1:1101:18047:3851", 
    "M02978:20:000000000-B8C4P:1:1101:18920:3936", "M02978:20:000000000-B8C4P:1:1101:19086:3737", 
    "M02978:20:000000000-B8C4P:1:1101:19203:3783", "M02978:20:000000000-B8C4P:1:1101:19335:3908", 
    "M02978:20:000000000-B8C4P:1:1101:19520:3921", "M02978:20:000000000-B8C4P:1:1101:19612:3701", 
    "M02978:20:000000000-B8C4P:1:1101:19655:4289", "M02978:20:000000000-B8C4P:1:1101:19918:4313", 
    "M02978:20:000000000-B8C4P:1:1101:20321:3602", "M02978:20:000000000-B8C4P:1:1101:21089:4350", 
    "M02978:20:000000000-B8C4P:1:1101:22293:4406", "M02978:20:000000000-B8C4P:1:1101:22453:3490", 
    "M02978:20:000000000-B8C4P:1:1101:23026:3811", "M02978:20:000000000-B8C4P:1:1101:23065:3472", 
    "M02978:20:000000000-B8C4P:1:1101:23770:3507", "M02978:20:000000000-B8C4P:1:1101:23991:3472", 
    "M02978:20:000000000-B8C4P:1:1101:24290:4332", "M02978:20:000000000-B8C4P:1:1101:24415:4142", 
    "M02978:20:000000000-B8C4P:1:1101:25066:3498", "M02978:20:000000000-B8C4P:1:1101:25678:4466", 
    "M02978:20:000000000-B8C4P:1:1101:25992:3830", "M02978:20:000000000-B8C4P:1:1101:26431:4388", 
    "M02978:20:000000000-B8C4P:1:1101:26573:4479", "M02978:20:000000000-B8C4P:1:1101:5567:4179", 
    "M02978:20:000000000-B8C4P:1:1101:6323:3723", "M02978:20:000000000-B8C4P:1:1101:6675:3536", 
    "M02978:20:000000000-B8C4P:1:1101:6868:3559", "M02978:20:000000000-B8C4P:1:1101:7078:3965", 
    "M02978:20:000000000-B8C4P:1:1101:8145:4431", "M02978:20:000000000-B8C4P:1:1101:8449:4257", 
    "M02978:20:000000000-B8C4P:1:1101:8592:3966", "M02978:20:000000000-B8C4P:1:1101:9468:4026", 
    "M02978:20:000000000-B8C4P:1:1101:9970:4051"), class = "factor"), 
    V3 = c(926550L, 0L, 1121396L, 406818L, 1265505L, 1167006L, 
    1121399L, 0L, 177437L, 0L, 1536652L, 0L, 1196029L, 0L, 1178540L, 
    138119L, 0L, 1536652L, 186802L, 0L, 1322246L, 1232437L, 1196029L, 
    1121396L, 452637L, 0L, 0L, 0L, 1541959L, 1121403L, 96561L, 
    1167006L, 767528L, 0L, 0L, 653733L, 1423815L, 857293L, 0L, 
    0L, 0L, 468059L, 1167006L, 1232437L, 880073L, 761193L), V4 = c(171L, 
    NA, 264L, 88L, 356L, 257L, 128L, NA, 97L, NA, 243L, NA, 96L, 
    NA, 80L, 93L, NA, 138L, 155L, NA, 243L, 262L, 77L, 470L, 
    135L, NA, NA, NA, 124L, 161L, 211L, 202L, 91L, NA, NA, 146L, 
    98L, 93L, NA, NA, NA, 107L, 382L, 247L, 130L, 157L), V5 = structure(c(25L, 
    1L, 2L, 17L, 9L, 5L, 3L, 1L, 16L, 1L, 14L, 1L, 7L, 1L, 6L, 
    11L, 1L, 14L, 24L, 1L, 10L, 8L, 7L, 2L, 18L, 1L, 1L, 1L, 
    15L, 4L, 26L, 5L, 13L, 1L, 1L, 20L, 12L, 22L, 1L, 1L, 1L, 
    19L, 5L, 8L, 23L, 21L), .Label = c("", "1121396,", "1121399,", 
    "1121403,", "1167006,", "1178540,", "1196029,", "1232437,", 
    "1265505,", "1322246,", "138119,", "1423815,", "1460634,1460635,", 
    "1536652,", "1541959,", "177437,", "406818,", "452637,", 
    "468059,", "653733,", "761193,", "857293,", "880073,", "883109,888727,1161902,1230734,1392487,", 
    "926550,", "96561,"), class = "factor")), .Names = c("V1", 
"V2", "V3", "V4", "V5"), class = "data.frame", row.names = c(NA, 
-46L))

Answer 1

将 data.table 包与 write.table 结合使用。

按 V3 排序，然后为 V3 中的每个组分别写出 V2 列。

library(data.table)

# Convert df to a data.table, order the rows by V3, then for each V3 group
# write that group's V2 values to "<V3 value>.txt".
# quote / row.names / col.names are switched off so each file holds only the
# bare V2 values, one per line; write.table's defaults would add a header
# line, row numbers and quotes around the values.
setDT(df)[
  order(V3),
  write.table(V2, file = paste0(V3, ".txt"),
              quote = FALSE, row.names = FALSE, col.names = FALSE),
  by = V3
]

Answer 2

这对我有效，但我无法确定它在你的机器上会有多快。

# Write the V2 values of each V3 group to "<V3 value>.txt", one value per line.
# Splitting only the V2 vector groups the data in a single pass, instead of
# re-scanning df$V3 and copying whole rows of df once per distinct V3 value.
# drop = TRUE skips factor levels that have no rows, so no empty files appear.
v2_by_group <- split(df$V2, df$V3, drop = TRUE)
for (group in names(v2_by_group)) {
  write.table(v2_by_group[[group]], file = paste0(group, ".txt"),
              quote = FALSE, row.names = FALSE, col.names = FALSE)
}

加速循环（从数据框中提取特定值）

2 个答案: