这也许是一个幼稚的问题:我下载了一批不同生物的基因组组装(assembly)文件,目录结构如下:
-Parent_folder
--Genus_species_1
---genome_filename_1
---genome_filename_2
---genome_filename_n
--Genus_species_2
---genome_filename_1
---genome_filename_2
---genome_filename_n
--Genus_species_N
---genome_filename_1
---genome_filename_2
---genome_filename_n
我想创建一个表,其中第一列是物种名称,第二列是基因组组装文件的文件名。类似这样:
column1 | column2
Genus_species_1 | genome_filename_1
Genus_species_1 | genome_filename_2
Genus_species_1 | genome_filename_n
Genus_species_2 | genome_filename_1
Genus_species_2 | genome_filename_2
Genus_species_2 | genome_filename_n
Genus_species_N | genome_filename_1
Genus_species_N | genome_filename_2
Genus_species_N | genome_filename_n
我尝试了很多方法,但不知道下面这段代码有什么问题:
# Goal of this script: build two parallel vectors, `genomes` (genome file
# names) and `species` (the folder each file came from), one entry per file.
# List the folders under the working directory; each one holds a different
# number of genomes.
# NOTE(review): list.dirs() recurses by default, so nested sub-folders (if any)
# would be listed as well -- confirm the tree is only one level deep.
folder_list<- list.dirs(".", full.names = FALSE)
# Drop the first entry, which stands for the parent folder itself.
folder_list<- folder_list[-1]
# Create two vectors to populate: one with the genome file names and one with
# the species names (same as the folder names).
genomes<-NULL
species<- NULL
# Loop over the folders to populate both vectors.
# NOTE(review): `dir` takes the integer values 1..length(folder_list), not the
# folder names, so every use of `dir` below works with an index such as 1 or 2
# rather than "Genus_species_1". Also, 1:length(x) yields c(1, 0) when
# folder_list is empty.
for (dir in 1:length(folder_list)){
# NOTE(review): `WD` is not defined anywhere in this snippet -- presumably it is
# meant to be the parent folder path; verify. Combined with the integer `dir`,
# this builds paths like file.path(WD, 1) instead of file.path(WD, folder_name).
files<- as.vector(list.files(file.path(WD, dir))) # Vector containing all the genome file names
genomes<- c(genomes, files) # append the file names found above to the genomes vector
# Next, build a vector that repeats the folder (i.e. the species) once per
# genome file found in it.
# NOTE(review): this repeats the integer index `dir`, not the species name.
directories<-rep(dir, length(list.files(file.path(WD, dir))))
species<- append(species, directories) # append it to the species vector
} # end of the loop
希望有人可以提供帮助!
提前谢谢!
答案 0 :(得分:0)
我对您的文件结构进行了简化。
-Parent_Folder
--Genus_Species_1
---genome_filename1.txt
---genome_filename2.txt
---genome_filename3.txt
---genome_filename4.txt
--Genus_Species_2
---genome_filename1.txt
---genome_filename2.txt
---genome_filename3.txt
下面的代码使用 here 包(有关该软件包的更多信息,请参阅其文档)。从本质上讲,它使构造路径更容易(在我看来)。
library(here)
# Build a two-column data frame (Genus, Genome) with one row per genome file
# found under Parent_Folder/<genus directory>/.

# Save the names of the genus directories (the entries directly under
# Parent_Folder).
GenusDirs <- list.files(here("Parent_Folder"))
GenusDirs

# For each genus directory, list the genome files it contains. The result is a
# list with one character vector per genus, named after the genus.
# (The original loop indexed an undefined `upperDir`; it must be `GenusDirs`.)
GenomeFiles <- lapply(GenusDirs, function(genus) {
  list.files(here("Parent_Folder", genus))
})
names(GenomeFiles) <- GenusDirs

# Repeat each genus name once per genome file in that genus directory, and
# flatten the per-genus file lists in the same order, so the two vectors line
# up element by element. This is vectorised, so it also copes with zero genus
# directories or an empty genus directory.
GenusNames <- rep(GenusDirs, times = lengths(GenomeFiles))
GenomeNames <- as.character(unlist(GenomeFiles, use.names = FALSE))

# Create the data frame
GenusGenomeData <- data.frame(
  Genus = GenusNames,
  Genome = GenomeNames
)
答案 1 :(得分:0)
我重新创建了目录结构,然后创建了一个data.frame,其中第1列列出了所有子目录,第2列列出了每个子目录中的文件。
首先,这是用于复制目录结构的代码。
# Recreate the example directory tree:
#   Parent_folder/Genus_species_<1..3>/genome_filename_<1..3>.R

# Create the parent folder (stay silent if it already exists, so the script
# can be re-run).
dir.create("Parent_folder", showWarnings = FALSE)

# Create the child directories and their files in one go. Plain `for` loops
# are used because this is done purely for its side effects; the nested
# sapply() calls it replaces would also auto-print a logical matrix.
sample_dirnames <- seq_len(3)
for (index in sample_dirnames) {
  # Build the species directory path and create it
  species_dir <- file.path("Parent_folder", paste0("Genus_species_", index))
  dir.create(species_dir, showWarnings = FALSE)

  # Generate three empty files in the current species directory
  for (n in seq_len(3)) {
    file.create(file.path(species_dir, paste0("genome_filename_", n, ".R")))
  }
}
以 list.files 函数为起点,并使用 recursive = TRUE。然后将输出通过管道传递到 data.frame,并将路径拆分为目录和文件名。此示例使用 dplyr 包中的函数和基础函数(substring、gregexpr)。
# pkg
library(tidyverse)
# Build object: one row per file found under Parent_folder/, with the
# sub-directory in `column1` and the file name in `column2`.
files <- list.files("Parent_folder/", recursive = TRUE) %>%
  # list.files() returns paths relative to Parent_folder/, i.e.
  # "<sub-directory>/<file name>"; hold them in a column called `path`
  data.frame(path = .) %>%
  mutate(
    # Position of the first "/" in EACH path. gregexpr() returns one match
    # vector per path, so the first hit is extracted per element; taking
    # `gregexpr(...)[1]` would reuse the first row's position for every row
    # and mis-split paths whose directory names differ in length.
    # A path without "/" gives -1, which leaves column1 empty and puts the
    # whole path in column2.
    slash_pos = vapply(
      gregexpr(pattern = "/", text = path, fixed = TRUE),
      function(hits) hits[1],
      integer(1)
    ),
    # everything before the first "/": the sub-directory name
    column1 = substring(text = path, first = 1, last = slash_pos - 1),
    # everything after the first "/": the file name
    column2 = substring(text = path, first = slash_pos + 1)
  ) %>%
  # drop the helper columns, keeping only column1 and column2
  select(-path, -slash_pos)
这将打印出以下对象
files
# column1 column2
# 1 Genus_species_1 genome_filename_1.R
# 2 Genus_species_1 genome_filename_2.R
# 3 Genus_species_1 genome_filename_3.R
# 4 Genus_species_2 genome_filename_1.R
# 5 Genus_species_2 genome_filename_2.R
# 6 Genus_species_2 genome_filename_3.R
# 7 Genus_species_3 genome_filename_1.R
# 8 Genus_species_3 genome_filename_2.R
# 9 Genus_species_3 genome_filename_3.R
答案 2 :(得分:0)
可以尝试使用 stringr 和 dplyr 软件包:
library(stringr)
library(dplyr)
# List every file below the working directory, recursing into sub-folders.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
file_list <- list.files(".", recursive = TRUE)
> file_list
[1] "Genus_species_1/genome_filename_1" "Genus_species_1/genome_filename_2" "Genus_species_1/genome_filename_n"
[4] "Genus_species_2/genome_filename_1" "Genus_species_2/genome_filename_2" "Genus_species_2/genome_filename_n"
[7] "Genus_species_n/genome_filename_1" "Genus_species_n/genome_filename_2" "Genus_species_n/genome_filename_n"
使用 stringr 包中的 str_split_fixed 函数将 file_list 变量拆分成矩阵,然后使用 dplyr 中的管道 %>% 将其保存为数据帧 df