我有超过10000个文件。我首先将我的目录设置为文件所在的文件夹。
然后我列出所有.txt
格式的文件,如下所示:
# List every .txt file in the data folder, returning full paths.
# `pattern` is a regular expression, not a shell glob: "*.txt" is not a
# well-formed regex and would also match names such as "notes.txt.bak", so
# anchor on a literal ".txt" suffix instead.
filenames <- list.files(
  "path to the file",
  pattern = "\\.txt$",
  full.names = TRUE
)
然后我用fread
# Read each listed file with fread(), treating the first row as column names;
# the result is a list with one element per entry of `filenames`.
ldf <- lapply(filenames, function(path) fread(path, header = TRUE))
为什么用fread?实际上,当我使用data.table
时,它让人困惑,例如,我必须添加sep=","
和row.names=FALSE
等。如果您知道更好的方法,请提出建议。无论如何
在我这样做之后,我最终获得了一个巨大的列表,我现在需要从中提取数据。例如,我试图在下面制作代表性数据
当然,在实际数据中,每个文件中有更多列,只有三个名为check
和myfile
以及Myname
现在我尝试通过以下命令仅保留列myfile
和Myname
。
# Keep only the 2nd and 3rd columns (myfile and myname) of every table.
# The tables are data.tables, and a single unnamed index on a data.table
# selects ROWS (`[`(dt, c(2, 3)) is dt[c(2, 3)], i.e. rows 2 and 3), so the
# columns have to be requested through `j` with `with = FALSE`.
# NOTE: the result name `t` is the same as base::t(); it is kept unchanged so
# that any later code referring to `t` keeps working.
t <- lapply(ldf, function(dt) dt[, c(2, 3), with = FALSE])
# Representative sample of the loaded list: four tables (6, 7, 11 and 11 rows),
# each carrying the classes "data.table" and "data.frame" and the columns
# check, myfile and myname. In the second table myfile is entirely NA, and its
# third column is written as `Myname` inside list() but the `.Names` attribute
# sets the final name to "myname".
my.list <- list(structure(list(check = c(FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE), myfile = c("", "1xLabel:13C(6)15N(4) [R11]", "1xOxidation [M7]",
"", "1xLabel:13C(6)15N(4) [R11]", ""), myname = c("Q9Y383", "Q9Y383",
"Q9Y383", "Q15366-2", "Q15366-2", "Q15366-2")), .Names = c("check",
"myfile", "myname"), row.names = c(NA, -6L), class = c("data.table",
"data.frame")), structure(list(
check = c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
), myfile = c(NA, NA, NA, NA, NA, NA, NA), Myname = c("F8W727",
"O76021", "P46783", "P35527", "Q96C45", "Q9Y383", "Q9Y383"
)), .Names = c("check", "myfile", "myname"), row.names = c(NA,
-7L), class = c("data.table", "data.frame")),
structure(list(check = c(FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE), myfile = c("",
"2xLabel:13C(6)15N(4) [R6; R8]; 1xCarbamidomethyl [C4]",
"", "", "", "1xCarbamidomethyl [C1]", "", "", "", "", "1xLabel:13C(6)15N(4) [R6]; 1xCarbamidomethyl [C5]"
), myname = c("P39019", "A2A3R5; P62753", "Q8IYB3; E9PCT1; M0R088; A9Z1X7; Q8IYB3-2",
"S4R3J4; O43390-3; B4DT28; O43390; O43390-2; O60506; O60506-2; E7ETM7",
"P07910-4; B4DY08; G3V4C1; P07910-2; G3V4W0; P07910; G3V5V7; P07910-3; G3V2D6; G3V2Q1",
"D6R9X9; D6RG19; P61927", "Q00839", "G3XAD8; H0YGI8; P31948; F5H0T1",
"Q8IYB3; E9PCT1; M0R088; A9Z1X7; Q8IYB3-2", "P42766", "Q9NX58; D6RDJ1"
)), .Names = c("check", "myfile", "myname"), row.names = c(NA,
-11L), class = c("data.table", "data.frame")),
structure(list(check = c(FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE), myfile = c("",
"", "", "", "1xLabel:13C(6)15N(4) [R7]", "", "", "", "3xLabel:13C(6)15N(4) [R1; R7; R10]",
"", ""), myname = c("P61247", "P39019", "Q9NWH9", "P62917",
"P62917", "E9PCT1", "Q15149", "Q14152", "Q14152", "Q15020",
"Q02543")), .Names = c("check", "myfile", "myname"), row.names = c(NA,
-11L), class = c("data.table", "data.frame")))
我想要什么?
我想检查我加载的所有文件中是否都有myfile和myname这两列,然后得到如下形式的输出:
file1 file2 file3 file4
myfile myname myfile myname myfile myname myfile myname
info info info info info info info info
为了使问题更具可重现性,我希望示例数据的输出如下所示:
# Expected output for the sample data: one data.frame with 11 rows and a
# myfile<i>/Myname<i> column pair per input table (i = 1..4). Tables shorter
# than 11 rows are padded with NA, and empty myfile strings appear as NA.
# Every column is a factor except myfile2, which is a plain all-NA vector.
myout<- structure(list(myfile1 = structure(c(NA, 1L, 2L, NA, 1L, NA,
NA, NA, NA, NA, NA), .Label = c("1xLabel:13C(6)15N(4) [R11]",
"1xOxidation [M7]"), class = "factor"), Myname1 = structure(c(2L,
2L, 2L, 1L, 1L, 1L, NA, NA, NA, NA, NA), .Label = c("Q15366-2",
"Q9Y383"), class = "factor"), myfile2 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Myname2 = structure(c(1L, 2L, 4L, 3L,
5L, 6L, 6L, NA, NA, NA, NA), .Label = c("F8W727", "O76021", "P35527",
"P46783", "Q96C45", "Q9Y383"), class = "factor"), myfile3 = structure(c(NA,
3L, NA, NA, NA, 1L, NA, NA, NA, NA, 2L), .Label = c("1xCarbamidomethyl [C1]",
"1xLabel:13C(6)15N(4) [R6]; 1xCarbamidomethyl [C5]", "2xLabel:13C(6)15N(4) [R6; R8]; 1xCarbamidomethyl [C4]"
), class = "factor"), Myname3 = structure(c(5L, 1L, 8L, 10L,
4L, 2L, 7L, 3L, 8L, 6L, 9L), .Label = c("A2A3R5; P62753", "D6R9X9; D6RG19; P61927",
"G3XAD8; H0YGI8; P31948; F5H0T1", "P07910-4; B4DY08; G3V4C1; P07910-2; G3V4W0; P07910; G3V5V7; P07910-3; G3V2D6; G3V2Q1",
"P39019", "P42766", "Q00839", "Q8IYB3; E9PCT1; M0R088; A9Z1X7; Q8IYB3-2",
"Q9NX58; D6RDJ1", "S4R3J4; O43390-3; B4DT28; O43390; O43390-2; O60506; O60506-2; E7ETM7"
), class = "factor"), myfile4 = structure(c(NA, NA, NA, NA, 1L,
NA, NA, NA, 2L, NA, NA), .Label = c("1xLabel:13C(6)15N(4) [R7]",
"3xLabel:13C(6)15N(4) [R1; R7; R10]"), class = "factor"), Myname4 = structure(c(3L,
2L, 9L, 4L, 4L, 1L, 8L, 6L, 6L, 7L, 5L), .Label = c("E9PCT1",
"P39019", "P61247", "P62917", "Q02543", "Q14152", "Q15020", "Q15149",
"Q9NWH9"), class = "factor")), .Names = c("myfile1", "Myname1",
"myfile2", "Myname2", "myfile3", "Myname3", "myfile4", "Myname4"
), class = "data.frame", row.names = c(NA, -11L))
新请求
然后我想将数据拆分为两个数据框:一个只保留myfile中含有特定字符串的那些myname;另一个保留myfile为空或不含该特定字符串的那些myname。
df1
答案 0 :(得分:2)
这是一种方法。您可以在之后更改列名称,并做其他您喜欢的修饰。这里只解决您问题的核心部分,其余细节您可以按自己的喜好调整。我写了一个辅助函数struct
,它有三个参数:数据框、要添加的行数以及用来填充这些行的内容。
struct