使用R,检索文件名中包含特定字符串的文件

时间:2015-01-12 07:00:26

标签: regex r

我有来自某个目录的数千个文件:

# List every .DAT file in the source directory with its full path.
# `pattern` is a regular expression, not a shell glob, so the extension is
# matched with an escaped dot and anchored at the end of the name.
filenames <- list.files("D:/MessData_Source", pattern = "\\.DAT$", full.names = TRUE)
.....
.....

     [9998] "D:/MessData_Source/908-A0F7__01310012567794F.DAT" 
     [9999] "D:/MessData_Source/908-A0F7__01310015662858F.DAT" 
    [10000] "D:/MessData_Source/908-A0F7__01310015662859F.DAT"

....
....

在超过1000个文件中,我只需要提取那些文件名包含某些字符串的文件。
例如:

# Pull the PartNo + MoNo key (characters 31 to 43 of the full path) out of
# every file name. substr() is vectorized over its input, so no explicit loop
# is needed, and an empty `filenames` simply yields an empty result.
# as.list() stores one string per list element.
filename_extracted <- as.list(substr(filenames, 31, 43))

上面的代码从每个文件名(完整路径)中提取第31到第43个字符,并将其存储在filename_extracted中,如下所示:

 [[9993]]
 [1] "1856955908850"

 [[9994]]
 [1] "1856955933372"

 [[9995]]
 [1] "1856955933372"

 [[9996]]
 [1] "1856955954613"

 [[9997]]
 [1] "1856955954613"

 [[9998]]
 [1] "1310012567794"

 [[9999]]
 [1] "1310015662858"

 [[10000]]
 [1] "1310015662859"

接下来,我需要将filename_extracted与我的必需列表进行比较,并将这些匹配的文件复制到另一个目录。

# Build the list of required keys by concatenating the first two columns
# (PartNo and MoNo) of every row in the CSV.
df <- read.csv("PartNo_MoNo.csv")  # full set
# paste0() is vectorized over rows, so all rows are covered; length(df) would
# only give the number of columns. as.list() stores one key per list element.
required_list <- as.list(paste0(df[, 1], df[, 2]))
> required_list
[[1]]
[1] "1235235987252"

[[2]]
[1] "1897865985468"

如果required_list和filename_extracted之间存在匹配,我想将匹配的文件复制到另一个目录,我该怎么办?

感谢。

2 个答案:

答案 0 :(得分:1)

这是更新的代码,完全矢量化:

# Key embedded in each file name: characters 31 to 43 of the full path.
filename_extracted <- substr(filenames, start = 31, stop = 43)
# Required keys: first two columns of `df` pasted together, one per row.
required_list <- paste0(df[, 1], df[, 2])

# Keep the files whose key is required. The names are taken straight from
# `filenames` instead of being rebuilt from prefix/suffix pieces, so every
# selected name is guaranteed to refer to an existing file.
is_required <- filename_extracted %in% required_list
commonFile <- basename(filenames[is_required])

storeDir <- "D:/MessData_Source"
otherDir <- "D:/OrderedData_Source"

if (length(commonFile) > 0) {
  # file.copy() is vectorized and returns one logical per file.
  copied <- file.copy(file.path(storeDir, commonFile), file.path(otherDir, commonFile))
  if (!all(copied)) {
    # Failures are reported rather than silently ignored (e.g. when otherDir
    # does not exist or a target file is already present).
    warning("Could not copy: ", paste(commonFile[!copied], collapse = ", "),
            call. = FALSE)
  }
}

在执行此操作之前,请确保已创建otherDir

答案 1 :(得分:0)

# Create data
library(stringr)

# Write ten matching and ten non-matching sample files into the working
# directory so the example is reproducible.
for (x in 1:10) {
  write.csv(head(iris), file = paste0("908-A0F7__", x, ".csv"))
  write.csv(head(iris), file = paste0("notused__", x, ".csv"))
}

# Only get files with correct pattern.
# The dot is escaped and the pattern is anchored so that only names ending in
# ".csv" match; the capture group holds the numeric part of the name.
pattern <- "908-A0F7__(\\d+)\\.csv$"
files <- data.frame(
  name = dir(pattern = pattern, full.names = TRUE),
  stringsAsFactors = FALSE
)
# Column 2 of str_match() is the first capture group.
# as.integer() is fine for these small demo numbers; numbers beyond the
# integer range would become NA, so keep long IDs as character instead.
files$num <- as.integer(str_match(files$name, pattern)[, 2])
required <- c(1, 3, 5) # You can also read this in from your csv

myFiles <- files[files$num %in% required, ]

# Copy the selected files into "copied", keeping only the base file name.
dir.create("copied", showWarnings = FALSE)
file.copy(myFiles$name, file.path("copied", basename(myFiles$name)))