是否有list.files(path, recursive=TRUE)
的ftp版本?
我想获取此FTP服务器上子目录中ZIP-Archieves的所有URL
url <- "ftp://ftp-cdc.dwd.de/pub/CDC/observations_germany/climate/hourly/"
所以我想得到这个目录中所有文件的列表:
ftp://ftp-cdc.dwd.de/pub/CDC/observations_germany/climate/hourly/wind/recent/
以及
ftp://ftp-cdc.dwd.de/pub/CDC/observations_germany/climate/hourly/air_temperature/historical/
等等
使用RCurl
我设法下载了此目录的dirlist,但是没有获得所有子目录中所有zip-archieves的完整列表。
除了通过目录循环并逐个获取dirlists之外的任何建议吗?
RCurl代码:
dwd_dirlist <- function(url, full = TRUE){
dir <- unlist(
strsplit(
getURL(url,
ftp.use.epsv = FALSE,
dirlistonly = TRUE),
"\n")
)
if(full) dir <- paste0(url, dir)
return(dir)
}
答案 0 :(得分:10)
如果系统上安装了lftp
实用程序,则可以使用其find
命令以递归方式列出指定目录下的文件。这是link to the documentation; find
的说明靠近顶部。
不幸的是,正如您从文档中看到的那样,与普通的Unix find
实用程序不同,lftp
&#39; find
命令不支持很多选项一点都没仅限--max-depth
和--list
(对于长列表),因此您无法使用-name
,-regex
等谓词find
实用程序通常提供另一方面,lftp
确实支持一个非常不寻常但功能强大的功能,因为它允许您将输出管道传输到本地工具,因此您可以将find
输出管道传输到本地grep
。 1}}来自lftp
命令行。当然,没有什么可以阻止你在shell管道中进行grepping,或者在Rland中进行过滤。这是一个使用lftp
管道的示例(正如您所看到的,这种方法的一个缺点是多级转义变得相当复杂):
url <- 'ftp://ftp-cdc.dwd.de/pub/CDC/observations_germany/climate/hourly/';
zips <- system(paste0('lftp ',url,' <<<\'find| grep "\\\\.zip$"; exit;\';'),intern=T);
zips;
## [1] "./air_temperature/historical/stundenwerte_TU_00003_19500401_20110331_hist.zip"
## [2] "./air_temperature/historical/stundenwerte_TU_00044_20070401_20141231_hist.zip"
## [3] "./air_temperature/historical/stundenwerte_TU_00052_19760101_19880101_hist.zip"
## [4] "./air_temperature/historical/stundenwerte_TU_00071_20091201_20141231_hist.zip"
##
## ... snip ...
##
## [6616] "./wind/recent/stundenwerte_FF_15207_akt.zip"
## [6617] "./wind/recent/stundenwerte_FF_15214_akt.zip"
## [6618] "./wind/recent/stundenwerte_FF_15444_akt.zip"
## [6619] "./wind/recent/stundenwerte_FF_15520_akt.zip"
另外,只是为了它,如果你想要另一种方法,我已经编写了一个函数,它可以使用正则表达式解析ls -l
列表的输出,返回数据中的所有字段。帧。一个简单的修改允许它使用lftp
:
longListing <- function(url='',recursive=F,all=F) {
## returns a data.frame of long-listing fields
## requires lftp for ftp support
## validate arguments
url <- as.character(url);
if (length(url) != 1L) stop('url argument must have length 1.');
recursive <- as.logical(recursive);
if (length(recursive) != 1L) stop('recursive argument must have length 1.');
all <- as.logical(all);
if (length(all) != 1L) stop('all argument must have length 1.');
## escape and single-quote url, or leave empty for pwd if empty
urlEsc <- if (url == '') '' else paste0('\'',sub("'","'\\''",url),'\'');
## construct ls command with options; identical between local ls and lftp ls
## technically lftp ls doesn't require -l to get a long listing, but it accepts it
lsCmd <- paste0('ls -l',if (recursive) ' -R',if (all) ' -A');
## run system command to get long-listing output lines
if (substr(url,0L,6L) == 'ftp://') { ## ftp
output <- system(paste0('lftp ',urlEsc,' <<<\'',lsCmd,'; exit;\';'),intern=T);
} else { ## local
output <- system(paste0(lsCmd,' ',urlEsc,';'),intern=T);
}; ## end if
## define regexes for parsing the output
## note: accept question marks for items whose metadata cannot be read
sp0RE <- '\\s*';
sp1RE <- '\\s+';
typeRE <- '([?dlcbps-])';
rRE <- '([?r-])';
wRE <- '([?w-])';
xRE <- '([?xsStT-])';
aclRE <- '([?+@]*)';
permRE <- paste0(typeRE,rRE,wRE,xRE,rRE,wRE,xRE,rRE,wRE,xRE,aclRE);
linksRE <- '(\\?|[0-9]+)';
ocRE <- '[a-zA-Z_0-9.$+-]';
ocsRE <- '[a-zA-Z_0-9 .$+-]'; ## badly-behaving names can have spaces; non-greedy will prevent excessive gobbling
ownerRE <- paste0('(\\?|',ocRE,'|',ocRE,ocsRE,'*?',ocRE,')');
groupRE <- ownerRE; ## same compatibility rules as owner
sizeRE <- '(?:\\?|(?:([0-9]+),\\s*)?([0-9]+))'; ## major, minor for special files, plain size for rest
monthRE <- '(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)';
dayRE <- '([0-9]+)';
timeRE <- '([0-9]{2}:[0-9]{2}|[0-9]+)'; ## could be year
dtRE <- paste0('(?:\\?|',monthRE,sp1RE,dayRE,sp1RE,timeRE,')');
nameRE <- '(.*?)'; ## make non-greedy to allow target to be captured, if present
targetRE <- '(?:\\s+->\\s+(.*))?'; ## target is optional; shown on some platforms, e.g. Cygwin
recordRE <- paste0(
'^'
,permRE,sp1RE
,linksRE,sp1RE
,ownerRE,sp1RE
,groupRE,sp1RE
,sizeRE,sp1RE
,dtRE,sp1RE
,nameRE,targetRE ## target is optional; targetRE defines its own whitespace separation
,sp0RE,'$' ## ignore trailing whitespace
);
## get indexes of listing records
recordIndexes <- grep(recordRE,output);
## get indexes of blanks and directory headers for maximally robust matching
blankIndexes <- grep('^\\s*$',output);
headerIndexes <- grep(':$',output); ## questionable specificity
## pare headers down to those with preceding blank
headerIndexes <- headerIndexes[(headerIndexes-1)%in%c(0L,blankIndexes)]; ## include zero for possible first-line header
## match recordIndexes into headerIndexes to look up parent path; direct children will be zero
recordHeaderIndexes <- findInterval(recordIndexes,headerIndexes);
## derive parent paths with trailing slash, or empty string for direct children
parentPaths <- c('',sub(':','/',output[headerIndexes]))[recordHeaderIndexes+1L];
parentPaths <- sub('^\\./','',parentPaths); ## for aesthetics
## match record lines and extract capture groups
reg <- regmatches(output[recordIndexes],regexec(recordRE,output[recordIndexes]));
## build data.frame with reg fields
ret <- data.frame(type=sapply(reg,`[`,2L),stringsAsFactors=F); ## start with type to set the row count
i <- 3L;
## note: size is actually minor for character- and block-special files
for (cn in c('ur','uw','ux','gr','gw','gx','or','ow','ox','acl','links','owner','group','major','size','month','day','time','path','target')) {
ret[[cn]] <- sapply(reg,`[`,i);
i <- i+1L;
}; ## end for
## prepend parent paths to listing paths
ret$path <- paste0(parentPaths,ret$path);
ret;
}; ## end longListing()
这是我在我的系统上创建的特殊文件目录上的演示:
longListing();
## type ur uw ux gr gw gx or ow ox acl links owner group major size month day time path target
## 1 d r w x r - - r - - + 1 user None 0 Feb 27 08:21 dir
## 2 d r w x r w x r w x + 1 user None 0 Feb 27 08:21 dir-other-writable
## 3 d r w x r - - r - T + 1 user None 0 Feb 27 08:21 dir-sticky
## 4 d r w x r w x r w t + 1 user None 0 Feb 27 08:21 dir-sticky-other-writable
## 5 - r w - r - - r - - 2 user None 0 Feb 27 08:21 file
## 6 - r w - r - - r - - 1 user None 0 Feb 27 08:21 file-archive.tar
## 7 - r w - r - - r - - 1 user None 0 Feb 27 08:21 file-audio.mp3
## 8 b r w - r w - r w - 1 user None 0 1 Feb 27 08:21 file-block-special
## 9 c r w - r w - r w - 1 user None 0 1 Feb 27 08:21 file-character-special
## 10 - r w x r w x r w x 1 user None 12 Feb 27 08:21 file-exe
## 11 p r w - r w - r w - 1 user None 0 Feb 27 08:21 file-fifo
## 12 - r w - r - - r - - 1 user None 0 Feb 27 08:21 file-image.bmp
## 13 - r w - r w S r - - 1 user None 0 Feb 27 08:21 file-setgid
## 14 - r w x r w s r - x 1 user None 0 Feb 27 08:21 file-setgid-exe
## 15 - r w S r w - r - - 1 user None 0 Feb 27 08:21 file-setuid
## 16 - r w s r w x r - x 1 user None 0 Feb 27 08:21 file-setuid-exe
## 17 s r w - r w - r - - 1 user None 0 Feb 27 08:21 file-socket
## 18 l r w x r w x r w x 1 user None 4 Feb 27 08:21 ln-existing file
## 19 - r w - r - - r - - 2 user None 0 Feb 27 08:21 ln-hard
## 20 l r w x r w x r w x 1 user None 17 Feb 27 08:21 ln-non-existing file-non-existing
您网站上的演示:
url <- 'ftp://ftp-cdc.dwd.de/pub/CDC/observations_germany/climate/hourly/';
ll <- longListing(url,T,T);
ll;
## type ur uw ux gr gw gx or ow ox acl links owner group major size month day time path target
## 1 d r w x r w x - - x 4 32230 ftp-dwd 4096 Jun 5 2014 air_temperature
## 2 d r w x r w x - - x 4 32230 ftp-dwd 4096 Sep 25 2014 cloudiness
## 3 d r w x r w x - - x 4 32230 ftp-dwd 4096 Nov 13 2014 precipitation
## 4 d r w x r w x - - x 4 32230 ftp-dwd 4096 Nov 13 2014 pressure
## 5 d r w x r w x - - x 4 32230 ftp-dwd 4096 Jun 5 2014 soil_temperature
## 6 d r w x r w x - - x 2 32230 ftp-dwd 12288 Dec 15 11:52 solar
## 7 d r w x r w x - - x 4 32230 ftp-dwd 4096 Jun 5 2014 sun
## 8 d r w x r w x - - x 4 32230 ftp-dwd 4096 Apr 17 2015 wind
## 9 d r w x r w x - - x 2 32230 ftp-dwd 114688 Oct 15 12:35 air_temperature/historical
## 10 d r w x r w x - - x 2 32230 ftp-dwd 151552 Dec 4 10:28 air_temperature/recent
## 11 - r w - r w - - - - 1 32230 ftp-dwd 68727 Jan 26 09:55 air_temperature/historical/BESCHREIBUNG_obsgermany_climate_hourly_tu_historical_de.pdf
## 12 - r w - r w - - - - 1 32230 ftp-dwd 68600 Jan 26 09:55 air_temperature/historical/DESCRIPTION_obsgermany_climate_hourly_tu_historical_en.pdf
## 13 - r w - r w - - - - 1 32230 ftp-dwd 123634 Mar 27 2015 air_temperature/historical/TU_Stundenwerte_Beschreibung_Stationen.txt
## 14 - r w - r w - - - - 1 32230 ftp-dwd 2847045 Mar 27 2015 air_temperature/historical/stundenwerte_TU_00003_19500401_20110331_hist.zip
## 15 - r w - r w - - - - 1 32230 ftp-dwd 359517 Mar 27 2015 air_temperature/historical/stundenwerte_TU_00044_20070401_20141231_hist.zip
##
## ... snip ...
##
## 6683 - r w - r w - - - - 1 32230 ftp-dwd 65633 Feb 27 10:26 wind/recent/stundenwerte_FF_15207_akt.zip
## 6684 - r w - r w - - - - 1 32230 ftp-dwd 66910 Feb 27 10:21 wind/recent/stundenwerte_FF_15214_akt.zip
## 6685 - r w - r w - - - - 1 32230 ftp-dwd 64525 Feb 27 10:19 wind/recent/stundenwerte_FF_15444_akt.zip
## 6686 - r w - r w - - - - 1 32230 ftp-dwd 23717 Feb 27 10:21 wind/recent/stundenwerte_FF_15520_akt.zip
您可以轻松地提取zip文件名:
zips <- ll$path[ll$type=='-' & grepl('\\.zip$',ll$path)];
length(zips);
## [1] 6619