如何遍历所有数据集(并确定其列数)?

时间:2015-08-17 16:57:43

标签: r

我想循环遍历所有可用(即已安装)软件包中的数据集,找出哪些数据集有 6 列或更多列。这是我的尝试:

# Debugging version of the scan: the `for` loop is commented out and `i` is
# pinned to a single row so that the failure can be inspected in isolation.
dat.list <- data(package=.packages(all.available=TRUE))$results # one row per data set found in the installed packages
colnames(dat.list) # "Package" "LibPath" "Item" (= name of data set) "Title" (= description)
idx <- c() # collects the row indices of data sets with at least 6 columns
i <- 3 # fixed row index used while debugging
## NOTE: if re-enabled as written, this loop runs once, for the single value
## nrow(dat.list), rather than once per row.
## for(i in nrow(dat.list)) {
    nme <- dat.list[[i,"Item"]] # raw "Item" string; it can carry a " (file)" suffix, as the warning below shows
    data(list=nme, package=dat.list[[i,"Package"]]) # try to load the data under that raw name
    ## => fails with warning: In data(list = nme, package = dat.list[[i, "Package"]]) :
    ##    data set 'BJsales.lead (BJsales)' not found
    dat <- eval(as.name(nme)) # look up the object called `nme` and store it in dat
    ncl <- ncol(dat) # NULL for objects without a dim attribute
    if(!is.null(ncl) && ncl >= 6) idx <- c(idx, i) # remember rows with 6 or more columns
## }
显然,这不起作用,所以我把索引固定为某个值(这里是 3),以查看失败的原因。如果不能通过上面的 nme,我该如何确定数据集的名称,以便把数据集存入变量,然后获取其列数?

更新:结合 jeremycg 和 nico 的回答,我得到了下面的代码(同样,在确定数据集名称方面并不完美),但它能完成任务:

# Find every data set, across all installed packages, that has at least six
# columns.
#
# `data(package = ...)$results` is a character matrix with one row per data
# set and the columns "Package", "LibPath", "Item" and "Title".
dat.list <- data(package = .packages(all.available = TRUE))$results

# Load the data set described by one row of `dat.list`.
#
# item:     the "Item" entry, either "object" or "object (file)"
# pkg:      name of the package that ships the data set
# lib.path: library directory in which that package is installed
#
# Returns the data object, or NULL when it could not be loaded.
load.dataset <- function(item, pkg, lib.path) {
    # The object name is everything before the first whitespace.
    obj.name <- sub("\\s.*$", "", item)
    # data() locates data by *file* name, which is the part in parentheses
    # for "object (file)" items. Items without a suffix do not match the
    # pattern and are returned unchanged, i.e. file name == object name.
    file.name <- sub("^.*\\((.+)\\)\\s*$", "\\1", item)
    # Load into a private environment: nothing is attached to the search
    # path, the workspace is not polluted, and the lookup below cannot pick
    # up an unrelated object (a function or a loop variable) of the same name.
    env <- new.env()
    tryCatch(
        # data() only warns when a data set is missing; that case is detected
        # through the NULL result below, so its warning is redundant here.
        suppressWarnings(
            data(list = file.name, package = pkg, lib.loc = lib.path,
                 envir = env)
        ),
        error = function(e) NULL
    )
    get0(obj.name, envir = env, inherits = FALSE)
}

# One flag per row, preallocated instead of growing `idx` inside the loop.
is.wide <- logical(nrow(dat.list))
for (i in seq_len(nrow(dat.list))) {
    dat <- load.dataset(dat.list[i, "Item"],
                        dat.list[i, "Package"],
                        dat.list[i, "LibPath"])
    if (is.null(dat)) {
        warning("Element ", i, " threw an error")
        next
    }
    # ncol() is NULL for plain vectors and NA for 1-d arrays; isTRUE() maps
    # both to FALSE instead of failing inside `if`.
    is.wide[i] <- isTRUE(ncol(dat) >= 6)
}
idx <- which(is.wide)
dat.list[idx, c("Package", "Item")]

1 个答案:

答案 0 :(得分:2)

我猜你需要加载包来访问数据。

所以你需要在循环的开头添加:

# Attach the package that owns row i of dat.list. character.only = TRUE makes
# require() treat its first argument as a character string holding the package
# name, rather than as the literal name of a package.
require(dat.list[[i, "Package"]], character.only = TRUE)

(关于为什么需要使用 character.only 参数,请参阅 this question)

请注意,您还需要修改循环,把下面的第一种写法改成第二种写法(nrow(dat.list) 只是一个数,所以第一种写法的循环只会执行一次):

for(i in nrow(dat.list))

for(i in 1:nrow(dat.list))

还有另一个问题:返回一些数据集,其名称也在括号中。例如:

wine.classes (wine)

所以我们需要把括号及其中的内容去掉。用下面这行代码就可以轻松完成:

# Keep only the object name of each item, i.e. the text before the first
# space, dropping the " (file)" suffix that some items carry. vapply()
# guarantees a character result whatever the input looks like.
dat.list[, "Item"] <- vapply(strsplit(dat.list[, "Item"], " ", fixed = TRUE),
                             function(x) x[1], character(1))

最后,dat.list可以使用[]进行访问,无需[[]](更易于阅读!)。

所以,最后:

# Find every data set, across all installed packages, that has at least six
# columns.
#
# `data(package = ...)$results` is a character matrix with one row per data
# set and the columns "Package", "LibPath", "Item" and "Title".
dat.list <- data(package = .packages(all.available = TRUE))$results

# Items are either "object" or "object (file)". data() looks data up by file
# name, so remember it before the suffix is removed. Items without a suffix
# do not match the pattern and are kept unchanged (file name == object name).
file.names <- sub("^.*\\((.+)\\)\\s*$", "\\1", dat.list[, "Item"])

# Remove the file name in parentheses, leaving only the object name
dat.list[, "Item"] <- sub("\\s.*$", "", dat.list[, "Item"])

# TRUE for each row whose data set has six or more columns.
is.wide <- vapply(seq_len(nrow(dat.list)), function(i) {
    # Load into a throw-away environment: no package is attached and the
    # workspace (including dat.list itself) cannot be overwritten by a data
    # set that happens to share a name with one of our variables.
    env <- new.env()
    tryCatch(
        # A missing data set only produces a warning from data(); it is
        # handled by the NULL result of get0() below.
        suppressWarnings(
            data(list = file.names[i],
                 package = dat.list[i, "Package"],
                 lib.loc = dat.list[i, "LibPath"],
                 envir = env)
        ),
        error = function(e) NULL
    )
    dat <- get0(dat.list[i, "Item"], envir = env, inherits = FALSE)
    # ncol() is NULL for NULL and plain vectors and NA for 1-d arrays;
    # isTRUE() turns all of those into FALSE.
    isTRUE(ncol(dat) >= 6)
}, logical(1))

idx <- which(is.wide)

> dat.list[idx, "Item"]
 [1] "Seatbelts"          "USJudgeRatings"     "WorldPhones"        "airquality"        
 [5] "anscombe"           "attitude"           "crimtab"            "euro.cross"        
 [9] "infert"             "longley"            "mtcars"             "occupationalStatus"
[13] "state.x77"          "swiss"              "volcano"            "car.test.frame"    
[17] "car90"              "solder"             "stagec"             "bladder"           
[21] "bladder1"           "bladder2"           "cancer"             "cgd"               
[25] "cgd0"               "colon"              "flchain"            "heart"             
[29] "jasa"               "jasa1"              "kidney"             "lung"              
[33] "mgus"               "mgus1"              "mgus2"              "nwtco"             
[37] "ovarian"            "pbc"                "pbcseq"             "rats2"             
[41] "transplant"         "veteran"            "soldat"             "patch"             
[45] "tooth"