如何找到具有某些特定属性的数据集?

时间:2017-09-14 16:10:53

标签: r

datasets 软件包和其他各种软件包附带了大量有用的数据集,但是当您需要为软件包示例、教学,或者在这里提问/回答问题寻找一个理想的数据集时,似乎没有简单的方法可以找到它。

比方说,我想要一个 data.frame 类的数据集,至少有 2 个 character 列,并且少于 100 行。

如何浏览每个可用的数据集并查看最多相关信息以供我选择?

我过去的尝试很混乱、很耗时,并且在一些对象结构不寻常的软件包(例如 caret)上遇到了问题。

4 个答案:

答案 0 :(得分:3)

我重写了 @eddi 的精彩回答:

  • 它现在是一个函数
  • 它不会使工作区混乱(只会加载 data.table)
  • 我缩短了列名并添加了更多列
  • 还会检查列表元素的类别
  • 我返回对象(Object)和数据集标题(Title)
  • 可以隐藏恼人的软件包警告

您只需运行此函数(前提是您已安装 data.table):

ds <- dataset_summary() # around 5 seconds if you have a lot of packages like me

它适用于曾经存在问题的caret包(请参阅编辑历史记录)

subset(ds,Package == "caret")
#     Package           Object             Item                                                   Title      class nrow ncol char fact ord num int list df
# 143   caret     GermanCredit     GermanCredit                                      German Credit Data data.frame 1000   62    0    1   0  54   7    0  0
# 144   caret       Sacramento       Sacramento                               Sacramento CA Home Prices data.frame  932    9    0    3   0   3   3    0  0
# 145   caret          tecator           absorp          Fat, Water and Protein Content of Meat Samples     matrix  215  100   NA   NA  NA  NA  NA   NA NA
# 146   caret       BloodBrain         bbbDescr                                Blood Brain Barrier Data data.frame  208  134    0    0   0 118  16    0  0
# 147   caret             cars             cars Kelly Blue Book resale data for 2005 model year GM cars data.frame  804   18    0    0   0   1  17    0  0
# 148   caret             cox2        cox2Class                                     COX-2 Activity Data     factor   NA   NA   NA   NA  NA  NA  NA   NA NA

易于子集化并查找特定数据集,会计算list项和data.frame列的类别。

subset(ds,class == 'list' & df > 0,select=-c(2,4))
#           Package           Item class nrow ncol char fact ord num int list df
# 225       ecodist       iris.fit  list   NA   NA    0    0   0   1   0    0  1
# 238 ElemStatLearn  orange10.test  list   NA   NA    0    0   0   0   0    0 50
# 239 ElemStatLearn orange10.train  list   NA   NA    0    0   0   0   0    0 50
# 240 ElemStatLearn   orange4.test  list   NA   NA    0    0   0   0   0    0 50
# 241 ElemStatLearn  orange4.train  list   NA   NA    0    0   0   0   0    0 50
# 346          lava    missingdata  list   NA   NA    0    0   0   0   0    0  4

工作区很干净

ls()
# [1] "dataset_summary" "ds"

除了data.table之外没有任何东西加载。

search()
# [1] ".GlobalEnv"         "package:data.table" "package:Matrix"     "package:sp"         "package:timeSeries" "package:timeDate"  
# [7] "tools:rstudio"      "package:stats"      "package:graphics"   "package:grDevices"  "package:utils"      "package:datasets"  
# [13] "package:methods"    "Autoloads"          "package:base"

函数

dataset_summary <- function(silent = TRUE){
  if(silent){
    w <- options()$warn
    options(warn = -1)
    on.exit(options(warn = w))
  }
  ws <- ls(envir=.GlobalEnv)
  library(data.table)
  dt = as.data.table(data(package = .packages(all.available = TRUE))$results)
  dt = dt[, `:=`(Item   = sub(' \\(.*', '', Item),
                 Object = sub('.*\\((.*)\\)', '\\1', Item))]

  df <- as.data.frame(dt[, { 
    data(list = Object, package = Package)
    d = eval(parse(text = Item))

    classes = if (sum(class(d) %in% c('data.frame','list')) > 0) unlist(lapply(d, class))
    else NA_integer_

    .(class    = paste(class(d), collapse = ","),
      nrow     = if (!is.null(nrow(d))) nrow(d) else NA_integer_,
      ncol     = if (!is.null(ncol(d))) ncol(d) else NA_integer_,
      char     = sum(classes == 'character'),
      fact     = sum(classes == 'factor'),
      ord      = sum(classes == 'ordered'),
      num      = sum(classes == 'numeric'),
      int      = sum(classes == 'integer'),
      list     = sum(classes == 'list'),
      df       = sum(classes == 'data.frame'))
  }
  , by = .(Package, Item)])
  rm(list=setdiff(ls(envir=.GlobalEnv),ws),envir=.GlobalEnv)
  df
} 

答案 1 :(得分:2)

根据自己的喜好进行扩展/修改。

library(data.table)
dt = as.data.table(data(package = .packages(all.available = TRUE))$results)
dt = dt[, `:=`(Item   = sub(' \\(.*', '', Item),
               Object = sub('.*\\((.*)\\)', '\\1', Item))]

dt[, { 
       data(list = Object, package = Package)
       d = eval(parse(text = Item))

       classes = if (sum(class(d) %in% c('data.frame')) > 0) unlist(lapply(d, class))
                 else NA_integer_

       .(class    = paste(class(d), collapse = ","),
         nrow     = if (!is.null(nrow(d))) nrow(d) else NA_integer_,
         ncol     = if (!is.null(ncol(d))) ncol(d) else NA_integer_,
         charCols = sum(classes == 'character'),
         facCols  = sum(classes == 'factor'))
     }
   , by = .(Package, Item)]
#      Package          Item                                               class nrow ncol charCols facCols
#  1: datasets AirPassengers                                                  ts   NA   NA       NA      NA
#  2: datasets       BJsales                                                  ts   NA   NA       NA      NA
#  3: datasets  BJsales.lead                                                  ts   NA   NA       NA      NA
#  4: datasets           BOD                                          data.frame    6    2        0       0
#  5: datasets           CO2 nfnGroupedData,nfGroupedData,groupedData,data.frame   84    5        0       3
# ---                                                                                                      
#492: survival    transplant                                          data.frame  815    6        0       3
#493: survival        uspop2                                               array  101    2       NA      NA
#494: survival       veteran                                          data.frame  137    8        0       1
#495:  viridis   viridis.map                                          data.frame 1024    4        1       0
#496:   xtable           tli                                          data.frame  100    5        0       3

答案 2 :(得分:1)

在 datasets 包中,没有任何 data.frame 类的数据集能满足您的条件。更确切地说,在属于 data.frame 类且最多有 100 行的数据集中,没有一个包含两个或更多 character 类的列。这是我用以下代码的第一个版本发现的。

library(datasets)
res <- library(help = "datasets")

dat <- unlist(lapply(strsplit(res$info[[2]], " "), '[[', 1))
dat <- dat[dat != ""]
df_names <- NULL
for(i in seq_along(dat)){
    d <- tryCatch(get(dat[i]), error = function(e) e)
    if(inherits(d, "data.frame")){
        if(nrow(d) <= 100){
            char <- sum(sapply(d, is.character))
            fact <- sum(sapply(d, is.factor))
            if(char >= 2 || fact >= 2){
                print(dat[i])
                df_names <- c(df_names, dat[i])
            }
        }
    }
}

df_names
[1] "CO2"        "esoph"      "npk"        "sleep"      "warpbreaks"

所以我必须加入额外的语句来处理 factor 类的列,因为默认情况下数据框是使用 stringsAsFactors = TRUE 创建的。如果这对您来说可以接受,那么符合条件的数据集名称就在向量 df_names 中。要使某个数据集在全局环境中可用,请对您想要的那个使用 get()。

答案 3 :(得分:1)

myfun() 返回的表可以用适当的条件进行过滤,数据集各列的类别可以通过 classes 列中给出的类来识别。

caret包的问题在于它没有任何数据框或矩阵对象。数据集可以存在于列表对象内的caret中。我不确定,caret包中的一些列表对象包含一系列函数。

此外,如果感兴趣,您可以使myfun()函数更具体,仅用于返回有关数据框或矩阵对象的信息。

myfun <- function( package )
{
  t( sapply( ls( paste0( 'package:', package ) ), function(x){
    y <- eval(parse(text = paste0( package, "::`", x, "`")))
    data.frame( data_class = paste0(class(y), collapse = ","), 
                nrow = ifelse( any(class(y) %in% c( "data.frame", "matrix" ) ),
                               nrow(y), 
                               NA_integer_ ),
                ncol = ifelse( any(class(y) %in% c( "data.frame", "matrix" ) ),
                               ncol(y),
                               NA_integer_),
                classes = ifelse( any(class(y) %in% c( "data.frame", "matrix" ) ),
                                  paste0( unlist(lapply(y, class)), collapse = "," ),
                                  NA),
                stringsAsFactors = FALSE )

  } ) )
}

library( datasets )
meta_data <- myfun( package = "datasets")
head(meta_data)
#               data_class   nrow ncol classes                                                          
# ability.cov   "list"       NA   NA   NA                                                               
# airmiles      "ts"         NA   NA   NA                                                               
# AirPassengers "ts"         NA   NA   NA                                                               
# airquality    "data.frame" 153  6    "integer,integer,numeric,integer,integer,integer"                
# anscombe      "data.frame" 11   8    "numeric,numeric,numeric,numeric,numeric,numeric,numeric,numeric"
# attenu        "data.frame" 182  5    "numeric,numeric,factor,numeric,numeric"  

meta_data[ "ChickWeight", ]
# $data_class
# [1] "nfnGroupedData,nfGroupedData,groupedData,data.frame"
# 
# $nrow
# [1] 578
# 
# $ncol
# [1] 4
# 
# $classes
# [1] "numeric,numeric,ordered,factor,factor"

library( 'caret' )
meta_data <- myfun( package = "caret")
#               data_class nrow ncol classes
# anovaScores   "function" NA   NA   NA     
# avNNet        "function" NA   NA   NA     
# bag           "function" NA   NA   NA     
# bagControl    "function" NA   NA   NA     
# bagEarth      "function" NA   NA   NA     
# bagEarthStats "function" NA   NA   NA 

如果在包上应用myfun()函数后需要卸载已加载的包,请尝试以下操作:

loaded_pkgs <- search()
library( 'caret' )
meta_data <- myfun( package = "caret")
unload_pkgs <- setdiff( search(), loaded_pkgs )
for( i in unload_pkgs ) { 
  detach( pos = which( search() %in% i ) ) 
}