当添加具有较少列时,附加到现有SQLite表,而不将数据库读入R

时间:2013-11-09 15:33:07

标签: sql r sqlite rsqlite

是否有一些简单的方法,无论是在SQL端还是在R端,都可以将data.frame附加到具有更多列的现有表中?缺少的列应该只用NA填充。如果它优雅地处理具有比表1更多列的表2,则可以获得奖励积分吗?

library(RSQLite)

# Create
db <- dbConnect( SQLite(), dbname="~/temp/test.sqlite" )

# Write test
set.seed(1)
n <- 1000
testDat <- data.frame(key=seq(n), x=runif(n),y=runif(n),g1=sample(letters[1:10],n,replace=TRUE),g2=rep(letters[1:10],each=n/10),g3=factor( sample(letters[1:10],n,replace=TRUE) ))
if(dbExistsTable(db,"test")) dbRemoveTable(db,"test")
dbWriteTable( conn = db, name = "test", value = testDat, row.names=FALSE )
testDat2 <- data.frame( key=seq(n+1,n+100), x=runif(100) )
> dbWriteTable( conn = db, name="test", value = testDat2, row.names=FALSE, append=TRUE  )
[1] FALSE
Warning message:
In value[[3L]](cond) :
  RS-DBI driver: (error in statement: table test has 6 columns but 2 values were supplied)

我也可以设想一个包装器。该算法看起来像:

  1. 从现有SQL表中读取1行。
  2. 从该读取中获取列名。
  3. 使用未包含的名称向data.frame添加列名;充满了失误。
  4. dbWriteTable现在data.frame与SQL表具有相同的列。
  5. SQLite具有可变类这一事实简化了这一点。但如果它已经存在,我宁愿不重新发明轮子。

    修改

    简单说明一下:这些数据集很大。 SQL数据库大约为30GB,而data.frame(实际上是data.table,显而易见的原因)约为4GB。因此,需要将SQL表读入R的解决方案是非启动程序。

    另一种算法是在SQL中执行:

    1. 将R data.frame写入临时SQL表。
    2. SQL魔术将该表附加到主SQL表上。
    3. 删除临时SQL表。
    4. 利润。

3 个答案:

答案 0 :(得分:1)

一种解决方案是使用dbSendQuery以1000行的形式读取一个表,并将其添加到另一个表中(添加必需的列)。

res <- dbSendQuery(con, "SELECT * from tests")
while(!dbHasCompleted(res)) {
  data <- fetch(res, n = 1000);
  ### Put the data in the other table
}

还有一种方法可以在单个SQLite查询中执行此操作。如果您知道要添加哪些列(填充NULL),SQL查询将如下所示:

INSERT INTO target_table SELECT col1,NULL,col2,col3,NULL,NULL,col4 FROM source_table

答案 1 :(得分:0)

来自rbind.fill

plyr提供了一种从R中进行此连接的好方法:

require(plyr)
X <- rbind.fill(testDat, stDat2)

不确定这完全回答了你的问题,因为看起来你想在连接端做追加。

答案 2 :(得分:0)

解决方案也涉及因素:

#' Function to return column names from a SQLite database
#' @param conn An RSQLite connection to a database
#' @param name Character string giving the name of the table you want column names for
#' @export dbGetColnames
#' @return Character vector of column names
dbGetColnames <- function(conn, name) {
  x <- dbGetQuery( conn, paste0("SELECT sql FROM sqlite_master WHERE tbl_name = '",name,"' AND type = 'table'") )[1,1]
  x <- sub( "^.*\\((.+)\\).*$", "\\1", x )
  x <- str_split(x,",")[[1]]
  x <- gsub('[\t\n"]','', x)
  x <- gsub('^ *','', x)
  vapply( str_split( x ," " ), first, "" )
}

#' Write a table via RSQLite with factors stored in another table
#' Handles data.tables efficiently for large datasets
#' @param conn The connection object (created with e.g. dbConnect)
#' @param name The name of the table to write
#' @param value The data.frame to write to the database
#' @param factorName The base name of the tables to store the factor labels in in the SQLite database (e.g. if factorName is "_factor_" and the data.frame in value contains a factor column called "color" and the name is "mytable" then dbWriteFactorTable will create a table called mytable_factor_color which will store the levels information)
#' @param append a logical specifying whether to append to an existing table in the DBMS.
#' @param \dots Options to pass along to dbWriteTable (e.g. append=TRUE)
#' @return A boolean indicating whether the table write was successful
#' @export dbWriteFactorTable
#' @examples
#' library(RSQLite)
#' load_all( file.path(.db,"R-projects","taRifx") )
# Create
#' dbFilename <- tempfile()
#' db <- dbConnect( SQLite(), dbname=dbFilename )
# Write test
#' set.seed(1)
#' n <- 1000
#' testDat <- data.frame(key=seq(n), x=runif(n),y=runif(n),g1=sample(letters[1:10],n,replace=TRUE),g2=rep(letters[1:10],each=n/10),g3=factor( sample(letters[1:10],n,replace=TRUE) ))
#' if(dbExistsTable(db,"test")) dbRemoveTable(db,"test")
#' dbWriteTable( conn = db, name = "test", value = testDat, row.names=FALSE )
#' testDat2 <- data.frame( key=seq(n+1,n+100), x=runif(100) )
#' dbWriteTable( conn = db, name="test", value = testDat2, row.names=FALSE, append=TRUE  )
# Read test
#' testRecovery <- dbGetQuery(db, "SELECT * FROM test")
#' testSelection <- dbGetQuery(db, "SELECT * FROM test WHERE g3=='h' OR g3=='e' ")
#' testSelection
# Test removing rows matching criteria
#' for(i in 1:10) dbWriteTable( conn = db, name = "test", value = testDat, row.names=FALSE, append=TRUE )
#' dbSendQuery( db, "DELETE FROM test WHERE g3=='a'" )
#' # Test factor conversion
#' testDat <- data.frame(key=seq(n), x=runif(n),y=runif(n),g1=sample(letters[1:10],n,replace=TRUE),g2=rep(letters[1:10],each=n/10),g3=factor( sample(letters[1:10],n,replace=TRUE) ))
#' if(dbExistsTable(db,"test")) dbRemoveTable(db,"test")
#' if(dbExistsTable(db,"test_factor_g3")) dbRemoveTable(db,"test_factor_g3")
#' dbWriteFactorTable( conn = db, name = "test", value = testDat, row.names=FALSE )
#' dbGetQuery(db, "SELECT * FROM test")
#' dbGetQuery(db, "SELECT * FROM test_factor_g3")
#' testDat$g3 <- factor( sample(letters[6:15],n,replace=TRUE) )
#' dbWriteFactorTable( conn = db, name = "test", value = testDat, row.names=FALSE, append=TRUE )
#' dbGetQuery(db, "SELECT * FROM test_factor_g3")
#' if(dbExistsTable(db,"test")) dbRemoveTable(db,"test")
#' dbWriteFactorTable( conn = db, name = "test", value = as.data.table(testDat), row.names=FALSE )
#' dbReadFactorTable( conn = db, name = "test" )
#' dbReadFactorTable( conn = db, name = "test", query="WHERE g3=='a'" )
#' # -- Test merging of tables where the columns don't line up -- #
#' set.seed(1)
#' n <- 1000
#' testDat <- data.frame(key=seq(n), x=runif(n),y=runif(n),g1=sample(letters[1:10],n,replace=TRUE),g2=rep(letters[1:10],each=n/10),g3=factor( sample(letters[1:10],n,replace=TRUE) ))
#' if(dbExistsTable(db,"test")) dbRemoveTable(db,"test")
#' dbWriteFactorTable( conn = db, name = "test", value = testDat, row.names=FALSE )
#' dbGetQuery( db, "SELECT * FROM test" )
#' # Add a table with columns that are a subset of the SQL table
#' testDat2 <- data.frame( key=seq(n+1,n+100), y=runif(100) )
#' dbWriteFactorTable( conn = db, name="test", value = testDat2, row.names=FALSE, append=TRUE  )
#' dbGetQuery( db, "SELECT * FROM test" )
#' # Add a table where the columns are a superset of the SQL table's
#' testDat3 <- data.frame( key=seq(n+101,n+200), x=runif(100), n=runif(100) )
#' dbWriteFactorTable( conn = db, name="test", value = testDat3, row.names=FALSE, append=TRUE  )
#' dbGetQuery( db, "SELECT * FROM test" )
#' # Finish up
#' dbDisconnect(db) # close connection
#' unlink( dbFilename ) # delete tempfile
dbWriteFactorTable <- function( conn, name, value, factorName="_factor_", append=FALSE, ... ) {
  require(RSQLite)
  # Test inputs
  stopifnot(class(conn)=="SQLiteConnection")
  stopifnot(class(name)=="character")
  stopifnot("data.frame" %in% class(value))
  stopifnot(class(factorName)=="character")
  if( grepl("[.]",factorName) ) stop("factorName must use valid characters for SQLite")
  if( "data.table" %in% class(value) )  {
    dt <- TRUE # Is value a data.table, if so use more efficient methods
  } else {
    dt <- FALSE
  }
  # Convert factors to character
  factorCols <- names( Filter( function(x) x=="factor", vapply( value, class, "" ) ) )
  if(length(factorCols>0)) {
    for( cl in which( colnames(value) %in% factorCols ) ) {
      cn <- colnames(value)[cl]
      factorTable <- data.frame( levels=levels(value[[ cn ]]) )
      factorTable$levelKey <- seq(nrow(factorTable))
      fctNm <- paste0(name,factorName,cn)
      fctTableExists <- dbExistsTable( conn = conn , name = fctNm)
      # Write out the factor table
      if( append & fctTableExists ) {
        oldFactorTable <- dbGetQuery( conn = conn, paste("SELECT levelKey, levels FROM",fctNm) )
        levelExists <- factorTable$levels %in% oldFactorTable$levels
        if(!all(levelExists)) {
          startLevelKey <- max( oldFactorTable$levelKey ) + 1
          addLevels <- factorTable$levels[!levelExists]
          newFactorTable <- data.frame( 
            levels = addLevels,
            levelKey = seq( startLevelKey, startLevelKey + length(addLevels) - 1 )
          )
          dbWriteTable( conn = conn, name = fctNm, value = newFactorTable, row.names = FALSE, append = TRUE )
        } # If all levels exist, don't update the table -- go straight to converting the factor to character
      } else {
        if(fctTableExists) {
          warning(paste("Append set to FALSE but the factor table named",fctNm,"exists. Deleting."))
          dbRemoveTable( conn=conn, name = fctNm )
        }
        dbWriteTable( conn = conn, name = fctNm, value = factorTable, row.names = FALSE )
      }
      # Convert variable cl to character in the main data.frame (value) that we'll write to the main SQL table
      if( dt )  set( x=value, j=cl, value=as.character(value[[ cn ]]) )
    }
    if( !dt )  value <- japply( value, which( colnames(value) %in% factorCols ), as.character )
  } else {
    #warning("No factor columns detected.")
  }
  if( append ) {
    # If we're appending, check that the number of columns of the new table is equal to the number of columns of the old table
    # Only run this code if we're appending, because otherwise the table won't exist
    sqlColnames <- dbGetColnames( conn, name )
    colnamesSubset <- !all( sqlColnames %in% colnames(value) )
    colnamesSuperset <- !all( colnames(value) %in% sqlColnames )
    if( colnamesSuperset ) {
      addCols <- colnames(value)[ !colnames(value) %in% sqlColnames ]
      for( ac in addCols ) {
        warning(paste("Adding column",ac,"to SQL table"))
        dbSendQuery( conn,
                     paste(
                       "ALTER TABLE",
                       name,
                       "ADD COLUMN",
                       ac,
                       "DEFAULT NULL"
                     )
        )
      }
    } # If it's a superset but not a subset, then we're done (allow it to return back to the second if where it just writes value directly)
    if( colnamesSubset ) {
      # Write our database to a temporary table
      tempTableName <- "temp_dbWriteFactorTable"
      if(dbExistsTable(conn,tempTableName))  dbRemoveTable(conn,tempTableName)
      dbWriteTable( conn = conn, name=tempTableName, value = value, row.names=FALSE, append=FALSE  )
      # Add any columns to input data.frame that are in target table, then merge
      sqlColnames <- dbGetColnames( conn, name ) # Reset these now that we've possibly tinkered with them in the superset section
      dfColnames <- sqlColnames
      dfColnames[ !sqlColnames %in% colnames(value) ] <- "null"
      status <- dbSendQuery( conn, 
                             paste( 
                               "INSERT INTO", name, 
                               "(",paste(sqlColnames,collapse=","),")",
                               "SELECT",
                               paste( dfColnames, collapse="," ),
                               "FROM",
                               tempTableName
                             )
      )
      # Remove temporary table
      dbRemoveTable(conn,tempTableName)
    }
  } 
  if( !append || (append & !colnamesSubset) ) { # Either we're not appending, or the columns in the input and target tables exactly match (possibly after we added columns with the superset code)
    status <- dbWriteTable( conn = conn, name = name, value = value, append=append, ... )
  }
  return( status )
}

#' Read a table via RSQLite with factors stored in another table
#' @param conn The connection object (created with e.g. dbConnect)
#' @param name The name of the table to read
#' @param query A character string containing sequel statements to be appended onto the query (e.g. "WHERE x==3")
#' @param dt Whether to return a data.table vs. a plain-old data.frame
#' @param factorName The base name of the tables to store the factor labels in in the SQLite database (e.g. if factorName is "_factor_" and the data.frame in value contains a factor column called "color" and the name is "mytable" then dbWriteFactorTable will expect there to be a table called mytable_factor_color which holds the levels information)
#' @param \dots Options to pass along to dbGetQuery
#' @return A data.table or data.frame
#' @export dbReadFactorTable
dbReadFactorTable <- function( conn, name, query="", dt=TRUE, factorName="_factor_", ... ) {
  require(RSQLite)
  # Test inputs
  stopifnot(class(conn)=="SQLiteConnection")
  stopifnot(class(name)=="character")
  stopifnot(class(factorName)=="character")
  if( grepl("[.]",factorName) ) stop("factorName must use valid characters for SQLite")
  # Read main table
  if( dt ) {
    value <- as.data.table( dbGetQuery( conn, paste("SELECT * FROM",name,query), ... ) )
  } else {
    value <- dbGetQuery( conn, paste("SELECT * FROM",name,query), ... )
  }
  # Convert factors to character
  factorCols <- sub( paste0("^.*",name,factorName,"(.+)$"), "\\1", 
                     Filter( Negate(is.na), 
                             str_extract( dbListTables( conn ), paste0(".*",name,factorName,".*") ) 
                     )
  )
  if( length(factorCols>0) ) {
    for( cn in factorCols ) {
      fctNm <- paste0(name,factorName,cn)
      factorTable <- dbGetQuery( conn, paste0("SELECT * FROM ",fctNm) )
      factorLevels <- factorTable$levels[ order( factorTable$levelKey ) ] # sort by levelKey so we maintain a consistent reference category (SQL databases don't guarantee the row order remains the same)
      if( dt ) {
        cl <- which( colnames(value) %in% cn )
        set( x=value, j=cl, value=factor( value[[ cn ]], levels=factorLevels ) )
      } else {
        value[[ cn ]] <- factor( value[[ cn ]], levels=factorLevels )
      }
    }
  } else {
    #warning("No factor columns detected.")
  }
  value
}

我怀疑,这会在某个时刻出现在taRifx。我为解决这个问题而添加的部分是if(colnamesSubset)块。