当我通过JDBC将R脚本中的记录插入Teradata数据库时,遇到了性能问题。插入100,000条记录大约需要100秒,而我总共需要插入大约1000万条记录,这个速度实在太慢了。
有人知道如何提高插入操作的性能吗?
我按以下方式设置连接:
# Configure the JVM and register the Teradata JDBC driver.
# java.parameters is set before RJDBC/rJava is loaded so that the heap limit
# is in place when the JVM is started.
options(java.parameters = "-Xmx8048m")
options("encoding") # only queries the current setting; it changes nothing
library(RJDBC)
.jinit()

# path to the JDBC driver:
path.jdbcdriver <- paste0(support_folder, "jdbc/")
cat("\n JDBC driver Path:", path.jdbcdriver)
jar.files <- paste0(path.jdbcdriver, c("terajdbc4.jar", "tdgssconfig.jar"))

# Join the jars with the platform's class-path separator (";" on Windows,
# ":" on unix-like systems) instead of a hard-coded ";".
class.jdbcdriver <- paste(jar.files, collapse = .Platform$path.sep)

# Also add both jars to the class path of the already running JVM.
.jaddClassPath(jar.files)
#.jclassPath() # check class path

# extra parameters for JDBC driver connection
# NOTE(review): rewriteBatchedStatements does not look like a Teradata JDBC
# parameter (it is known from other drivers) -- confirm and remove if unused.
params.jdbc <- "FINALIZE_AUTO_CLOSE=ON,TMODE=DEFAULT, LOB_SUPPORT=OFF, CHARSET=UTF8, rewriteBatchedStatements=true"

# create driver object
driver <- JDBC(driverClass = "com.teradata.jdbc.TeraDriver", classPath = class.jdbcdriver)
之后,我使用预编译语句(PreparedStatement)。如下所示,我创建每批10,000条记录的批次并提交它们。如果我尝试将批量大小增加到例如50,000,则会失败,并抛出内存不足异常。
insert语句的输入来自data.table,我使用此data.table中的3列:myinsert(df[n,1], df[n,11], as.numeric(df[n,7]))
如果出现异常,我只打印第一个异常,否则整批(即10,000条记录)的异常都会被打印出来。这就是我在异常处理中注释掉while循环的原因。
# Text of a "database <name>" command for the configured target database.
target_db <- paste("database", database, sep = " ")
# Bind one row to the prepared insert statement `ps` and queue it in the
# current JDBC batch.
#
# arg1, arg11: values for placeholders 1 and 2, coerced to integer.
# arg7:        value for placeholder 3, coerced to double.
#
# Missing values (NA) are bound as SQL NULL via setNull(); passing them to
# setInt()/setDouble() would hand the raw NA bit pattern to Java instead.
# Relies on `ps` (a java.sql.PreparedStatement reference) being defined in
# the calling environment. Nothing is sent to the database until
# executeBatch() is called on `ps`.
myinsert <-
  function(arg1, arg11, arg7) {
    # java.sql.Types constants expected by PreparedStatement.setNull()
    sql_type_integer <- 4L
    sql_type_double <- 8L

    bind_int <- function(index, value) {
      value <- as.integer(value)
      if (is.na(value)) {
        .jcall(ps, "V", "setNull", index, sql_type_integer)
      } else {
        .jcall(ps, "V", "setInt", index, value)
      }
    }

    bind_double <- function(index, value) {
      # setDouble() needs an R double; an integer input would not match
      value <- as.numeric(value)
      if (is.na(value)) {
        .jcall(ps, "V", "setNull", index, sql_type_double)
      } else {
        .jcall(ps, "V", "setDouble", index, value)
      }
    }

    bind_int(1L, arg1)
    bind_int(2L, arg11)
    bind_double(3L, arg7)
    .jcall(ps, "V", "addBatch")
  }
# Open the Teradata connection and prepare the parameterised insert statement.
#conn <- dbConnect(driver, paste0('jdbc:teradata://', td.ip, '/', params.jdbc,', ','TYPE=FASTLOAD'), td.uid, td.pwd)
conn <- dbConnect(driver, paste0("jdbc:teradata://", td.ip, "/", params.jdbc), td.uid, td.pwd)

# JDBC connections start in auto-commit mode. Switch it off so that the
# explicit commit/rollback calls issued per batch actually delimit the
# transactions instead of every statement being committed on its own.
.jcall(conn@jc, "V", "setAutoCommit", FALSE)

## prepare
# Only three columns are bound per row (the "?" placeholders); every other
# column value is constant for the whole load and is written into the SQL text.
insert_stmt <- paste0(
  "insert into ", database, ".table_name_x values(?,2,?,2,",
  reporting_year, ",", knowledge_year, ",", bag_view_id, ",-11,?,",
  delivery_type_id, ",-11,", version_id, ")"
)
ps <- .jcall(conn@jc, "Ljava/sql/PreparedStatement;", "prepareStatement", insert_stmt)
# apply & commit
# Queue every row of `df` on the prepared statement `ps`, send the queued rows
# with executeBatch() at fixed intervals, and commit (or roll back on error)
# after each batch.

# NOTE(review): batchsize only acts as the minimum number of rows required to
# start loading; the number of rows per JDBC batch is rows_per_batch.
batchsize <- 100
rows_per_batch <- 10000L  # rows sent per executeBatch() call
progress_every <- 100000L # print a progress line every this many rows

# Render a Java Throwable's stack trace as an R string so it can be printed
# in Rterm AND Rgui.
stack_trace_text <- function(throwable) {
  sw <- .jnew("java/io/StringWriter")
  pw <- .jnew("java/io/PrintWriter", .jcast(sw, "java/io/Writer"), TRUE)
  .jcall(throwable, "V", "printStackTrace", pw)
  .jcall(sw, "Ljava/lang/String;", "toString")
}

n_rows <- nrow(df)
cat("\n# of records to be loaded: ", n_rows)
cat("\n* batch creation is running ...\n")

if (n_rows >= batchsize) {
  # Extract the three bound columns once; this avoids the per-row
  # data.table/data.frame subsetting overhead of df[n, j] inside the loop.
  col_1 <- df[[1L]]
  col_11 <- df[[11L]]
  col_7 <- as.numeric(df[[7L]])

  for (n in seq_len(n_rows)) {
    myinsert(col_1[n], col_11[n], col_7[n])

    is_last_row <- n == n_rows
    if ((n %% rows_per_batch) == 0 || is_last_row) {
      if ((n %% progress_every) == 0 || is_last_row) {
        print(paste0("Execute Batch ", n, "; Time: ", Sys.time()))
      }

      # check = FALSE: do not turn a Java exception into an R error here;
      # it is fetched and reported manually below.
      .jcall(ps, "[I", "executeBatch", check = FALSE)
      ex <- .jgetEx() # save exceptions from PreparedStatement.executeBatch()
      .jclear() # clear all pending exceptions

      if (!is.jnull(ex)) {
        # Only the first exception is reported; walking the whole chain
        # would print one entry per failed row of the batch.
        trace_text <- stack_trace_text(ex)
        if (ex %instanceof% "java.sql.BatchUpdateException") {
          # per-row update counts of the failed batch
          print(.jcall(ex, "[I", "getUpdateCounts"))
        }
        cat(trace_text) # print the error message and stack trace

        # Roll back the failed batch. The Java connection lives in conn@jc;
        # the RJDBC connection object itself cannot be passed to .jcall().
        .jcall(conn@jc, "V", "rollback")
        # capture chained JDBC SQLWarning messages from Connection.rollback()
        w <- .jcall(conn@jc, "Ljava/sql/SQLWarning;", "getWarnings")
        while (!is.jnull(w)) { # loop thru chained warnings
          cat(stack_trace_text(w)) # print the warning message and stack trace
          w <- w$getNextWarning()
        }
      } else {
        .jcall(conn@jc, "V", "commit")
      }
    }
  }
}

dbCommit(conn)
.jcall(ps, "V", "close")
cat("\nDatabase commit finished")
我使用参数rewriteBatchedStatements = true,但实际上它没有帮助,因为Teradata JDBC中没有这样的JDBC参数。
我无法使用参数FASTLOAD,因为该表不为空