Error in FUN(X[[2L]], ...): Sorry, parameter type `NA' is ambiguous or not supported

Asked: 2014-08-11 06:59:39

Tags: r hadoop rjava cloudera-cdh rhadoop

I am trying to build a logistic regression model with RHadoop (the rmr2 and rhdfs packages) on an HDFS data file located at "hdfs://:/somnath/merged_train/part-m-00000", using the R script below, and then test the model against the test HDFS data file at "hdfs://:/somnath/merged_test/part-m-00000".

We are on the CDH4 distribution, where YARN/MR2 runs alongside the Hadoop-0.20-backed MR1, and the RHadoop script below is run against the hadoop-0.20 mapreduce and hdfs versions via the Sys.setenv commands shown in the script.

However, whenever I run the script I hit the following error, and I have had almost no luck getting past it. I would appreciate it if someone could point out the likely cause; it appears to stem from an lapply call in R being handed an NA argument it cannot handle.

[root@kkws029 logreg_template]# Rscript logreg_test.R
Loading required package: methods
Loading required package: rJava

HADOOP_CMD=/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce/bin/hadoop

Be sure to run hdfs.init()
14/08/11 11:59:30 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
NULL
NULL
[1] "Starting to build logistic regression model..."
Error in FUN(X[[2L]], ...) :
  Sorry, parameter type `NA' is ambiguous or not supported.
Calls: logistic.regression ... .jrcall -> ._java_valid_objects_list -> lapply -> FUN
Execution halted
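For what it's worth, the traceback (.jrcall -> ._java_valid_objects_list -> lapply) points into rJava's reflection interface, which rejects a bare NA argument because NA has no unambiguous Java parameter type. A minimal sketch of that failure mode in isolation (my own repro attempt, not part of the script below):

library(rJava)
.jinit()
# Any reflection-style call that receives a bare NA argument should fail the
# same way, since rJava cannot map NA to a Java parameter type:
J("java.lang.String")$valueOf(NA)
# Error in FUN(X[[2L]], ...) :
#   Sorry, parameter type `NA' is ambiguous or not supported.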

Below is my R script:

#!/usr/bin/env Rscript


Sys.setenv(HADOOP_HOME="/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce")
Sys.setenv(HADOOP_CMD="/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce/bin/hadoop")


Sys.setenv(HADOOP_BIN="/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce/bin")
Sys.setenv(HADOOP_CONF_DIR="/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce/conf")
Sys.setenv(HADOOP_STREAMING="/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.3.0.jar")
Sys.setenv(LD_LIBRARY_PATH="/usr/lib64/R/library/rJava/jri")


library(rmr2)
library(rhdfs)

# Initialize the JVM via rJava and put the CDH4 Hadoop jars on its classpath
.jinit()
.jaddClassPath("/opt/cloudera/parcels/CDH/lib/hadoop/hadoop-auth-2.0.0-cdh4.3.0.jar")
.jaddClassPath("/opt/cloudera/parcels/CDH/lib/hadoop-hdfs/hadoop-hdfs-2.0.0-cdh4.3.0.jar")
.jaddClassPath("/opt/cloudera/parcels/CDH/lib/hadoop/hadoop-common-2.0.0-cdh4.3.0.jar")


hdfs.init()
rmr.options( backend = "hadoop", hdfs.tempdir = "/tmp" )

# Batch gradient descent for logistic regression: one MapReduce job per iteration
logistic.regression =
        function(hdfsFilePath, iterations, dims, alpha) {
                r.file <- hdfs.file(hdfsFilePath,"r")

                #hdfsFilePath <- to.dfs(hdfsFilePath)

                lr.map =
                  function(.,M) {
                    Y = M[,1]
                    X = M[,-1]
                    keyval(
                        1,
                        Y * X *
                          g(-Y * as.numeric(X %*% t(plane))))}

                lr.reduce =
                  function(k, Z)
                    keyval(k, t(as.matrix(apply(Z,2,sum))))

                plane = t(rep(0, dims))           # initial 1 x dims weight vector
                g = function(z) 1/(1 + exp(-z))   # logistic (sigmoid) function
                for (i in 1:iterations) {
                        gradient =
                                values(
                                        from.dfs(
                                          mapreduce(
                                            input = as.matrix(hdfs.read.text.file(r.file)),
                                            #input = from.dfs(hdfsFilePath),
                                            map = lr.map,   # identical to the inline body it replaces
                                            reduce = lr.reduce,
                                            combine = TRUE)))
                        plane = plane + alpha * gradient

                        #trace(print(plane),quote(browser()))
                 }
                return(plane) }

# Validate the logistic regression model on the test set
logistic.regression.test =
        function(hdfsFilePath, weight) {
                r.file <- hdfs.file(hdfsFilePath,"r")
                lr.test.map =
                        function(.,M) {
                          keyval(
                             1,
                             lapply(as.numeric(M[,-1] %*% t(weight)),function(z) 1/(1 + exp(-z))))}

                probabilities =
                     values(
                             from.dfs(
                                mapreduce(
                                  input = as.matrix(hdfs.read.text.file(r.file)),
                                  map = lr.test.map
                        )))
        return(probabilities) }

out = list()
prob = list()
rmr.options( backend = "hadoop", hdfs.tempdir = "/tmp" )

print("Starting to build logistic regression model...")

  out[['hadoop']] =
## @knitr logistic.regression-run
    logistic.regression(
       "hdfs://XX.XX.XX.XX:NNNN/somnath/merged_train/part-m-00000", 5, 5, 0.05)
  write.csv(as.vector(out[['hadoop']]), "/root/somnath/logreg_data/weights.csv")


print("Building logistic regression model completed.")


  prob[['hadoop']] =
    logistic.regression.test(
       "hdfs://XX.XX.XX.XX:NNNN/somnath/merged_test/part-m-00000", out[['hadoop']])
  write.csv(as.vector(prob[['hadoop']]), "/root/somnath/logreg_data/probabilities.csv")


stopifnot(
  isTRUE(all.equal(out[['local']], out[['hadoop']], tolerance = 1E-7)))
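For comparison, rmr2's mapreduce normally takes a plain HDFS path string (or an object returned by to.dfs) as input, rather than the client-side result of rhdfs calls such as hdfs.file()/hdfs.read.text.file(). A minimal sketch of that style, reusing the names from the script above (assuming the file is plain CSV and lr.map/lr.reduce are as defined):

# Sketch only: hand mapreduce the HDFS path and a CSV input format directly,
# instead of wrapping the file with hdfs.file()/hdfs.read.text.file().
gradient <- values(from.dfs(
        mapreduce(
                input = hdfsFilePath,   # plain "hdfs://..." path string
                input.format = make.input.format("csv", sep = ","),
                map = lr.map,
                reduce = lr.reduce,
                combine = TRUE)))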

Note: I have set the following Hadoop-related environment variables in root's ~/.bash_profile, as shown below.

# Hadoop-specific environment and commands

export HADOOP_HOME=/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce
export HADOOP2_HOME=/opt/cloudera/parcels/CDH/lib/hadoop
#export HADOOP_CMD=${HADOOP_HOME}/bin/hadoop
#export HADOOP_STREAMING=/opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar
#export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop

export LD_LIBRARY_PATH=${R_HOME}/library/rJava/jri #:${HADOOP_HOME}/../hadoop-0.20-mapreduce/lib/native/Linux-amd64-64


# Add hadoop-common jar to classpath for PlatformName and FsShell classes; Add hadoop-auth and hadoop-hdfs jars

export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:${HADOOP2_HOME}/client-0.20/* #:${HADOOP_HOME}/*.jar:${HADOOP_HOME}/lib/*.jar:${HADOOP2_HOME}/hadoop-common-2.0.0-cdh4.3.0.jar:${HADOOP_HOME}/../hadoop-hdfs/hadoop-hdfs-2.0.0-cdh4.3.0.jar:${HADOOP_HOME}/hadoop-auth-2.0.0-cdh4.3.0.jar:$HADOOP_STREAMING

PATH=$PATH:$R_HOME/bin:$JAVA_HOME/bin:$LD_LIBRARY_PATH:/opt/cloudera/parcels/CDH/lib/mahout:/opt/cloudera/parcels/CDH/lib/hadoop:/opt/cloudera/parcels/CDH/lib/hadoop-hdfs:/opt/cloudera/parcels/CDH/lib/hadoop-mapreduce:/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce:/var/lib/storm-0.9.0-rc2/lib #:$HADOOP_CMD:$HADOOP_STREAMING:$HADOOP_CONF_DIR

export PATH
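As a quick sanity check (my own addition, not in the original script), the values that rmr2 and rhdfs will actually see can be inspected from R before the packages are loaded:

# Print the Hadoop-related variables exactly as the R process sees them
Sys.getenv(c("HADOOP_CMD", "HADOOP_STREAMING", "HADOOP_CONF_DIR", "LD_LIBRARY_PATH"))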

Sample training dataset

0,-4.418,-2.0658,1.2193,-0.68097,0.90894
0,-2.7466,-2.9374,-0.87562,-0.65177,0.53182
0,-0.98846,0.66962,-0.20736,-0.2895,0.002313
0,-2.277,2.492,0.47936,0.4673,-1.5075
0,-5.4391,1.8447,-1.6843,1.465,-0.71099
0,-0.12843,0.066968,0.02678,-0.040851,0.0075902
0,-2.0796,2.4739,0.23472,0.86423,0.45094
0,-3.1796,-0.15429,1.4814,-0.94316,-0.52754
0,-1.9429,1.3111,0.31921,-1.202,0.8552
0,-2.3768,1.9301,0.096005,-0.51971,-0.17544
0,-2.0336,1.991,0.82029,0.018232,-0.33222
0,-3.6388,-3.2903,-2.1076,0.73341,0.75986
0,-2.9146,0.53163,0.49182,-0.38562,-0.76436
0,-3.3816,1.0954,0.25552,-0.11564,-0.01912
0,-1.7374,-0.63031,-0.6122,0.022664,0.23399
0,-1.312,-0.54935,-0.68508,-0.072985,0.036481
0,-3.991,0.55278,0.38666,-0.56128,-0.6748
....

Sample test dataset

0,-0.66666,0.21439,0.041861,-0.12996,-0.36305
0,-1.3412,-1.1629,-0.029398,-0.13513,0.49758
0,-2.6776,-0.40194,-0.97336,-1.3355,0.73202
0,-6.0203,-0.61477,1.5248,1.9967,2.697
0,-4.5663,-1.6632,-1.2893,-1.7972,1.4367
0,-7.2339,2.4589,0.61349,0.39094,2.19
0,-4.5683,-1.3066,1.1006,-2.8084,0.3172
0,-4.1223,-1.5059,1.3063,-0.18935,1.177
0,-3.7135,-0.26283,1.6961,-1.3499,-0.18553
0,-2.7993,1.2308,-0.42244,-0.50713,-0.3522
0,-3.0541,1.8173,0.96789,-0.25138,-0.36246
0,-1.1798,1.0478,-0.29168,-0.26261,-0.21527
0,-2.6459,2.9387,0.14833,0.24159,-2.4811
0,-3.1672,2.479,-1.2103,-0.48726,0.30974
1,-0.90706,1.0157,0.32953,-0.11648,-0.47386
...
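For reference, a small local sketch (no Hadoop; the weights are placeholders of my own) of what lr.test.map computes per test row: drop the label column, multiply by the weight vector, and apply the sigmoid:

row <- c(0, -0.66666, 0.21439, 0.041861, -0.12996, -0.36305)  # first test row above
weight <- t(rep(0.1, 5))    # placeholder 1 x 5 weight vector, illustration only
z <- as.numeric(row[-1] %*% t(weight))
1 / (1 + exp(-z))           # predicted probability for this row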

0 Answers:

No answers yet.