I want to experiment with Spark ML in Eclipse, and first I need to do some data manipulation. The code below shows that step.
My code:
package org.test.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }
import org.apache.spark.ml.{ Pipeline, PipelineStage }
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._

object DataTest {
  import scala.reflect.runtime.universe.TypeTag

  case class Credit(
    creditability: Double,
    balance: Double, duration: Double, history: Double, purpose: Double, amount: Double,
    savings: Double, employment: Double, instPercent: Double, sexMarried: Double, guarantors: Double,
    residenceDuration: Double, assets: Double, age: Double, concCredit: Double, apartment: Double,
    credits: Double, occupation: Double, dependents: Double, hasPhone: Double, foreign: Double
  )

  def main(args: Array[String]) = {
    // Start the Spark context
    val conf = new SparkConf()
      .setAppName("DataTest")
      .setMaster("local")
    val sc = new SparkContext(conf)
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext.implicits._

    // Function to create a Credit instance from an Array[Double]
    def parseCredit(line: Array[Double]): Credit = {
      Credit(
        line(0),
        line(1) - 1, line(2), line(3), line(4), line(5),
        line(6) - 1, line(7) - 1, line(8), line(9) - 1, line(10) - 1,
        line(11) - 1, line(12) - 1, line(13), line(14) - 1, line(15) - 1,
        line(16) - 1, line(17) - 1, line(18) - 1, line(19) - 1, line(20) - 1
      )
    }

    // Function to transform an RDD of comma-separated Strings into an RDD of Array[Double]
    def parseRDD(rdd: RDD[String]): RDD[Array[Double]] = {
      rdd.map(_.split(",")).map(_.map(_.toDouble))
    }

    val creditDF = parseRDD(sc.textFile("germancredit.csv")).map(parseCredit).toDF().cache()
    creditDF.registerTempTable("credit")
    creditDF.printSchema
    creditDF.show

    creditDF.groupBy("creditability").avg("balance").show
    sqlContext.sql("SELECT creditability, avg(balance) as avgbalance, avg(amount) as avgamt, avg(duration) as avgdur FROM credit GROUP BY creditability").show

    // Define the feature columns to put in the feature vector
    val featureCols = Array("balance", "duration", "history", "purpose", "amount",
      "savings", "employment", "instPercent", "sexMarried", "guarantors",
      "residenceDuration", "assets", "age", "concCredit", "apartment",
      "credits", "occupation", "dependents", "hasPhone", "foreign")

    // Set the input and output column names
    val assembler = new VectorAssembler().setInputCols(featureCols).setOutputCol("features")

    // transform returns a new DataFrame with all feature columns gathered
    // into a single vector column named "features"
    val df2 = assembler.transform(creditDF)
    df2.show

    sc.stop
  }
}
When I run mvn clean install, I get the following:
error: bad symbolic reference. A signature in PipelineStage.class refers to term internal
in package org.apache.spark which is not available.
It may be completely missing from the current classpath, or the version on
the classpath might be incompatible with the version used when compiling PipelineStage.class.
    val assembler = new VectorAssembler().setInputCols(featureCols).setOutputCol("features")
The problem seems to occur at the call val assembler = new VectorAssembler().setInputCols(featureCols).setOutputCol("features").
When I run mvn clean package, I get:
Failed to execute goal org.scala-tools:maven-scala-plugin:2.15.2:compile (default) on project
spark: wrap: org.apache.commons.exec.ExecuteException: Process exited with an error: 1
(Exit value: 1) -> [Help 1]
My pom.xml file:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>org.test</groupId>
  <artifactId>spark</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <pluginRepositories>
    <pluginRepository>
      <id>scala-tools.org</id>
      <name>Scala-tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </pluginRepository>
  </pluginRepositories>

  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>1.6.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.10</artifactId>
      <version>1.6.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-mllib_2.10</artifactId>
      <version>2.0.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-mllib-local_2.10</artifactId>
      <version>2.0.1</version>
    </dependency>
    <dependency>
      <groupId>com.databricks</groupId>
      <artifactId>spark-csv_2.10</artifactId>
      <version>1.2.0</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- mixed scala/java compile -->
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <version>2.15.2</version>
        <executions>
          <execution>
            <id>compile</id>
            <goals>
              <goal>compile</goal>
            </goals>
            <phase>compile</phase>
          </execution>
          <execution>
            <id>test-compile</id>
            <goals>
              <goal>testCompile</goal>
            </goals>
            <phase>test-compile</phase>
          </execution>
          <execution>
            <phase>process-resources</phase>
            <goals>
              <goal>compile</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.6.1</version>
        <configuration>
          <source>1.7</source>
          <target>1.7</target>
        </configuration>
      </plugin>
      <!-- for fatjar -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-assembly-plugin</artifactId>
        <version>2.4</version>
        <configuration>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <executions>
          <execution>
            <id>assemble-all</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <version>3.0.2</version>
        <configuration>
          <archive>
            <manifest>
              <addClasspath>true</addClasspath>
              <mainClass>fully.qualified.MainClass</mainClass>
            </manifest>
          </archive>
        </configuration>
      </plugin>
    </plugins>
    <pluginManagement>
      <plugins>
        <!-- This plugin's configuration is used to store Eclipse m2e settings
             only. It has no influence on the Maven build itself. -->
        <plugin>
          <groupId>org.eclipse.m2e</groupId>
          <artifactId>lifecycle-mapping</artifactId>
          <version>1.0.0</version>
          <configuration>
            <lifecycleMappingMetadata>
              <pluginExecutions>
                <pluginExecution>
                  <pluginExecutionFilter>
                    <groupId>org.scala-tools</groupId>
                    <artifactId>maven-scala-plugin</artifactId>
                    <versionRange>[2.15.2,)</versionRange>
                    <goals>
                      <goal>compile</goal>
                      <goal>testCompile</goal>
                    </goals>
                  </pluginExecutionFilter>
                  <action>
                    <execute></execute>
                  </action>
                </pluginExecution>
              </pluginExecutions>
            </lifecycleMappingMetadata>
          </configuration>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
Any suggestions on how to fix these errors would be greatly appreciated, thanks.
Answer 0 (score: 0)
Make sure everything you import from the org.apache.spark group is at the same version.
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-sql_2.10</artifactId>
  <version>1.6.0</version>
</dependency>
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-mllib_2.10</artifactId>
  <version>2.0.1</version>
</dependency>
Here, some artifacts use version 1.6.0 and others 2.0.1; I believe all the Spark artifacts need to be at the same version.
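As a minimal sketch of that fix (assuming you want to stay on Spark 1.6.0 with Scala 2.10, since spark-core is already pinned there), you could declare the version once as a Maven property and reference it from every Spark artifact. Note that, to my knowledge, spark-mllib-local only appeared with Spark 2.0, so it is dropped here; in the 1.6.x line, spark-mllib already provides the org.apache.spark.ml package:

<properties>
  <spark.version>1.6.0</spark.version>
</properties>

<dependencies>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.10</artifactId>
    <version>${spark.version}</version>
  </dependency>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.10</artifactId>
    <version>${spark.version}</version>
  </dependency>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-mllib_2.10</artifactId>
    <version>${spark.version}</version>
  </dependency>
  <!-- spark-mllib-local is omitted: it did not exist in 1.6.x,
       and spark-mllib already pulls in everything the ml API needs -->
</dependencies>

After editing the pom, running mvn dependency:tree -Dincludes=org.apache.spark should confirm that only one Spark version remains on the classpath.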