你好,我刚开始学习 Scala,正在按照 Udemy 上的教程进行操作。我照着教程写了相同的代码,但运行时却报错了,我看不懂这个错误的含义。
这是我的代码
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.SparkSession
import org.apache.log4j._
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row
// Raise the log level of the "org" logger to ERROR.
Logger.getLogger("org").setLevel(Level.ERROR)
// Obtain the active SparkSession, or create one if none exists yet.
val spark = SparkSession.builder().getOrCreate()
// Load dataset.tsv through the CSV reader with a header row, schema inference and a
// tab delimiter, then replace the "subject" column with its value split on " ".
val data = spark.read.option("header","true").
option("inferSchema","true").
option("delimiter","\t").
format("csv").
load("dataset.tsv").
withColumn("subject", split($"subject", " "))
// NOTE(review): the parenthesis after data("label") closes `select` too early, so
// `.as("label")` is applied to the resulting Dataset and the outer parentheses build a
// Tuple2 of (Dataset[Row], ColumnName) instead of a two-column DataFrame.
// Intended form: data.select(data("label").as("label"), $"subject")
val logRegDataAll = (data.select(data("label")).as("label"),$"subject")
// Fails to compile as written: `na` is a Dataset member, and logRegDataAll is a tuple.
val logRegData = logRegDataAll.na.drop()
运行后出现了如下错误:
scala> :load LogisticRegression.scala
Loading LogisticRegression.scala...
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.SparkSession
import org.apache.log4j._
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@1efcba00
data: org.apache.spark.sql.DataFrame = [label: string, subject: array<string>]
logRegDataAll: (org.apache.spark.sql.Dataset[org.apache.spark.sql.Row], org.apache.spark.sql.ColumnName) = ([label: string],subject)
<console>:43: error: value na is not a member of (org.apache.spark.sql.Dataset[org.apache.spark.sql.Row], org.apache.spark.sql.ColumnName)
val logRegData = logRegDataAll.na.drop()
^
感谢您的帮助
答案 0 :(得分:0)
你可以清楚地看到
val logRegDataAll = (data.select(data("label")).as("label"),$"subject")
返回
(org.apache.spark.sql.Dataset[org.apache.spark.sql.Row], org.apache.spark.sql.ColumnName)
所以问题出在括号上:data("label")) 这里多了一个右括号 ),使 select 提前结束,.as("label") 作用在了整个 Dataset 上,而最外层的括号又把它和 $"subject" 组成了一个元组,元组自然没有 na 成员。
实际上应该写成 data.select(data("label").as("label"),$"subject")
。