我用 pyspark.ml.feature 中的三个组件(Tokenizer、CountVectorizer、IDF)构建了一条管道。第一次运行时一切正常,但第二次运行时报错:Py4JJavaError: An error occurred while calling o175.fit(调用 o175.fit 时发生错误)。有谁知道这个错误的原因是什么?
import findspark
findspark.init()
import pyspark.sql.types as typ
import pyspark as ps
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
import warnings
from pyspark.sql import SQLContext
# Start (or reuse) a local Spark session that runs with two worker threads.
sparkSession = (
    SparkSession.builder
    .master("local[2]")
    .appName("Pyspark Sentiment")
    .getOrCreate()
)

# Read the microblog CSV: the first row is treated as the header and the
# column types are inferred from the data.
df = sparkSession.read.load(
    'data/Microblog_Trialdata.csv',
    format='com.databricks.spark.csv',
    header='true',
    inferSchema='true',
)

# Keep only the two columns of interest; "spans" is the text column that the
# tokenizer consumes further down.
df = df.select("sentiment score", "spans")

# 60/20/20 train/validation/test split with a fixed seed.
train_set, val_set, test_set = df.randomSplit([0.6, 0.2, 0.2], seed=42)
from pyspark.ml.feature import HashingTF, IDF, Tokenizer ,CountVectorizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
# Feature pipeline: raw text -> tokens -> term counts -> TF-IDF features.

# Split the "spans" text column into a list of tokens stored in "words".
tokenizer = Tokenizer(inputCol="spans", outputCol="words")

# Turn the token lists into term-count vectors over a vocabulary capped at
# 2**11 (2048) terms.
CV = CountVectorizer(vocabSize=2**11, inputCol="words", outputCol='cv_')

# minDocFreq=5 removes sparse terms: a term must occur in at least five
# documents to contribute to the IDF-weighted "features" vector.
idf = IDF(inputCol='cv_', outputCol="features", minDocFreq=5)

# Leftover debugging aid for inspecting the fitted vocabulary on its own:
#model=CV.fit(data)
#vo=model.vocabulary
#print(type(vo))

pipeline = Pipeline(stages=[tokenizer, CV, idf])

# Fit on the training split only, then apply the same fitted model to both
# the training and validation splits.
pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)

# Preview the raw count vectors and the fully transformed training rows.
train_df.select("cv_").show(5, truncate=False)
train_df.show(5)
答案 0 :(得分:0)
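出现在 val_set 中但在 train_set 中没有见过的单词可能导致该错误。回答者认为 CountVectorizer 具有 handleInvalid 选项可以解决此问题。(注意:PySpark 的 CountVectorizer 实际上并没有 handleInvalid 参数,该参数属于 StringIndexer 等转换器;拟合后的 CountVectorizerModel 在 transform 时本来就会忽略词表之外的单词。)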
# CountVectorizer accepts no `handleInvalid` keyword (passing one raises a
# TypeError); the fitted model already ignores tokens that are not in its
# vocabulary at transform time, so no extra option is needed for unseen words.
CV = CountVectorizer(vocabSize=2**11, inputCol="words", outputCol='cv_')