Here is my code:
import jieba
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# labelMap (category name -> numeric label), stopWord (a list of stop words),
# and the raw-text RDDs trainData/testData are defined earlier in the script.
def dataPreprocessing(data):
    # Fields are delimited by '`'; field 2 is the category label, and fields
    # 6 and 13 hold the text, which is concatenated and segmented with jieba.
    splitData = data.map(lambda line: line.split('`'))
    getData = splitData.map(lambda line: [labelMap[line[2]],
                                          list(jieba.cut(line[6] + line[13]))])
    return getData

trainSql = sqlContext.createDataFrame(dataPreprocessing(trainData)).toDF('label', 'raw')
testSql = sqlContext.createDataFrame(dataPreprocessing(testData)).toDF('label', 'raw')

# Drop stop words from the segmented tokens.
remover = StopWordsRemover(inputCol='raw', outputCol='filtered').setStopWords(stopWord)
filteredTrain = remover.transform(trainSql)
filteredTest = remover.transform(testSql)
filteredTrain.cache(); filteredTest.cache()

# Hash the filtered tokens into 20000-dimensional term-frequency vectors.
tf = HashingTF(numFeatures=20000, inputCol='filtered', outputCol='rawFeatures')
trainTf = tf.transform(filteredTrain)
testTf = tf.transform(filteredTest)
print trainTf.select('rawFeatures').take(5)

# Fit IDF on the training term frequencies and apply it to both sets.
idf = IDF(inputCol='rawFeatures', outputCol='features')
idfModel = idf.fit(trainTf)
trainIdf = idfModel.transform(trainTf)
testIdf = idfModel.transform(testTf)
trainIdf.show()
Since I am working with Chinese text, I use the Python package jieba for word segmentation. The line print trainTf.select('rawFeatures').take(5) prints correct output, but the IDF step fails with the error below (I have also sketched a quick diagnostic check after the stack trace):
16/07/25 16:12:49 ERROR Executor: Exception in task 4.0 in stage 3.0 (TID 7)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/lyj/Programs/Apache/spark/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/Users/lyj/Programs/Apache/spark/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/Users/lyj/Programs/Apache/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/mypath/classfication.py", line 20, in <lambda>
    getData = splitData.map(lambda line: [labelMap[line[2]], list(jieba.cut(line[6]+line[13]))])
IndexError: list index out of range

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:129)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:125)
	at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:43)
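
The IndexError points at the lambda that indexes line[2], line[6], and line[13]. As a quick check, this is a minimal sketch (run against the raw trainData RDD; 14 is the minimum field count implied by the highest index used, line[13]) to see whether some records split into fewer fields than the lambda expects:

badLines = trainData.map(lambda line: line.split('`')) \
                    .filter(lambda fields: len(fields) < 14)
print badLines.count()   # how many records are too short to index
print badLines.take(3)   # inspect a few offending records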
What should I do?
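
One direction I have sketched but not yet verified is to drop malformed records inside dataPreprocessing before indexing into the fields:

splitData = data.map(lambda line: line.split('`')) \
                .filter(lambda fields: len(fields) >= 14)  # skip records with missing fields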