我正在尝试运行Spark.ML.RandomForestClassifier
我的训练数据是一批文档,我使用Tokenizer、CountVectorizer和IDF将其转换为特征,最终得到如下定义的tf-idf特征SparseVector。我有一个标签列,其取值为字符串{'男','女','中立'},我使用StringIndexer将其转换为数值型的类别索引。以下是数据现在的样子:
这是我用来创建特征的代码以及相应的ML方法。
class Jpsa(object):
    """Train/test split plus Spark ML stage factories for the job-description data.

    ``tf_idf`` and ``rf_classifier`` build the pipeline stages and also store
    them on the instance (``stringIndexer``, ``tokenizer``,
    ``stopwordsremover``, ``cv``, ``idf`` and ``rf``). Those attributes do not
    exist until the corresponding method has been called.
    """

    def __init__(self, jpsa_sdf, split_perc=(0.7, 0.3), seed=42):
        """Split ``jpsa_sdf`` into cached training and test sets.

        Args:
            jpsa_sdf: Object exposing ``randomSplit(weights, seed=...)``;
                presumably a Spark DataFrame with the ``label`` and
                ``job_desc`` columns read by the stages below (confirm
                against the caller).
            split_perc: Two split weights, training first. Any sequence is
                accepted; it is copied into a list before use.
            seed: Seed forwarded to ``randomSplit``. Defaults to 42, the value
                that used to be hard-coded.
        """
        # The default is a tuple so that no mutable list is shared between
        # calls; randomSplit still receives a list, as it did before.
        self.jpsa_train, self.jpsa_test = jpsa_sdf.randomSplit(
            list(split_perc), seed=seed
        )
        # count() is an action, so each cached split is computed here rather
        # than on first use.
        self.jpsa_train.cache().count()
        self.jpsa_test.cache().count()

    def tf_idf(self):
        """Create the label-indexing and TF-IDF feature stages.

        Every stage is stored on the instance and the stages are returned in
        the order in which a pipeline should run them.

        Returns:
            list: ``[stringIndexer, tokenizer, stopwordsremover, cv, idf]``.
        """
        from pyspark.ml.feature import StringIndexer
        from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF

        # Changing the labels to numeric from categorical.
        # NOTE(review): StringIndexer is documented to raise on invalid labels
        # under its default ``handleInvalid`` setting -- worth checking whether
        # ``label`` can contain nulls or values unseen at fit time.
        self.stringIndexer = StringIndexer(inputCol='label', outputCol='gender_label')
        # Carrying out the Tokenization of the text documents (splitting into words)
        self.tokenizer = Tokenizer(inputCol="job_desc", outputCol="tokenised_text")
        # Carrying out the StopWords Removal for TF-IDF
        self.stopwordsremover = StopWordsRemover(
            inputCol='tokenised_text', outputCol='sw_removed_words'
        )
        # Creating Term Frequency Vector for each word
        self.cv = CountVectorizer(
            inputCol="sw_removed_words", outputCol="tf_features", minDF=2.0
        )
        # Carrying out Inverse Document Frequency on the TF data
        self.idf = IDF(inputCol="tf_features", outputCol="output_features")
        return [self.stringIndexer, self.tokenizer, self.stopwordsremover, self.cv, self.idf]

    def rf_classifier(self):
        """Create the random forest stage and store it as ``self.rf``.

        The classifier reads the ``output_features`` column produced by the
        IDF stage and learns to predict ``gender_label``.

        Returns:
            list: A one-element list holding the classifier, so it can be
            concatenated with the list returned by ``tf_idf``.
        """
        from pyspark.ml.classification import RandomForestClassifier

        self.rf = RandomForestClassifier(
            featuresCol="output_features", labelCol="gender_label"
        )
        return [self.rf]
现在我实例化这个类,其构造函数会将原始数据拆分为训练集和测试集。
# Build the helper object; its constructor performs the train/test split
# using the given weights.
jpsa = Jpsa(jpsa_sdf=jpsa_cleaned_sdf, split_perc=[0.7, 0.3])
然后我创建一个Spark Pipeline对象。
from pyspark.ml import Pipeline

# tf_idf() and rf_classifier() create the stages and store them on ``jpsa``
# (for example ``jpsa.rf``), so the stage list is assembled from their return
# values rather than from attributes that may not exist yet.
pipeline = Pipeline(
    stages=jpsa.tf_idf() + jpsa.rf_classifier(),
)
现在,我使用超参数网格设置交叉验证器对象,并将此管道对象传递给交叉验证器以进行超参数调整。
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Hyperparameter grid searched by the cross-validator:
#   - maxDepth: maximum depth of each decision tree in the forest
#   - numTrees: number of trees in the forest
# That is 4 x 7 = 28 combinations, each fitted once per fold.
#
# addGrid() must be given the Param object itself (``rf.numTrees``). The
# previous ``rf.getNumTrees()`` passed the parameter's current integer value
# instead, so the grid never referred to the numTrees parameter.
paramGrid = (
    ParamGridBuilder()
    .addGrid(jpsa.rf.maxDepth, [2, 4, 7, 10])
    .addGrid(jpsa.rf.numTrees, [100, 250, 500, 600, 700, 800, 900])
    .build()
)

# Evaluation metric: F1, computed by comparing the classifier's label column
# with its prediction column.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol=jpsa.rf.getLabelCol(),
    predictionCol=jpsa.rf.getPredictionCol(),
)

# 5-fold cross-validation over the whole pipeline (feature stages + forest).
cv = CrossValidator(
    estimator=pipeline,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    numFolds=5,
)
cvModel = cv.fit(jpsa.jpsa_train)

# NOTE(review): this scores the model on the same data it was fitted on;
# ``jpsa.jpsa_test`` is the held-out split and is not used here.
evaluator.evaluate(cvModel.transform(jpsa.jpsa_train))
这会产生如下错误:
Py4JJavaError: An error occurred while calling o659.evaluate.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 136.0 failed 1 times, most recent failure: Lost task 0.0 in stage 136.0 (TID 131, localhost): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$4: (string) => double)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
编辑II:
于是我做了进一步的调查,决定把各个部分隔离开来逐一排查。这次我没有使用Pipeline对象,只是单独使用Random Forest。这是我现在的代码,我不再使用类结构了。
def word2Vec(df=None):
    """Index the labels and embed each document with a Word2Vec model.

    Args:
        df: Input frame with the ``label`` and ``job_desc`` columns read by
            the stages below and the ``id`` column selected in the result.
            When omitted, the module-level ``df`` is used, which is what the
            earlier zero-argument version of this function relied on.

    Returns:
        The transformed frame restricted to the ``id``, ``w2v_vector`` and
        ``label1`` columns.

    Raises:
        NameError: If ``df`` is not passed and no module-level ``df`` exists.
    """
    if df is None:
        # Backward compatibility: fall back to the global the body used to
        # read implicitly, keeping the same exception type when it is absent.
        try:
            df = globals()["df"]
        except KeyError:
            raise NameError("name 'df' is not defined")

    from pyspark.ml.feature import Tokenizer, Word2Vec, StringIndexer

    # Changing the labels to numeric from categorical
    label_indexer = StringIndexer(inputCol='label', outputCol='label1')
    indexed_df = label_indexer.fit(df).transform(df)

    # Carrying out the Tokenization of the text documents (splitting into words)
    tokenizer = Tokenizer(inputCol="job_desc", outputCol="tokenised_text")
    tokens_df = tokenizer.transform(indexed_df)

    # Implementing the word2Vec model. The estimator gets its own name so it
    # no longer shadows this function inside its own body.
    w2v_estimator = Word2Vec(
        vectorSize=300, seed=42, inputCol="tokenised_text", outputCol="w2v_vector"
    )
    w2v_model = w2v_estimator.fit(tokens_df)
    w2v_df = w2v_model.transform(tokens_df)
    return w2v_df.select('id', 'w2v_vector', 'label1')
def rf_classifier():
    """Build the unfitted random forest estimator.

    Returns:
        A ``RandomForestClassifier`` that reads features from ``w2v_vector``
        and labels from ``label1``, with ``maxDepth`` set to 7.
    """
    from pyspark.ml.classification import RandomForestClassifier

    settings = {
        "featuresCol": "w2v_vector",
        "labelCol": "label1",
        "maxDepth": 7,
    }
    return RandomForestClassifier(**settings)
def split(df, split_perc, seed=42):
    """Randomly split ``df`` into cached training and test parts.

    Args:
        df: Object exposing ``randomSplit(weights, seed=...)``; presumably a
            Spark DataFrame (confirm against the caller).
        split_perc: Split weights, training first; forwarded unchanged.
        seed: Seed forwarded to ``randomSplit``. Defaults to 42, the value
            that used to be hard-coded.

    Returns:
        tuple: ``(jpsa_train, jpsa_test)``.
    """
    jpsa_train, jpsa_test = df.randomSplit(split_perc, seed=seed)
    # count() is an action, so each cached part is computed here rather than
    # on first use.
    for part in (jpsa_train, jpsa_test):
        part.cache().count()
    return jpsa_train, jpsa_test
# NOTE(review): ``rf`` reads the ``w2v_vector`` and ``label1`` columns, which
# word2Vec() produces, but word2Vec() is not called in this snippet -- confirm
# that ``jpsa_cleaned_sdf`` already carries those columns.
jpsa_train, jpsa_test = split(jpsa_cleaned_sdf, [0.7, 0.3])
rf = rf_classifier()

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Hyperparameter grid searched by the cross-validator:
#   - maxDepth: maximum depth of each decision tree in the forest
#   - numTrees: number of trees in the forest
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [2, 4, 7, 10])
    .addGrid(rf.numTrees, [100, 250, 500])
    .build()
)

# Evaluation metric: F1, computed by comparing the classifier's label column
# with its prediction column.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol=rf.getLabelCol(),
    predictionCol=rf.getPredictionCol(),
)

# 5-fold cross-validation over the random forest alone (no pipeline).
cv = CrossValidator(
    estimator=rf,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    numFolds=5,
)
cvModel = cv.fit(jpsa_train)

# NOTE(review): the score below is computed on the training split itself;
# ``jpsa_test`` is never used in this snippet.
evaluator.evaluate(cvModel.transform(jpsa_train))
0.9999999999