I am experimenting with machine learning in PySpark and using RandomForestClassifier. Until now I have been using Sklearn. I am using CrossValidator to tune the parameters and get the best model. Below is the sample code from the Spark website.
From what I have been reading, I don't understand whether Spark also distributes the parameter tuning, or whether it works the same way as Sklearn's GridSearchCV.
Any help would be greatly appreciated.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# Prepare training documents, which are labeled.
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0),
    (4, "b spark who", 1.0),
    (5, "g d a y", 0.0),
    (6, "spark fly", 1.0),
    (7, "was mapreduce", 0.0),
    (8, "e spark program", 1.0),
    (9, "a e c l", 0.0),
    (10, "spark compile", 1.0),
    (11, "hadoop software", 0.0)
], ["id", "text", "label"])
# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2)  # use 3+ folds in practice
# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(training)
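For reference, once the fit finishes, the winning model and the per-ParamMap metrics can be pulled out of the resulting CrossValidatorModel (a minimal sketch using the standard bestModel and avgMetrics attributes):
# The best PipelineModel found over the grid
bestPipeline = cvModel.bestModel
# Average evaluator metric for each ParamMap, in the order of paramGrid
print(cvModel.avgMetrics)
# Parameters of the fitted LogisticRegression stage of the best pipeline
print(bestPipeline.stages[-1].extractParamMap())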
Answer 0 (score: 3)
Spark 2.3+
SPARK-21911 adds parallel model fitting. The level of parallelism is controlled by the parallelism Param.
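For example, on Spark 2.3+ the CrossValidator from the question can be asked to evaluate several models concurrently (a minimal sketch; the value 4 is just an illustrative choice):
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2,
                          parallelism=4)  # fit up to 4 models at a time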
Spark < 2.3
No. Cross-validation is implemented as a plain nested for loop:
for i in range(nFolds):
...
for j in range(numModels):
...
Only the training of each individual model is distributed.
Answer 1 (score: 0)
I found the answer. As the other answer says, the process is not parallelized; it is a serial operation. However, there is a spark_sklearn module that can be used for this grid search: it distributes the grid search itself, but not the building of each model. That is the trade-off.
Here is the code using spark_sklearn's GridSearchCV:
%pyspark
"""
DATA - https://kdd.ics.uci.edu/databases/20newsgroups/mini_newsgroups.tar.gz
METHOD 1 - USING GRIDSEARCH CV FROM SPARK_SKLEARN MODULE BY DATABRICKS
DOCUMENTATION - https://databricks.com/blog/2016/02/08/auto-scaling-scikit-learn-with-apache-spark.html
THIS IS DISTRIBUTED OPERATION AS MENTIONED ON THE WEBSITE
"""
from spark_sklearn import GridSearchCV
from pyspark.ml.feature import HashingTF,StopWordsRemover,IDF,Tokenizer
from pyspark.ml import Pipeline
from pyspark.sql.types import StructField, StringType, StructType
from pyspark.ml.feature import IndexToString, StringIndexer
from spark_sklearn.converter import Converter
from sklearn.pipeline import Pipeline as S_Pipeline
from sklearn.ensemble import RandomForestClassifier as S_RandomForestClassifier
path = 's3://sparkzepellin/mini_newsgroups//*'
news = sc.wholeTextFiles(path)
print "Toal number of documents = ",news.count()
# print 5 samples
news.takeSample(False,5, 1)
# Using sqlContext, create a dataframe
schema = ["id", "text", "topic"]
fields = [StructField(field_name, StringType(), True) for field_name in schema]
schema = StructType(fields)
# Apply the schema declared above to the RDD: (file name, text, topic directory)
newsgroups = news.map(lambda kv: (kv[0].split("/")[-1], kv[1], kv[0].split("/")[-2]))
df = sqlContext.createDataFrame(newsgroups, schema)
df_new = StringIndexer(inputCol="topic", outputCol="label").fit(df).transform(df)
# Build a Spark pipeline with tokenizer, hashing TF, and IDF (the RandomForest runs in the sklearn pipeline below)
tokenizer = Tokenizer().setInputCol("text").setOutputCol("words")
hashingTF = HashingTF().setInputCol("words").setOutputCol("rawFeatures")
idf = IDF().setInputCol("rawFeatures").setOutputCol("features")
pipeline=Pipeline(stages=[tokenizer, hashingTF, idf])
data = pipeline.fit(df_new).transform(df_new)
# Using Converter, convert to pandas dataframe (numpy)
# to run on distributed sklearn using spark_sklearn package
converter = Converter(sc)
new_df = converter.toPandas(data.select(data.features.alias("text"), "label"))
# Sklearn pipeline
s_pipeline = S_Pipeline([
    ('rf', S_RandomForestClassifier())
])
# Random parameters
parameters = {
    'rf__n_estimators': (10, 20),
    'rf__max_depth': (2, 10)
}
# Run GridSearchCV using the above defined parameters on the pipeline created
gridSearch = GridSearchCV(sc, s_pipeline, parameters)
GS = gridSearch.fit(new_df.text.values, new_df.label.values)
Another approach would be to use the map method to parallelize the operation and collect metrics such as accuracy.
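As a rough illustration of that idea, the parameter grid can be parallelized as an RDD and each worker can fit and score one sklearn model on broadcast data (a hypothetical sketch; X_train, y_train, X_test, y_test and the SparkContext sc are assumed to exist already):
from itertools import product
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Hypothetical grid of parameter combinations to evaluate
param_grid = [{"n_estimators": n, "max_depth": d}
              for n, d in product([10, 20], [2, 10])]

# Broadcast the (assumed pre-split) training and test data to the workers
train_b = sc.broadcast((X_train, y_train))
test_b = sc.broadcast((X_test, y_test))

def fit_and_score(params):
    """Fit one RandomForest with the given params and return its test accuracy."""
    X_tr, y_tr = train_b.value
    X_te, y_te = test_b.value
    model = RandomForestClassifier(**params)
    model.fit(X_tr, y_tr)
    return params, accuracy_score(y_te, model.predict(X_te))

# One task per parameter combination; collect (params, accuracy) pairs on the driver
results = sc.parallelize(param_grid, len(param_grid)).map(fit_and_score).collect()
best_params, best_acc = max(results, key=lambda r: r[1])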