我有一个使用以下配置的Spark作业:
# EMR "spark-defaults" classification (Python literal — the inline # comments
# are only legal here; strip them if this is ever pasted as raw JSON).
# NOTE(review): spark.driver.memory=45G on a 61GB r4.2xlarge leaves little
# headroom for YARN/OS overhead — confirm this is intentional.
Configurations=[
{
"Classification": "spark-defaults",
"Properties": {
"spark.executor.memory": "12335M",
"spark.executor.cores": "2",
"spark.executor.instances" : "19",
"spark.yarn.executor.memoryOverhead" : "1396", # spark.executor.memory * 0.10.
"spark.default.parallelism" : "38", # spark.executor.instances * spark.executor.cores
"spark.driver.memory": "45G"
}
}
],
我使用的是r4.2xlarge实例,包含1个Master节点和4个Core节点,每个节点有8个vCPU和61GB内存。我在Spark上对word2vec做超参数优化。以下是执行超参数优化的代码。
# Candidate hyperparameter values for the grid search below.
windowSize = [5,10]
minCount = range(5,10)  # range object; ParamGridBuilder iterates it like a list
maxIter= [10,100,1000]
regParam= [0.1,0.01]
# Full pipeline: text cleaning/tokenizing/stop-word removal, hand-crafted
# similarity features (fuzzy ratios, lengths, common words), one Word2Vec
# model per question, vector-distance features, feature assembly, and a
# logistic-regression classifier (lr) at the end.
pipeline=Pipeline(stages=[transformer_filtered_question1,transformer_filtered_question2,token_q1,token_q2,remover_q1,remover_q2,
transformer_textlength_q1,transformer_textlength_q2,transformer_totalwords,
transformer_commonwords,transformer_difftwolength,
transformer_fuzz_qratio,transformer_fuzz_partial_token_setratio,
transformer_fuzz_partial_token_sortratio,transformer_fuzz_token_setratio,
transformer_fuzz_token_sortratio,transformer_fuzz_partialratio,transformer_fuzz_wratio,
q1w2model,q2w2model,
transformer_manhattan, transformer_braycurtis, transformer_canberra,
transformer_cosine,transformer_euclidean,
transformer_jaccard,transformer_minkowski,transformer_kurtosis_q1,
transformer_kurtosis_q2,transformer_skew_q1,transformer_skew_q2,
assembler,lr])
# paramGrid only takes list of values not integers
# NOTE(review): this grid has 2*5*2*5*3*2 = 600 parameter combinations, and
# TrainValidationSplit refits the ENTIRE pipeline (including both Word2Vec
# models) once per combination — presumably the workload behind the
# executor-lost / "Connection reset by peer" failure in the traceback below.
# Consider shrinking the grid before retuning cluster memory settings.
paramGrid = ParamGridBuilder() \
.addGrid(q1w2model.windowSize,windowSize) \
.addGrid(q1w2model.minCount,minCount) \
.addGrid(q2w2model.windowSize,windowSize) \
.addGrid(q2w2model.minCount,minCount) \
.addGrid(lr.maxIter,maxIter) \
.addGrid(lr.regParam, regParam) \
.build()
# Model-selection metric: area under the ROC curve on the validation split.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label', metricName='areaUnderROC')
# Single 80/20 train/validation split; each grid point is fitted exactly once
# (cheaper than CrossValidator, which would multiply the 600 fits by k folds).
tvs = TrainValidationSplit(estimator=pipeline,
estimatorParamMaps=paramGrid,
evaluator=evaluator,
trainRatio=0.8)
错误追溯:
> Traceback (most recent call last):
File "/home/hadoop/quora_features_pyspark_emr.py", line 444, in <module>
model = tvs.fit(train)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/ml/base.py", line 64, in fit
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/ml/tuning.py", line 389, in _fit
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/ml/base.py", line 59, in fit
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/ml/base.py", line 62, in fit
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/ml/pipeline.py", line 108, in _fit
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/ml/base.py", line 64, in fit
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/ml/wrapper.py", line 265, in _fit
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/ml/wrapper.py", line 262, in _fit_java
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o15835.fit.
: org.apache.spark.SparkException: Exception thrown in awaitResult:
at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:205)
at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
at org.apache.spark.storage.BlockManagerMaster.removeRdd(BlockManagerMaster.scala:135)
at org.apache.spark.SparkContext.unpersistRDD(SparkContext.scala:1793)
at org.apache.spark.rdd.RDD.unpersist(RDD.scala:216)
at org.apache.spark.mllib.feature.Word2Vec.doFit(Word2Vec.scala:452)
at org.apache.spark.mllib.feature.Word2Vec.fit(Word2Vec.scala:319)
at org.apache.spark.ml.feature.Word2Vec.fit(Word2Vec.scala:187)
at sun.reflect.GeneratedMethodAccessor88.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: Connection reset by peer
at sun.nio.ch.FileDispatcherImpl.read0(Native Method)
at sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:39)
at sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:223)
at sun.nio.ch.IOUtil.read(IOUtil.java:192)
at sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:380)
at io.netty.buffer.PooledUnsafeDirectByteBuf.setBytes(PooledUnsafeDirectByteBuf.java:221)
at io.netty.buffer.AbstractByteBuf.writeBytes(AbstractByteBuf.java:899)
at io.netty.channel.socket.nio.NioSocketChannel.doReadBytes(NioSocketChannel.java:275)
at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:119)
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:643)
at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:566)
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:480)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:442)
at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:131)
at io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:144)
... 1 more
我已经尝试了几种配置。最初作业发生内存不足,于是我增大了驱动程序内存。在这种情况下,应如何确定Spark作业的合适配置?任何相关的指导都将非常有帮助。