I am using PySpark 2.2.1 and trying to standard-scale my feature values with Spark's StandardScaler.
However, I am stuck on a tricky problem:
if I initialize the SparkSession instance in local mode with SparkSession.builder.master("local[2]") and run the application, it succeeds; but when I run it in yarn mode with SparkConf().setMaster("yarn"), it throws an EOFError.
Here is my code:
from pyspark import SparkContext, SparkConf
from pyspark.ml.feature import StandardScaler, QuantileDiscretizer, Normalizer
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row, SparkSession


def standard_scale_run():
    # Two dense feature vectors in a single-column DataFrame.
    df = spark.createDataFrame(
        [(Vectors.dense([0.0, 1.0, 4.0, 5.5, 6.5, 7.0, 8.0, 8.3, 9.1]),),
         (Vectors.dense([2.0, 3.0, 2.0, 10.5, 3.5, 7.5, 3.0, 2.3, 10.5]),)], ["a"])
    print("===================dataframe: {}=========end!==========".format(df))
    standardScaler = StandardScaler(withMean=True, inputCol="a", outputCol="scaled_a")
    model = standardScaler.fit(df)
    scaled_data = model.transform(df)
    scaled_data.show()


spark = None  # assigned in __main__ before standard_scale_run() reads it as a global

if __name__ == '__main__':
    # spark = SparkSession.builder.master("local[2]").appName("ml.feature tests").getOrCreate()
    conf = SparkConf().setMaster("yarn").setAppName("ml.feature tests")
    # conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    # sc = SparkContext(conf=conf, serializer=CloudPickleSerializer())
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)
    standard_scale_run()
    # test()
    # discretize_run()
    # normalize_run()
Here is the stack trace:
2018-08-31 15:03:52,940 WARN (task-result-getter-3) [Logging.scala:logWarning(66)] - Lost task 1.0 in stage 0.0 (TID 1, ***, executor 2): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/pyspark/worker.py", line 98, in main
command = pickleSer._read_with_length(infile)
File "/pyspark/serializers.py", line 164, in _read_with_length
return self.loads(obj)
File "/pyspark/serializers.py", line 422, in loads
return pickle.loads(obj)
EOFError
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
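For what it's worth, one possible cause of an executor-side EOFError inside pickle.loads that shows up only under yarn (and not in local mode) is a mismatch between the Python interpreter or environment on the driver and the one on the YARN nodes: local mode runs the worker processes with the same interpreter as the driver, while yarn mode uses whatever Python each node resolves. Below is a minimal diagnostic sketch under that assumption; the helper name compare_python_versions is mine, not part of the original code, and it reuses the SparkContext sc from the script above:

import sys

def compare_python_versions(sc):
    # Interpreter version on the driver.
    driver_version = tuple(sys.version_info[:3])
    # The lambda is pickled and shipped to the executors, so this reports
    # the interpreter version at the exact place the EOFError is raised.
    executor_versions = set(
        sc.parallelize(range(sc.defaultParallelism), sc.defaultParallelism)
          .map(lambda _: tuple(sys.version_info[:3]))
          .collect())
    print("driver: {}, executors: {}".format(driver_version, executor_versions))

compare_python_versions(sc)

If this check itself dies with the same EOFError, the executors cannot unpickle anything the driver sends, which points at the environment rather than at StandardScaler; in that case pinning the worker interpreter via the PYSPARK_PYTHON environment variable (or spark.yarn.appMasterEnv.PYSPARK_PYTHON for the application master) may be worth trying.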