我正在一个拥有 240GB 内存和 64 个内核的 Databricks 集群上工作。
以下是我定义的设置。
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as fs
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.sql.functions import count
from pyspark.sql.functions import col, countDistinct
from pyspark import SparkContext
from geospark.utils import GeoSparkKryoRegistrator, KryoSerializer
from geospark.register import upload_jars
from geospark.register import GeoSparkRegistrator
# Session-level Spark / GeoSpark settings applied to the existing `spark`
# session.

# Partition count used when shuffling data for joins and aggregations.
spark.conf.set("spark.sql.shuffle.partitions", 1000)

# Recommended settings for using GeoSpark.
# NOTE(review): these properties are read when the driver and executors
# start, so setting them on a live session is expected to have no effect.
# The heartbeat timeout reported for this job (~170 s) is far below the
# 1000 s requested here, which points the same way. Confirm, then move them
# to the cluster's Spark config (or `spark-submit --conf`) so they are
# applied at startup.
spark.conf.set("spark.driver.memory", "20g")
spark.conf.set("spark.network.timeout", "1000s")
spark.conf.set("spark.driver.maxResultSize", "10g")
spark.conf.set("spark.serializer", KryoSerializer.getName)
spark.conf.set("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)

# NOTE(review): presumably this has to run before the JVM-side GeoSpark
# classes are needed — verify it is effective on a session that already exists.
upload_jars()
SparkContext.setSystemProperty("geospark.global.charset", "utf8")
我正在处理一个大型数据集，以下是运行数小时后得到的错误。
org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 10.0 failed 4 times, most recent failure: Lost task 3.3 in stage 10.0 (TID 6054, 10.17.21.12, executor 7):
ExecutorLostFailure (executor 7 exited caused by one of the running tasks) Reason: Executor heartbeat timed out after 170684 ms