Spark job hangs/crashes after a few hours

Time: 2018-01-11 14:23:54

Tags: python apache-spark memory-leaks apache-kafka

For a week now I have been dealing with a very annoying problem in Apache Spark. I am streaming from Apache Kafka into Apache Spark using createDirectStream(). However, after a few hours there is always one job that gets stuck at (1/2), and then nothing else happens. When I click (kill) in the WebUI, Spark crashes.

Until yesterday I was using Spark 2.2.0; then I upgraded to version 2.2.1 hoping it would solve my problem, but it didn't. Usually it takes 3-4 hours for this to happen, but today it took 23.7 hours (I believe that is due to the version upgrade). I am running Spark in standalone mode (on a single machine).

Below are the WebUI screenshots I took after I noticed Spark was stuck:

Here is my complete code:

from pyspark import SparkConf,SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import json
import pprint
import sys
import os
sys.path.insert(0, '/home/servername/public_html/keywordproto/')
from mysettings import INTERVAL_SPARK_WINDOW, INTERVAL_SPARK_STREAM, kafkaServerEndPoint, kafkaTopic_A, kafkaTopic_B, logException
# INTERVAL_SPARK_WINDOW = 20
# INTERVAL_SPARK_STREAM = 20

conf = SparkConf()
conf.setAppName("keywordSparkConsumer")

sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
sc.addPyFile('/home/servername/public_html/keywordproto/DAL.py')
sc.addPyFile('/home/servername/public_html/keywordproto/WordStatDict.py')
sc.addPyFile('/home/servername/public_html/keywordproto/mysettings.py')

ssc = StreamingContext(sc, INTERVAL_SPARK_STREAM)
ssc.checkpoint("checkpoints_spark")

DS = KafkaUtils.createDirectStream(ssc, [kafkaTopic_A, kafkaTopic_B], {"metadata.broker.list": kafkaServerEndPoint})
root = DS.map(lambda tup: json.loads(tup[1].strip()))  # Kafka sends (k, v) tuples

news = root.filter(lambda js: js['type'] == 'news')
news_b = news.map(lambda js: (js['keyword'], 1)).reduceByKeyAndWindow(lambda x, y: int(x) + int(y), lambda x, y: int(x) - int(y), INTERVAL_SPARK_WINDOW) 
social = root.filter(lambda js: js['type'] == 'social')
social_b = social.map(lambda js: (js['keyword'], 1)).reduceByKeyAndWindow(lambda x, y: int(x) + int(y), lambda x, y: int(x) - int(y), INTERVAL_SPARK_WINDOW) 

def process_rdd_news(time, part_iterator):
    import DAL
    from mysettings import logException
    print("----------- %s ----------- news --" % str(time))
    try:
        # you may want to create a db connection here and pass it as a parameter in the loop
        client = DAL.openConnection()
        for part in part_iterator:
            print(part)
            print()

            keyword = part[0]
            nums = part[1]
            DAL.store_mentions_news(client, nums, time, keyword)
    except:
        logException(locals())

news_b.foreachRDD(lambda time, rdd: rdd.foreachPartition(lambda parti: process_rdd_news(time, parti)))


def process_rdd_social(time, part_iterator):
    import DAL
    from mysettings import logException
    print("----------- %s ----------- social --" % str(time))
    try:
        # you may want to create a db connection here and pass it as a parameter in the loop
        client = DAL.openConnection()
        for part in part_iterator:
            print(part)
            print()

            keyword = part[0]
            nums = part[1]
            DAL.store_mentions_social(client, nums, time, keyword)

    except:
        logException(locals())

social_b.foreachRDD(lambda time, rdd: rdd.foreachPartition(lambda parti: process_rdd_social(time, parti)))



def remove_nonASCII(s): return s.encode("ascii", errors="ignore").decode()

def socialMentionScore(body, html, ppr, nltk):
    slag = html.unescape(body)
    cleantxt = ppr.clean(slag)  # removes urls, etc...
    parsed_slag = ppr.parse(slag)
    # ... arithmetic calculations
    good = 1
    return good  # float in [0.0, 1.0]

def processAndValidateB(mention, keyword, DICT, nltk, dict_Keywords, logException, html, ppr):
    retobj = { 'cats':{}, 'mention':mention,  'keeper':0, }
    try:
        editedMention=remove_nonASCII(mention)
        words = nltk.wordpunct_tokenize(mention)
        catMatrix = {}
        for cat in DICT.keys():
            count = len(DICT[cat].intersection([w.lower() for w in words]))
            if count > 0:
                catMatrix[cat] = count

        retobj['cats'] = catMatrix
        retobj['mention'] = editedMention 
        lenw = len(words)
        if len(set(dict_Keywords[keyword]).intersection([w.lower() for w in words])) <= 0:
            retobj['keeper']=0 
        elif socialMentionScore(editedMention, html, ppr, nltk) < 0.90:
            retobj['keeper']=0 
        else:
            retobj['keeper']=1
    except:
        logException(locals())
    return retobj

def processAndValidateA(title, mention, keyword, DICT, nltk, logException):
    retobj = { 'cats':{},  'keeper':0, }
    try:
        mention = title + "\n" + mention
        body_words = nltk.wordpunct_tokenize(mention)
        title_words = nltk.wordpunct_tokenize(title)
        catMatrix = {}
        for cat in DICT.keys():
            count = len(DICT[cat].intersection([w.lower() for w in body_words]))
            if count > 0:
                catMatrix[cat] = count

        retobj['cats'] = catMatrix
        lenw = len(title_words)
        retobj['keeper']=1
    except:
        logException(locals())
    return retobj




def process_rdd_methodB(time, part_iterator):
    # I believe Spark crashes because of this function
    import DAL
    from WordStatDict import DICT # this is a dict with 4k values
    import nltk
    from mysettings import dict_Keywords, logException
    import html
    import preprocessor as ppr
    print("----------- %s ----------- methodB --" % str(time))
    try:
        keepers = [] 
        keyword_scores = {}
        client = DAL.openConnection()
        for part in part_iterator:
            try:
                keyword = part["keyword"]
                body = part["body"]
                result = processAndValidateB(body, keyword, DICT, nltk, dict_Keywords, logException, html, ppr)
                if not keyword in keyword_scores:
                    keyword_scores[keyword] = {}
                for cat in result['cats']:
                    if not cat in keyword_scores[keyword]:
                        keyword_scores[keyword][cat] = 0 
                    keyword_scores[keyword][cat] += result['cats'][cat]
                if result['keeper'] == 1:
                    part["body"] = result['mention'] # the text has changed due to NLP
                    keepers.append(part)
            except:
                logException(locals())

        print("keepers: " + str(len(keepers)))
        if len(keepers) > 0:
            DAL.store_items_bulk(client, keepers, time)

        print("keyword_scores:")
        print(keyword_scores)
        print()
        if len(keyword_scores) > 0:
            for keyword, slugs in keyword_scores.items():
                if len(slugs) > 0:
                    DAL.store_scores_social(client, slugs, time, keyword)

    except:
        logException(locals())

social.foreachRDD(lambda time, rdd: rdd.foreachPartition(lambda parti: process_rdd_methodB(time, parti)))


def process_rdd_methodA(time, part_iterator):
    # I believe Spark crashes because of this function
    import DAL
    from WordStatDict import DICT # this is a dict with 4k values
    import nltk
    from mysettings import logException
    print("----------- %s ----------- methodA --" % str(time))
    try:
        keepers = []
        keyword_scores = {}
        client = DAL.openConnection()
        for part in part_iterator:
            try:
                keyword = part["keyword"]
                body = part["body"]
                title = part["title"]
                result = processAndValidateA(title, body, keyword, DICT, nltk, logException)
                if not keyword in keyword_scores:
                    keyword_scores[keyword] = {}
                for cat in result['cats']:
                    if not cat in keyword_scores[keyword]:
                        keyword_scores[keyword][cat] = 0 
                    keyword_scores[keyword][cat] += result['cats'][cat]
                if result['keeper'] == 1:
                    del part['body']
                    keepers.append(part)
            except:
                logException(locals())

        print("keepers: " + str(len(keepers)))
        if len(keepers) > 0:
            DAL.store_items_bulk(client, keepers, time)

        print("keyword_scores:")
        print(keyword_scores)
        print()

        if len(keyword_scores) > 0:
            for keyword, slugs in keyword_scores.items():
                if len(slugs) > 0:
                    DAL.store_scores_news(client, slugs, time, keyword)

    except:
        logException(locals())

news.foreachRDD(lambda time, rdd: rdd.foreachPartition(lambda parti: process_rdd_methodA(time, parti)))

# http://spark.apache.org/docs/latest/streaming-programming-guide.html#design-patterns-for-using-foreachrdd
# note: mongodb has a built-in connection pool
try:
    ssc.start()
    ssc.awaitTermination()
except:
    logException(locals())

This is how I start Apache Spark:

import subprocess
import os

def startProc(inp):
    print('  ' + "   starting...")
    subprocess.Popen(inp, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, preexec_fn=os.setpgrp)  # PIPE / DEVNULL / STDOUT

startProc('spark-submit --driver-memory 1g --executor-memory 10g --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.1 sparkConsumer.py')

I have been experimenting with a lot of parameters, e.g. increasing the driver and executor memory, but it didn't change anything. Maybe there is a memory leak in my code; I tried both adding the libraries as py-files and inlining all the code in sparkConsumer.py, but neither made any difference.

As you can see from my screenshots, each batch completes within 2 seconds, and there are only about 100-200 records every 20 seconds, which is a very low volume. So I don't understand what the problem is... I also can't access any Spark logs; I can't find any. I can PIPE stdout/stderr to a file, but then only the INFO logs written at Spark startup and my own print(...) statements show up. FYI, I am using PySpark.
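One thing I haven't tried yet: Spark reads its logging setup from conf/log4j.properties, so file logging should be possible without piping stdout/stderr. A minimal sketch, assuming the stock log4j.properties.template that ships with Spark 2.2 (the log path is a placeholder):

# conf/log4j.properties (copied from log4j.properties.template)
log4j.rootCategory=INFO, console, file
log4j.appender.file=org.apache.log4j.RollingFileAppender
log4j.appender.file.File=/tmp/sparkConsumer.log
log4j.appender.file.MaxFileSize=50MB
log4j.appender.file.MaxBackupIndex=5
log4j.appender.file.layout=org.apache.log4j.PatternLayout
log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n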

I also thought there might be a problem with Kafka, e.g. that it crashes or closes its connections, but Kafka's logs don't indicate any crash that could break the stream in Spark.

Maybe the Web UI is the memory leak? It holds a lot of entries, and I wonder where they are stored. Hopefully not in my memory... FYI, I am processing the data in real time, so I don't need to keep any RDDs around for long, but maybe Spark keeps them in memory (cached or something) and floods the JVM heap space? However, I don't see any JVM OOM / heap errors anywhere.
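If the Web UI metadata were the leak, the amount the driver retains can be capped with documented settings; a small sketch (my values are arbitrary, the defaults are 1000 each):

conf = SparkConf()
conf.setAppName("keywordSparkConsumer")
# Cap how much finished-job/stage/batch metadata the driver keeps for the Web UI
conf.set("spark.ui.retainedJobs", "200")
conf.set("spark.ui.retainedStages", "200")
conf.set("spark.streaming.ui.retainedBatches", "100")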

Another thing I find suspicious is that the batch completion always gets stuck at 2/4, and my code uses exactly four foreachRDD functions. I suspect the first two execute without any problem, but it always gets stuck on the third, and I don't see why. In that one I load a dictionary with 4k values (short strings), which I would expect Spark to garbage-collect once the function finishes?
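To rule the dictionary out, I could broadcast it once from the driver instead of importing it inside every task. A hedged sketch of that change (the rest of process_rdd_methodA would stay as it is):

from WordStatDict import DICT
bc_DICT = sc.broadcast(DICT)  # serialized to each executor once instead of per task

def process_rdd_methodA(time, part_iterator):
    import DAL
    import nltk
    from mysettings import logException
    DICT = bc_DICT.value  # read-only handle to the broadcast dictionary
    ...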

What else can I try to solve this? If it is an OOM problem, I could write code that sends thousands of messages per minute, which would make the job fail much faster. But I need a way to debug/profile the system :)
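Something like this throwaway producer would do it, sketched with the kafka-python package (an assumption on my part; any client works) and assuming kafkaTopic_B is the social topic:

import json
import time
from kafka import KafkaProducer  # pip install kafka-python
from mysettings import kafkaServerEndPoint, kafkaTopic_B

producer = KafkaProducer(bootstrap_servers=kafkaServerEndPoint,
                         value_serializer=lambda v: json.dumps(v).encode("utf-8"))
sample = {"type": "social", "keyword": "test", "body": "sample mention text"}
while True:
    for _ in range(1000):
        producer.send(kafkaTopic_B, sample)  # flood the topic to surface an OOM quickly
    producer.flush()
    time.sleep(1)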

2 answers:

Answer 0 (score: 0):

Here is an update on the situation. I got a new server, installed a fresh Spark 2.2.1 (plus Kafka and Zookeeper) and migrated my own files. After a few hours of testing and tweaking I was able to remove some memory leaks in my Python files (unrelated to Spark), but they were the cause of the OOMs.

Unfortunately, Spark kept crashing. At one point I was running Spark on two servers, and interestingly they both "got stuck" at exactly the same time (same hour, minute, second). How crazy is that? That can only happen if they depend on the same thing. In my case I have Kafka producers fetching data from external APIs, and I then consume that data in Spark. I tried all kinds of tricks to make Spark crash on its own, but nothing I did worked; Spark just stood its ground. So it is unlikely that it gets stuck because of some particular piece of data...

But it may have crashed because of Zookeeper and/or Kafka. Kafka cleans up its log files and updates offsets every X minutes. I am not sure what effect that has on a DirectStream in Spark, but I have a feeling that Kafka/Zookeeper is what causes these jobs to get "stuck" forever.
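For what it's worth, with the 0-8 direct stream the consumed offsets live in Spark's checkpoints, so if Kafka's retention purges segments past the stored offset, the next batch could fail or stall. A hedged tweak (a real Kafka 0.8 consumer setting, though I haven't verified it fixes this): tell the consumer where a fresh stream should start when no valid offset exists:

DS = KafkaUtils.createDirectStream(
    ssc, [kafkaTopic_A, kafkaTopic_B],
    {"metadata.broker.list": kafkaServerEndPoint,
     "auto.offset.reset": "largest"})  # start from the newest offsets when none are stored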

Any ideas on how to debug this one? PS: I have now enabled DEBUG-level logging in Spark and GC logging in the JVMs (including Kafka + Zookeeper). I will update this thread with the logs once the hang happens again.
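For anyone who wants to do the same: GC logging can be enabled with JVM options at submit time alone, no code changes. A sketch assuming Java 8 flags and placeholder log paths:

spark-submit \
  --driver-java-options "-XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc:/tmp/driver-gc.log" \
  --conf spark.executor.extraJavaOptions="-XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc:/tmp/executor-gc.log" \
  --driver-memory 1g --executor-memory 10g sparkConsumer.py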

Answer 1 (score: 0):

OK, yesterday Spark "got stuck" again after an hour or so. I have pasted the most relevant logs below. Here is a screenshot of the stuck job in the Spark UI:

*) It always gets stuck at tasks (1/2) and (2/4) in the streaming jobs; it drives me crazy.

2018-01-14 03:02:41 INFO  MemoryStore:54 – Block broadcast_1930 stored as values in memory (estimated size 16.6 KB, free 429.6 MB)
2018-01-14 03:02:41 DEBUG BlockManager:58 – Put block broadcast_1930 locally took  0 ms
2018-01-14 03:02:41 TRACE BlockInfoManager:62 – Task -1024 releasing lock for broadcast_1930
2018-01-14 03:02:41 DEBUG BlockManager:58 – Putting block broadcast_1930 without replication took  1 ms
2018-01-14 03:02:41 TRACE BlockInfoManager:62 – Task -1024 trying to put broadcast_1930_piece0
2018-01-14 03:02:41 TRACE BlockInfoManager:62 – Task -1024 trying to acquire read lock for broadcast_1930_piece0
2018-01-14 03:02:41 TRACE BlockInfoManager:62 – Task -1024 trying to acquire write lock for broadcast_1930_piece0
2018-01-14 03:02:41 TRACE BlockInfoManager:62 – Task -1024 acquired write lock for broadcast_1930_piece0
2018-01-14 03:02:41 INFO  MemoryStore:54 – Block broadcast_1930_piece0 stored as bytes in memory (estimated size 8.7 KB, free 429.6 MB)
2018-01-14 03:02:41 INFO  BlockManagerInfo:54 – Added broadcast_1930_piece0 in memory on 10.17.0.6:46478 (size: 8.7 KB, free: 434.0 MB)
2018-01-14 03:02:41 DEBUG BlockManagerMaster:58 – Updated info of block broadcast_1930_piece0
2018-01-14 03:02:41 DEBUG BlockManager:58 – Told master about block broadcast_1930_piece0
2018-01-14 03:02:41 DEBUG BlockManager:58 – Put block broadcast_1930_piece0 locally took  1 ms
2018-01-14 03:02:41 TRACE BlockInfoManager:62 – Task -1024 releasing lock for broadcast_1930_piece0
2018-01-14 03:02:41 DEBUG BlockManager:58 – Putting block broadcast_1930_piece0 without replication took  1 ms
2018-01-14 03:02:41 INFO  SparkContext:54 – Created broadcast 1930 from broadcast at DAGScheduler.scala:1006
2018-01-14 03:02:41 INFO  DAGScheduler:54 – Submitting 2 missing tasks from ResultStage 1608 (PythonRDD[4989] at call at /usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py:2230) (first 15 tasks are for partitions Vector(0, 1))
2018-01-14 03:02:41 INFO  TaskSchedulerImpl:54 – Adding task set 1608.0 with 2 tasks
2018-01-14 03:02:41 DEBUG TaskSetManager:58 – Epoch for TaskSet 1608.0: 322
2018-01-14 03:02:41 DEBUG TaskSetManager:58 – Valid locality levels for TaskSet 1608.0: ANY
2018-01-14 03:02:41 DEBUG TaskSchedulerImpl:58 – parentName: , name: TaskSet_1608.0, runningTasks: 0
2018-01-14 03:02:41 INFO  TaskSetManager:54 – Starting task 0.0 in stage 1608.0 (TID 1928, localhost, executor driver, partition 0, ANY, 4741 bytes)
2018-01-14 03:02:41 INFO  Executor:54 – Running task 0.0 in stage 1608.0 (TID 1928)
2018-01-14 03:02:41 DEBUG Executor:58 – Task 1928's epoch is 322
2018-01-14 03:02:41 DEBUG BlockManager:58 – Getting local block broadcast_1930
2018-01-14 03:02:41 TRACE BlockInfoManager:62 – Task 1928 trying to acquire read lock for broadcast_1930
2018-01-14 03:02:41 TRACE BlockInfoManager:62 – Task 1928 acquired read lock for broadcast_1930
2018-01-14 03:02:41 DEBUG BlockManager:58 – Level for block broadcast_1930 is StorageLevel(disk, memory, deserialized, 1 replicas)
2018-01-14 03:02:41 INFO  KafkaRDD:54 – Beginning offset 639 is the same as ending offset skipping mentionsNews 0
2018-01-14 03:02:41 INFO  PythonRunner:54 – Times: total = 41, boot = -297, init = 338, finish = 0
2018-01-14 03:02:42 INFO  PythonRunner:54 – Times: total = 44, boot = -76, init = 120, finish = 0
2018-01-14 03:02:42 INFO  PythonRunner:54 – Times: total = 48, boot = -42, init = 90, finish = 0
2018-01-14 03:02:42 INFO  PythonRunner:54 – Times: total = 48, boot = 36, init = 9, finish = 3
2018-01-14 03:02:42 TRACE BlockInfoManager:62 – Task 1928 releasing lock for broadcast_1930
2018-01-14 03:02:42 INFO  Executor:54 – Finished task 0.0 in stage 1608.0 (TID 1928). 1267 bytes result sent to driver
2018-01-14 03:02:42 DEBUG TaskSchedulerImpl:58 – parentName: , name: TaskSet_1608.0, runningTasks: 0
2018-01-14 03:02:42 INFO  TaskSetManager:54 – Starting task 1.0 in stage 1608.0 (TID 1929, localhost, executor driver, partition 1, ANY, 4743 bytes)
2018-01-14 03:02:42 INFO  TaskSetManager:54 – Finished task 0.0 in stage 1608.0 (TID 1928) in 201 ms on localhost (executor driver) (1/2)
2018-01-14 03:02:42 INFO  Executor:54 – Running task 1.0 in stage 1608.0 (TID 1929)
2018-01-14 03:02:42 DEBUG Executor:58 – Task 1929's epoch is 322
2018-01-14 03:02:42 DEBUG BlockManager:58 – Getting local block broadcast_1930
2018-01-14 03:02:42 TRACE BlockInfoManager:62 – Task 1929 trying to acquire read lock for broadcast_1930
2018-01-14 03:02:42 TRACE BlockInfoManager:62 – Task 1929 acquired read lock for broadcast_1930
2018-01-14 03:02:42 DEBUG BlockManager:58 – Level for block broadcast_1930 is StorageLevel(disk, memory, deserialized, 1 replicas)
2018-01-14 03:02:42 INFO  KafkaRDD:54 – Computing topic mentionsSocial, partition 0 offsets 4815532 -> 4815614
2018-01-14 03:02:42 INFO  VerifiableProperties:68 – Verifying properties
2018-01-14 03:02:42 INFO  VerifiableProperties:68 – Property group.id is overridden to
2018-01-14 03:02:42 INFO  VerifiableProperties:68 – Property zookeeper.connect is overridden to
2018-01-14 03:02:42 DEBUG SimpleConsumer:52 – Disconnecting from nevolin2:9092
2018-01-14 03:02:42 DEBUG BlockingChannel:52 – Created socket with SO_TIMEOUT = 30000 (requested 30000), SO_RCVBUF = 65536 (requested 65536), SO_SNDBUF = 1313280 (requested -1), connectTimeoutMs = 30000.
2018-01-14 03:02:42 TRACE BoundedByteBufferSend:36 – 66 bytes written.
2018-01-14 03:02:42 TRACE BoundedByteBufferReceive:36 – 8192 bytes read.
2018-01-14 03:02:42 TRACE BoundedByteBufferReceive:36 – 8192 bytes read.
2018-01-14 03:02:42 TRACE BoundedByteBufferReceive:36 – 8192 bytes read.
2018-01-14 03:02:42 TRACE BoundedByteBufferReceive:36 – 4314 bytes read.
2018-01-14 03:02:42 DEBUG SimpleConsumer:52 – Disconnecting from nevolin2:9092
2018-01-14 03:02:42 INFO  PythonRunner:54 – Times: total = 44, boot = -44, init = 87, finish = 1
2018-01-14 03:02:42 INFO  PythonRunner:54 – Times: total = 61, boot = 59, init = 0, finish = 2
2018-01-14 03:02:42 INFO  PythonRunner:54 – Times: total = 47, boot = -39, init = 85, finish = 1

2018-01-14 03:03:00 DEBUG RecurringTimer:58 – Callback for JobGenerator called at time 1515898980000
2018-01-14 03:03:00 DEBUG JobGenerator:58 – Got event GenerateJobs(1515898980000 ms)
2018-01-14 03:03:00 DEBUG DStreamGraph:58 – Generating jobs for time 1515898980000 ms
2018-01-14 03:03:00 DEBUG PythonReducedWindowedDStream:58 – Time 1515898980000 ms is valid
2018-01-14 03:03:00 INFO  PythonTransformedDStream:54 – Slicing from 1515898980000 ms to 1515898980000 ms (aligned to 1515898980000 ms and 1515898980000 ms)
2018-01-14 03:03:00 DEBUG PythonTransformedDStream:58 – Time 1515898980000 ms is valid
2018-01-14 03:03:00 DEBUG PythonTransformedDStream:58 – Time 1515898980000 ms is valid
2018-01-14 03:03:00 DEBUG PythonTransformedDStream:58 – Time 1515898980000 ms is valid
2018-01-14 03:03:00 DEBUG DirectKafkaInputDStream:58 – Time 1515898980000 ms is valid
2018-01-14 03:03:00 DEBUG SimpleConsumer:52 – Disconnecting from localhost:9092
2018-01-14 03:03:00 DEBUG BlockingChannel:52 – Created socket with SO_TIMEOUT = 30000 (requested 30000), SO_RCVBUF = 65536 (requested 65536), SO_SNDBUF = 1313280 (requested -1), connectTimeoutMs = 30000.
2018-01-14 03:03:00 TRACE BoundedByteBufferSend:36 – 48 bytes written.
2018-01-14 03:03:00 TRACE BoundedByteBufferReceive:36 – 124 bytes read.
2018-01-14 03:03:00 DEBUG SimpleConsumer:52 – Disconnecting from localhost:9092

From the logs we can see that the last entry at 03:02:42 comes from "PythonRunner". Under normal circumstances, PythonRunner should then emit a "releasing lock ..." statement, but that never happened. It looks like it is either stuck inside the Python code or there is a deadlock.

-- EDIT

When I (kill) the job via the WebUI, I get this exception in the logs:

py4j.protocol.Py4JJavaError: An error occurred while calling o32.awaitTermination.
: org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/streaming/util.py", line 65, in call
    r = self.func(t, *rdds)
  File "/home/nevolin/public_html/proto/consumers/sparkConsumer_A.py", line 209, in <lambda>
    social.foreachRDD(lambda time,rdd: rdd_process(time, rdd, process_rdd_sentimentAnalysis_social) )
  File "/home/nevolin/public_html/proto/consumers/sparkConsumer_A.py", line 39, in rdd_process
    rdd.foreachPartition(lambda parti: func(time, parti))
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 799, in foreachPartition
    self.mapPartitions(func).count()  # Force evaluation
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1041, in count
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1032, in sum
    return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 906, in fold
    vals = self.mapPartitions(func).collect()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 809, in collect
    port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
    format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job 964 cancelled
        at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517)
        at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:1457)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1704)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1687)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1676)
        at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
        at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2094)
        at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)

It seems to get stuck in the collectAndServe method, but why?
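A hedged way to see where the Python side is blocked, without touching the processing logic, is the standard-library faulthandler module (Python 3.3+), registered near the top of sparkConsumer.py:

import faulthandler
import signal
# Dump the Python stack of every thread to stderr when this process receives SIGUSR1.
faulthandler.register(signal.SIGUSR1, all_threads=True)
# When a task hangs:  kill -USR1 <pid of the stuck python worker or the driver>

On the JVM side, jstack <driver pid> would show whether PythonRunner is genuinely waiting on the Python worker or blocked on a lock.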