一周以来,我一直在处理Apache Spark上一个非常烦人的问题。 我通过createDirectStream()把Apache Kafka中的数据流式传输到Apache Spark。 然而,运行几个小时后总会有一个作业(job)卡住(任务进度停在1/2),之后就不再有任何进展。当我在Web UI上点击(kill)时,Spark会崩溃。
直到昨天我用的还是Spark 2.2.0,之后升级到了2.2.1版本,希望能解决这个问题,但并没有。不过,以前通常运行3-4个小时就会卡住,而今天运行了23.7小时才卡住(我认为这得益于版本升级)。我在独立模式下运行Spark(只有一台机器)。
from pyspark import SparkConf,SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import json
import pprint
import sys
import os
sys.path.insert(0, '/home/servername/public_html/keywordproto/')
from mysettings import INTERVAL_SPARK_WINDOW, INTERVAL_SPARK_STREAM, kafkaServerEndPoint, kafkaTopic_A, kafkaTopic_B, logException
# Spark and streaming contexts; the batch interval comes from mysettings.
conf = SparkConf()
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, INTERVAL_SPARK_STREAM)

# One direct Kafka stream subscribed to both topics.
_kafka_topics = [kafkaTopic_A, kafkaTopic_B]
_kafka_params = {"metadata.broker.list": kafkaServerEndPoint}
DS = KafkaUtils.createDirectStream(ssc, _kafka_topics, _kafka_params)


def _keyword_counts_over_window(stream):
    """Map each message to (keyword, 1) and sum per keyword over the window.

    The second function handed to reduceByKeyAndWindow is the inverse of the
    first one (subtraction undoes addition).
    NOTE(review): no filterFunc is passed and no checkpoint directory is set
    in the visible code - confirm both are handled elsewhere.
    """
    ones = stream.map(lambda js: (js['keyword'], 1))
    return ones.reduceByKeyAndWindow(
        lambda x, y: int(x) + int(y),
        lambda x, y: int(x) - int(y),
        INTERVAL_SPARK_WINDOW,
    )


# Every record's value is a JSON document; route it by its 'type' field.
root = DS.map(lambda tup: json.loads(tup[1].strip()))  # kafka sends (k,v) tuple

news = root.filter(lambda js: js['type'] == 'news')
news_b = _keyword_counts_over_window(news)

social = root.filter(lambda js: js['type'] == 'social')
social_b = _keyword_counts_over_window(social)
def process_rdd_news(time, part_iterator):
    """Hand every (keyword, count) record of one partition to the DAL.

    Args:
        time: batch time supplied by ``foreachRDD``.
        part_iterator: iterator over the partition's records; index 0 of a
            record is read as the keyword and index 1 as its count.
    """
    # Imported inside the function body, as in the original snippet.
    import DAL
    from mysettings import logException

    print("----------- %s ----------- news --" % (time,))
    # you may want to create a db connection here and pass it as a parameter in the loop
    client = DAL.openConnection()
    for record in part_iterator:
        keyword, nums = record[0], record[1]
        DAL.store_mentions_news(client, nums, time, keyword)


def _dispatch_news_counts(time, rdd):
    """Run process_rdd_news once per partition of the batch RDD."""
    rdd.foreachPartition(lambda parti: process_rdd_news(time, parti))


news_b.foreachRDD(_dispatch_news_counts)
def process_rdd_social(time, part_iterator):
    """Hand every (keyword, count) record of one partition to the DAL.

    Args:
        time: batch time supplied by ``foreachRDD``.
        part_iterator: iterator over the partition's records; index 0 of a
            record is read as the keyword and index 1 as its count.
    """
    # Imported inside the function body, as in the original snippet.
    import DAL
    from mysettings import logException

    print("----------- %s ----------- social --" % (time,))
    # you may want to create a db connection here and pass it as a parameter in the loop
    client = DAL.openConnection()
    for record in part_iterator:
        keyword, nums = record[0], record[1]
        DAL.store_mentions_social(client, nums, time, keyword)


def _dispatch_social_counts(time, rdd):
    """Run process_rdd_social once per partition of the batch RDD."""
    rdd.foreachPartition(lambda parti: process_rdd_social(time, parti))


social_b.foreachRDD(_dispatch_social_counts)
def remove_nonASCII(str):
    """Return *str* with every character outside ASCII removed."""
    # Characters that cannot be encoded are skipped instead of raising.
    ascii_only = str.encode("ascii", errors="ignore")
    return ascii_only.decode()
def socialMentionScore(body, html, ppr, nltk):
    """Score a social mention; the visible code always returns 1.

    Args:
        body: raw mention text; HTML entities are unescaped first.
        html: object providing ``unescape`` (the stdlib ``html`` module at
            the visible call site).
        ppr: object providing ``clean`` and ``parse``.
        nltk: not used by the visible code.

    Returns:
        The constant 1. The arithmetic that would turn the cleaned/parsed
        text into a score is elided in this snippet.
    """
    unescaped = html.unescape(body)
    cleaned = ppr.clean(unescaped)  # removes urls, etc...
    parsed = ppr.parse(unescaped)
    # ... arithmetic calculations
    return 1  # float : [0.0 ; 1.0]
# Categorise one social mention: tokenise it with nltk.wordpunct_tokenize and,
# for every category in DICT, count how many entries of DICT[cat] occur among
# the lower-cased tokens. Categories with at least one hit end up in
# retobj['cats']; retobj is returned.
# NOTE(review): this snippet looks abridged and cannot run as shown - the
# original indentation is lost, `editedMention` is never assigned, and the
# if/elif near the end have no bodies. Restore the missing lines from the
# full source before relying on it.
def processAndValidateB(mention, keyword, DICT, nltk, dict_Keywords, logException, html, ppr):
# Result skeleton; no visible line ever changes 'keeper' from 0.
retobj = { 'cats':{}, 'mention':mention, 'keeper':0, }
words = nltk.wordpunct_tokenize(mention)
catMatrix = {}
# Per category: number of DICT[cat] entries found among the lower-cased tokens.
for cat in DICT.keys():
count = len(DICT[cat].intersection([w.lower() for w in words]))
if count > 0:
catMatrix[cat] = count
retobj['cats'] = catMatrix
# NOTE(review): editedMention is undefined here; the step that produces it is missing.
retobj['mention'] = editedMention
lenw = len(words)
# First test: none of dict_Keywords[keyword] occurs among the lower-cased tokens.
# Second test: socialMentionScore(...) is below 0.90.
# NOTE(review): both branch bodies are missing; presumably they decide
# retobj['keeper'] - confirm against the full source.
if len(set(dict_Keywords[keyword]).intersection([w.lower() for w in words])) <= 0:
elif socialMentionScore(editedMention, html, ppr, nltk) < 0.90:
return retobj
def processAndValidateA(title, mention, keyword, DICT, nltk, logException):
    """Count, per category, the category words that occur in a news item.

    The title is prepended to the body (separated by a newline), the combined
    text is tokenised with ``nltk.wordpunct_tokenize`` and the lower-cased
    tokens are intersected with each category's word collection.

    Args:
        title: headline text.
        mention: body text.
        keyword: not used by the visible code; kept for the callers.
        DICT: mapping of category -> collection offering ``intersection``.
        nltk: object providing ``wordpunct_tokenize``.
        logException: not used by the visible code; kept for the callers.

    Returns:
        ``{'cats': {category: hits}, 'keeper': 0}`` where only categories
        with at least one hit are listed. 'keeper' is always 0 in the
        visible code.
    """
    tokens = nltk.wordpunct_tokenize(title + "\n" + mention)
    # Lower-case once up front instead of once per category.
    lowered = {w.lower() for w in tokens}

    cats = {}
    for cat, vocabulary in DICT.items():
        hits = len(vocabulary.intersection(lowered))
        if hits > 0:
            cats[cat] = hits
    return {'cats': cats, 'keeper': 0}
def process_rdd_methodB(time, part_iterator):
    """Validate the social mentions of one partition and persist the results.

    Every record is run through ``processAndValidateB``. Its per-category
    counts are summed per keyword, and records flagged with ``keeper == 1``
    are collected (with their body replaced by the processed mention) and
    passed to ``DAL.store_items_bulk``. The per-keyword sums go to
    ``DAL.store_scores_social``.

    Args:
        time: batch time supplied by ``foreachRDD``.
        part_iterator: iterator over dict records with at least the keys
            'keyword' and 'body'.
    """
    # NOTE: the original author suspected this function of being behind the
    # stuck Spark jobs.
    import DAL
    from WordStatDict import DICT  # this is a dict with 4k values
    import nltk
    from mysettings import dict_Keywords, logException
    import html
    import preprocessor as ppr

    print("----------- %s ----------- methodB --" % str(time))
    keepers = []
    keyword_scores = {}
    # NOTE(review): the connection is never closed in this function; confirm
    # that DAL pools or reuses it.
    client = DAL.openConnection()
    for part in part_iterator:
        keyword = part["keyword"]
        body = part["body"]
        result = processAndValidateB(body, keyword, DICT, nltk, dict_Keywords, logException, html, ppr)

        # Sum the category hits per keyword (an entry is created even when
        # the record produced no hits, as before).
        scores = keyword_scores.setdefault(keyword, {})
        for cat, hits in result['cats'].items():
            scores[cat] = scores.get(cat, 0) + hits

        if result['keeper'] == 1:
            part["body"] = result['mention']  # the text has changed due to NLP
            # Previously nothing was ever appended, which left `keepers`
            # empty and store_items_bulk unreachable.
            keepers.append(part)

    print("keepers: " + str(len(keepers)))
    if keepers:
        DAL.store_items_bulk(client, keepers, time)
    for keyword, slugs in keyword_scores.items():
        if slugs:
            DAL.store_scores_social(client, slugs, time, keyword)


social.foreachRDD(lambda time, rdd: rdd.foreachPartition(lambda parti: process_rdd_methodB(time, parti)))
def process_rdd_methodA(time, part_iterator):
    """Validate the news items of one partition and persist the results.

    Every record is run through ``processAndValidateA``. Its per-category
    counts are summed per keyword, and records flagged with ``keeper == 1``
    are collected (with their 'body' entry removed) and passed to
    ``DAL.store_items_bulk``. The per-keyword sums go to
    ``DAL.store_scores_news``.

    Args:
        time: batch time supplied by ``foreachRDD``.
        part_iterator: iterator over dict records with at least the keys
            'keyword', 'body' and 'title'.
    """
    # NOTE: the original author suspected this function of being behind the
    # stuck Spark jobs.
    import DAL
    from WordStatDict import DICT  # this is a dict with 4k values
    import nltk
    from mysettings import logException

    print("----------- %s ----------- methodA --" % str(time))
    keepers = []
    keyword_scores = {}
    # NOTE(review): the connection is never closed in this function; confirm
    # that DAL pools or reuses it.
    client = DAL.openConnection()
    for part in part_iterator:
        keyword = part["keyword"]
        body = part["body"]
        title = part["title"]
        result = processAndValidateA(title, body, keyword, DICT, nltk, logException)

        # Sum the category hits per keyword (an entry is created even when
        # the record produced no hits, as before).
        scores = keyword_scores.setdefault(keyword, {})
        for cat, hits in result['cats'].items():
            scores[cat] = scores.get(cat, 0) + hits

        if result['keeper'] == 1:
            del part['body']
            # Previously nothing was ever appended, which left `keepers`
            # empty and store_items_bulk unreachable.
            keepers.append(part)

    print("keepers: " + str(len(keepers)))
    if keepers:
        DAL.store_items_bulk(client, keepers, time)
    for keyword, slugs in keyword_scores.items():
        if slugs:
            DAL.store_scores_news(client, slugs, time, keyword)


news.foreachRDD(lambda time, rdd: rdd.foreachPartition(lambda parti: process_rdd_methodA(time, parti)))
# http://spark.apache.org/docs/latest/streaming-programming-guide.html#design-patterns-for-using-foreachrdd
# note: mongodb has a built-in connection pool
这就是我启动Apache Spark的方式:
def startProc(inp):
    """Start the shell command *inp* in the background and discard its output.

    The command string is run through the shell with stdout and stderr sent
    to DEVNULL, and ``os.setpgrp`` is called in the child before exec. The
    Popen handle is not kept and nothing is returned.
    """
    print(' '+" starting...")
    popen_options = dict(
        shell=True,
        stdout=subprocess.DEVNULL,  # PIPE / DEVNULL / STDOUT
        stderr=subprocess.DEVNULL,
        preexec_fn=os.setpgrp,
    )
    subprocess.Popen(inp, **popen_options)


startProc(
    'spark-submit --driver-memory 1g --executor-memory 10g --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.1 sparkConsumer.py'
)
我试过很多参数,例如增加驱动程序(driver)和执行器(executor)的内存,但没有任何效果。 也许我的代码中存在内存泄漏?我既试过把代码做成库并通过添加py文件的方式引入,也试过把所有代码都放进sparkConsumer.py —— 但两种方式没有任何区别。
从我的截图中可以看出,每个批次都在2秒内完成,每20秒只有大约100-200条记录,数据量非常小。所以我不明白问题出在哪里...... 我也拿不到任何Spark日志,哪里都找不到。我可以把stdout / stderr重定向(PIPE)到一个文件,但里面只有Spark启动时的INFO日志和我自己的print(...)输出。仅供参考,我使用的是PySpark。
也许是Web UI造成了内存泄漏?它里面有很多条目,我想知道这些条目存储在哪里?希望不是在内存里...... 仅供参考,我是在实时处理数据,所以不需要长时间保存任何RDD,但Spark可能会把它们保存在内存(缓存之类)中,导致JVM堆空间被占满?不过我在任何地方都没有看到JVM OOM / Heap错误。
我还能尝试些什么来解决这个问题? 如果这是OOM问题,那么我也许可以写段代码每分钟发送数千条消息,让作业更快失败。但我需要能够调试/分析系统的方法 :)
答案 0 :(得分:0)
以下是有关情况的最新消息。 我换了一台新服务器,全新安装了Spark 2.2.1(以及Kafka、Zookeeper),并迁移了我自己的文件。经过几个小时的测试和调整,我消除了python文件中的一些内存泄漏(与Spark无关),它们是导致OOM的原因。
不幸的是,Spark仍然不断崩溃。 有一次我在两台服务器上同时运行Spark,有趣的是,它们在完全相同的时刻(同一小时、同一分钟、同一秒)“卡住”了。这有多疯狂?只有当它们依赖同一个东西时才会发生这种情况。 在我的例子中,Kafka生产者从外部API获取数据,然后我在Spark中使用这些数据。我尝试过各种各样的办法想让Spark自己崩溃,但都没有奏效,Spark始终运行得很稳。 因此,它不太可能是因为某些数据而卡住的......
但由于Zookeeper和/或Kafka,它可能已经崩溃了。 Kafka每X分钟清理一次日志文件并更新偏移量。 我不确定它对Spark中的DirectStream有什么影响。但我有一种感觉,Kafka / Zookeeper是导致这些工作永远“卡住”的原因。
有关如何调试这个问题的任何想法? PS:现在我已经为Spark启用了DEBUG级别的日志,并开启了JVM的GC日志(包括Kafka + Zookeeper)。再次发生卡住时,我会用日志更新此主题。
答案 1 :(得分:0)
好的,昨天Spark在运行一个小时后“卡住”了。 我在下面粘贴了最相关的日志。 以下是Spark UI中卡住作业的截图:
2018-01-14 03:02:41 INFO MemoryStore:54 – Block broadcast_1930 stored as values in memory (estimated size 16.6 KB, free 429.6 MB)
2018-01-14 03:02:41 DEBUG BlockManager:58 – Put block broadcast_1930 locally took 0 ms
2018-01-14 03:02:41 TRACE BlockInfoManager:62 – Task -1024 releasing lock for broadcast_1930
2018-01-14 03:02:41 DEBUG BlockManager:58 – Putting block broadcast_1930 without replication took 1 ms
2018-01-14 03:02:41 TRACE BlockInfoManager:62 – Task -1024 trying to put broadcast_1930_piece0
2018-01-14 03:02:41 TRACE BlockInfoManager:62 – Task -1024 trying to acquire read lock for broadcast_1930_piece0
2018-01-14 03:02:41 TRACE BlockInfoManager:62 – Task -1024 trying to acquire write lock for broadcast_1930_piece0
2018-01-14 03:02:41 TRACE BlockInfoManager:62 – Task -1024 acquired write lock for broadcast_1930_piece0
2018-01-14 03:02:41 INFO MemoryStore:54 – Block broadcast_1930_piece0 stored as bytes in memory (estimated size 8.7 KB, free 429.6 MB)
2018-01-14 03:02:41 INFO BlockManagerInfo:54 – Added broadcast_1930_piece0 in memory on (size: 8.7 KB, free: 434.0 MB)
2018-01-14 03:02:41 DEBUG BlockManagerMaster:58 – Updated info of block broadcast_1930_piece0
2018-01-14 03:02:41 DEBUG BlockManager:58 – Told master about block broadcast_1930_piece0
2018-01-14 03:02:41 DEBUG BlockManager:58 – Put block broadcast_1930_piece0 locally took 1 ms
2018-01-14 03:02:41 TRACE BlockInfoManager:62 – Task -1024 releasing lock for broadcast_1930_piece0
2018-01-14 03:02:41 DEBUG BlockManager:58 – Putting block broadcast_1930_piece0 without replication took 1 ms
2018-01-14 03:02:41 INFO SparkContext:54 – Created broadcast 1930 from broadcast at DAGScheduler.scala:1006
2018-01-14 03:02:41 INFO DAGScheduler:54 – Submitting 2 missing tasks from ResultStage 1608 (PythonRDD[4989] at call at /usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py:2230) (first 15 tasks are for partitions Vector(0, 1))
2018-01-14 03:02:41 INFO TaskSchedulerImpl:54 – Adding task set 1608.0 with 2 tasks
2018-01-14 03:02:41 DEBUG TaskSetManager:58 – Epoch for TaskSet 1608.0: 322
2018-01-14 03:02:41 DEBUG TaskSetManager:58 – Valid locality levels for TaskSet 1608.0: ANY
2018-01-14 03:02:41 DEBUG TaskSchedulerImpl:58 – parentName: , name: TaskSet_1608.0, runningTasks: 0
2018-01-14 03:02:41 INFO TaskSetManager:54 – Starting task 0.0 in stage 1608.0 (TID 1928, localhost, executor driver, partition 0, ANY, 4741 bytes)
2018-01-14 03:02:41 INFO Executor:54 – Running task 0.0 in stage 1608.0 (TID 1928)
2018-01-14 03:02:41 DEBUG Executor:58 – Task 1928's epoch is 322
2018-01-14 03:02:41 DEBUG BlockManager:58 – Getting local block broadcast_1930
2018-01-14 03:02:41 TRACE BlockInfoManager:62 – Task 1928 trying to acquire read lock for broadcast_1930
2018-01-14 03:02:41 TRACE BlockInfoManager:62 – Task 1928 acquired read lock for broadcast_1930
2018-01-14 03:02:41 DEBUG BlockManager:58 – Level for block broadcast_1930 is StorageLevel(disk, memory, deserialized, 1 replicas)
2018-01-14 03:02:41 INFO KafkaRDD:54 – Beginning offset 639 is the same as ending offset skipping mentionsNews 0
2018-01-14 03:02:41 INFO PythonRunner:54 – Times: total = 41, boot = -297, init = 338, finish = 0
2018-01-14 03:02:42 INFO PythonRunner:54 – Times: total = 44, boot = -76, init = 120, finish = 0
2018-01-14 03:02:42 INFO PythonRunner:54 – Times: total = 48, boot = -42, init = 90, finish = 0
2018-01-14 03:02:42 INFO PythonRunner:54 – Times: total = 48, boot = 36, init = 9, finish = 3
2018-01-14 03:02:42 TRACE BlockInfoManager:62 – Task 1928 releasing lock for broadcast_1930
2018-01-14 03:02:42 INFO Executor:54 – Finished task 0.0 in stage 1608.0 (TID 1928). 1267 bytes result sent to driver
2018-01-14 03:02:42 DEBUG TaskSchedulerImpl:58 – parentName: , name: TaskSet_1608.0, runningTasks: 0
2018-01-14 03:02:42 INFO TaskSetManager:54 – Starting task 1.0 in stage 1608.0 (TID 1929, localhost, executor driver, partition 1, ANY, 4743 bytes)
2018-01-14 03:02:42 INFO TaskSetManager:54 – Finished task 0.0 in stage 1608.0 (TID 1928) in 201 ms on localhost (executor driver) (1/2)
2018-01-14 03:02:42 INFO Executor:54 – Running task 1.0 in stage 1608.0 (TID 1929)
2018-01-14 03:02:42 DEBUG Executor:58 – Task 1929's epoch is 322
2018-01-14 03:02:42 DEBUG BlockManager:58 – Getting local block broadcast_1930
2018-01-14 03:02:42 TRACE BlockInfoManager:62 – Task 1929 trying to acquire read lock for broadcast_1930
2018-01-14 03:02:42 TRACE BlockInfoManager:62 – Task 1929 acquired read lock for broadcast_1930
2018-01-14 03:02:42 DEBUG BlockManager:58 – Level for block broadcast_1930 is StorageLevel(disk, memory, deserialized, 1 replicas)
2018-01-14 03:02:42 INFO KafkaRDD:54 – Computing topic mentionsSocial, partition 0 offsets 4815532 -> 4815614
2018-01-14 03:02:42 INFO VerifiableProperties:68 – Verifying properties
2018-01-14 03:02:42 INFO VerifiableProperties:68 – Property group.id is overridden to
2018-01-14 03:02:42 INFO VerifiableProperties:68 – Property zookeeper.connect is overridden to
2018-01-14 03:02:42 DEBUG SimpleConsumer:52 – Disconnecting from nevolin2:9092
2018-01-14 03:02:42 DEBUG BlockingChannel:52 – Created socket with SO_TIMEOUT = 30000 (requested 30000), SO_RCVBUF = 65536 (requested 65536), SO_SNDBUF = 1313280 (requested -1), connectTimeoutMs = 30000.
2018-01-14 03:02:42 TRACE BoundedByteBufferSend:36 – 66 bytes written.
2018-01-14 03:02:42 TRACE BoundedByteBufferReceive:36 – 8192 bytes read.
2018-01-14 03:02:42 TRACE BoundedByteBufferReceive:36 – 8192 bytes read.
2018-01-14 03:02:42 TRACE BoundedByteBufferReceive:36 – 8192 bytes read.
2018-01-14 03:02:42 TRACE BoundedByteBufferReceive:36 – 4314 bytes read.
2018-01-14 03:02:42 DEBUG SimpleConsumer:52 – Disconnecting from nevolin2:9092
2018-01-14 03:02:42 INFO PythonRunner:54 – Times: total = 44, boot = -44, init = 87, finish = 1
2018-01-14 03:02:42 INFO PythonRunner:54 – Times: total = 61, boot = 59, init = 0, finish = 2
2018-01-14 03:02:42 INFO PythonRunner:54 – Times: total = 47, boot = -39, init = 85, finish = 1
2018-01-14 03:03:00 DEBUG RecurringTimer:58 – Callback for JobGenerator called at time 1515898980000
2018-01-14 03:03:00 DEBUG JobGenerator:58 – Got event GenerateJobs(1515898980000 ms)
2018-01-14 03:03:00 DEBUG DStreamGraph:58 – Generating jobs for time 1515898980000 ms
2018-01-14 03:03:00 DEBUG PythonReducedWindowedDStream:58 – Time 1515898980000 ms is valid
2018-01-14 03:03:00 INFO PythonTransformedDStream:54 – Slicing from 1515898980000 ms to 1515898980000 ms (aligned to 1515898980000 ms and 1515898980000 ms)
2018-01-14 03:03:00 DEBUG PythonTransformedDStream:58 – Time 1515898980000 ms is valid
2018-01-14 03:03:00 DEBUG PythonTransformedDStream:58 – Time 1515898980000 ms is valid
2018-01-14 03:03:00 DEBUG PythonTransformedDStream:58 – Time 1515898980000 ms is valid
2018-01-14 03:03:00 DEBUG DirectKafkaInputDStream:58 – Time 1515898980000 ms is valid
2018-01-14 03:03:00 DEBUG SimpleConsumer:52 – Disconnecting from localhost:9092
2018-01-14 03:03:00 DEBUG BlockingChannel:52 – Created socket with SO_TIMEOUT = 30000 (requested 30000), SO_RCVBUF = 65536 (requested 65536), SO_SNDBUF = 1313280 (requested -1), connectTimeoutMs = 30000.
2018-01-14 03:03:00 TRACE BoundedByteBufferSend:36 – 48 bytes written.
2018-01-14 03:03:00 TRACE BoundedByteBufferReceive:36 – 124 bytes read.
2018-01-14 03:03:00 DEBUG SimpleConsumer:52 – Disconnecting from localhost:9092
从日志中可以看到,03:02:42的最后一个条目来自“PythonRunner”。 在正常情况下,PythonRunner之后应该会出现一条“releasing lock ...”(释放锁)的日志,但这里始终没有出现。看起来要么是卡在了Python代码里,要么是存在死锁。
- 编辑
py4j.protocol.Py4JJavaError: An error occurred while calling o32.awaitTermination.
: org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/streaming/util.py", line 65, in call
r = self.func(t, *rdds)
File "/home/nevolin/public_html/proto/consumers/sparkConsumer_A.py", line 209, in <lambda>
social.foreachRDD(lambda time,rdd: rdd_process(time, rdd, process_rdd_sentimentAnalysis_social) )
File "/home/nevolin/public_html/proto/consumers/sparkConsumer_A.py", line 39, in rdd_process
rdd.foreachPartition(lambda parti: func(time, parti))
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 799, in foreachPartition
self.mapPartitions(func).count() # Force evaluation
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1041, in count
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1032, in sum
return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 906, in fold
vals = self.mapPartitions(func).collect()
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 809, in collect
port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job 964 cancelled
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517)
at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:1457)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1704)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1687)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1676)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2094)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)