I have a PySpark application that loads data from Kinesis and saves it to S3.
The processing time of each batch is fairly stable for a while, but then the job gets stuck. How can I figure out why this happens?
Code example:
import json
import sys
from datetime import datetime

from pyspark import SparkContext, StorageLevel
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.types import TimestampType
from pyspark.streaming import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

# schema, path_prefix, db_columns, db_connection_string,
# db_connection_propetries and checkpoint_prefix are defined elsewhere.
columns = [x.name for x in schema]
Event = Row(*[x[0] for x in columns])


def get_spark_session_instance(sparkConf):
    # Lazily create a single SparkSession shared by all batches.
    if "sparkSessionSingletonInstance" not in globals():
        globals()["sparkSessionSingletonInstance"] = SparkSession \
            .builder \
            .config(conf=sparkConf) \
            .getOrCreate()
    return globals()["sparkSessionSingletonInstance"]


def creating_func():
    def timing(message):
        print('timing', str(datetime.utcnow()), message)

    def process_game(df, game, time_part):
        # s3
        df.write.json("{}/{}/{}/{}".format(path_prefix, game, 'group_1', time_part),
                      compression="gzip", timestampFormat="yyyy-MM-dd'T'HH:mm:ss.SSS")
        timing('{}_group_1'.format(game))
        df[df['group'] == 2] \
            .write.json("{}/{}/{}/{}".format(path_prefix, game, 'group_2', time_part),
                        compression="gzip", timestampFormat="yyyy-MM-dd'T'HH:mm:ss.SSS")
        timing('{}_group_2'.format(game))
        # database
        df[df['group'] == 3].select(*db_columns) \
            .write.jdbc(db_connection_string, table="test.{}group_3".format(game),
                        mode='append', properties=db_connection_propetries)
        timing('{}_db'.format(game))

    def event_to_row(event):
        event_dict = json.loads(event)
        event_dict['json_data'] = event_dict.get('json_data') and json.dumps(
            event_dict.get('json_data'))
        return Event(*[event_dict.get(x) for x in columns])

    def process(rdd):
        if not rdd.isEmpty():
            spark_time = datetime.utcnow().strftime('%Y/%m/%d/%H/%M%S_%f')
            rows_rdd = rdd.map(event_to_row)
            spark = get_spark_session_instance(rdd.context.getConf())
            df = spark.createDataFrame(data=rows_rdd, schema=schema)
            df = df.withColumn("ts", df["ts"].cast(TimestampType())) \
                .withColumn("processing_time", lit(datetime.utcnow()))
            df.cache()
            print('timing -----------------------------')
            process_game(df[df['app_id'] == 1], 'app_1', spark_time)
            process_game(df[df['app_id'] == 2], 'app_2', spark_time)

    sc = SparkContext.getOrCreate()
    ssc = StreamingContext(sc, 240)
    kinesis_stream = KinesisUtils.createStream(
        ssc, sys.argv[2], 'My-stream-name', "kinesis.us-east-1.amazonaws.com",
        'us-east-1', InitialPositionInStream.TRIM_HORIZON, 240,
        StorageLevel.MEMORY_AND_DISK_2)
    kinesis_stream.repartition(16 * 3).foreachRDD(process)
    ssc.checkpoint(checkpoint_prefix + sys.argv[1])
    return ssc


if __name__ == '__main__':
    print('timing', 'cast ts', str(datetime.utcnow()))
    ssc = StreamingContext.getActiveOrCreate(checkpoint_prefix + sys.argv[1],
                                             creating_func)
    ssc.start()
    ssc.awaitTermination()
Answer 0 (score: 0)
Identify which process is spending the time, and use kill -QUIT or jstack to capture stack traces. Look through the source code for likely delays, and think about where you could add log4j logging to get more information.
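As a concrete illustration of the logging suggestion, one option on the Python side is to route the driver's timing messages through the JVM's log4j via the Py4J gateway, so they land in the same driver log as Spark's own entries. A minimal sketch, assuming an active SparkSession named spark; the logger name streaming-timing is made up for this example, and _jvm is an internal handle rather than public API:

from datetime import datetime

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Reach the JVM's log4j through the Py4J gateway (_jvm is internal API).
log4j = spark.sparkContext._jvm.org.apache.log4j
logger = log4j.LogManager.getLogger("streaming-timing")  # illustrative name

def timing(message):
    # INFO-level entries appear in the driver log next to Spark's output.
    logger.info("timing {} {}".format(datetime.utcnow(), message))

timing("app_1_group_1")

For the stack traces themselves, running jstack <pid> against the driver or executor JVM (or kill -QUIT <pid>, which dumps all thread stacks to the process's stdout) a few times while a batch is stuck usually shows which write it is blocked in.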
Does the delay grow with the amount of data written? If so, it's the usual "rename is really a copy" problem that S3 has.
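If the slowdown does track output volume, one mitigation worth trying (my addition here, not something the answer prescribes) is the v2 file output committer algorithm, which performs fewer renames at commit time; on S3 every rename is still a server-side copy, so this reduces rather than removes the cost. A sketch of setting it when building the context:

from pyspark import SparkConf, SparkContext

# Assumption: v2 of the Hadoop file output committer does fewer renames
# during job commit, shortening the copy-heavy commit phase on S3.
conf = SparkConf().set(
    "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2")
sc = SparkContext.getOrCreate(conf=conf)

Comparing batch durations before and after such a change, against the bytes written per batch, would confirm whether commit-time copying is actually where the time goes.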