I am trying to set up real-time Kafka ingestion into HBase with PySpark. With the code below, a single executor ends up opening far too many ZooKeeper connections (900+), which brings down my ZooKeeper cluster and hits the open-file limit ("too many open files").
def process_rdd(t, rdd):
    # In direct mode with user-managed offsets, persist the Kafka offsets
    # for this micro-batch ourselves.
    if config.isDirectModel() and config.isOnUserOffsetManage():
        offsetRanges = rdd.offsetRanges()
        offset_counter = GetOffsetCounter(config)
        offset_counter.save_offset(config["_group_id"], config["Version"], offsetRanges)
        print("save kafka offset done!")

    # Build the HBase output configuration (output table, ZooKeeper quorum, etc.).
    hbase_conf_ = config.get('HBaseConf', {})
    hbase_conf = RenderConfigAttribute(hbase_conf_, "hbase.mapred.outputtable")

    # Reformat each Kafka event into HBase Puts and write the batch out.
    (rdd.map(lambda event: convertor(event, config, 'hbase_reformat'))
        .filter(lambda x: x is not None)
        .flatMap(lambda x: x)
        .saveAsNewAPIHadoopDataset(
            keyConverter=("org.apache.spark.examples.pythonconverters."
                          "StringToImmutableBytesWritableConverter"),
            valueConverter=("org.apache.spark.examples.pythonconverters."
                            "StringListToPutConverter"),
            conf=hbase_conf))

kafka_stream.foreachRDD(process_rdd)
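For context, the creation of kafka_stream is not shown above. Since the job checks config.isDirectModel() and calls rdd.offsetRanges(), it is presumably a direct (receiver-less) stream created with KafkaUtils.createDirectStream; a minimal sketch of that setup follows, where the app name, batch interval, topic, and broker list are placeholders rather than values from the original job:

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="kafka_to_hbase")          # placeholder app name
ssc = StreamingContext(sc, 10)                        # placeholder 10s batch interval

# Direct stream: offsets are tracked by the job itself, which is why
# process_rdd saves them explicitly via GetOffsetCounter.
kafka_stream = KafkaUtils.createDirectStream(
    ssc,
    ["my_topic"],                                     # placeholder topic
    {"metadata.broker.list": "broker1:9092"})         # placeholder broker list

kafka_stream.foreachRDD(process_rdd)
ssc.start()
ssc.awaitTermination()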
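Likewise, HBaseConf and RenderConfigAttribute are project-specific helpers; the conf handed to saveAsNewAPIHadoopDataset presumably ends up looking like the standard TableOutputFormat configuration from the Spark HBase examples, which is where each executor gets the ZooKeeper quorum it connects to. A sketch under that assumption (quorum hosts and table name are placeholders):

# Roughly what the rendered HBase output configuration is assumed to contain.
hbase_conf = {
    "hbase.zookeeper.quorum": "zk1,zk2,zk3",          # placeholder ZK quorum
    "hbase.mapred.outputtable": "my_table",           # placeholder table name
    "mapreduce.outputformat.class":
        "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
    "mapreduce.job.output.key.class":
        "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
    "mapreduce.job.output.value.class":
        "org.apache.hadoop.io.Writable",
}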