I need to read gz files from local / HDFS / Kafka, then decompress and parse them. Does anyone have experience with this?
Or with other formats such as bin.tar.gz?
Answer 0 (score: 0)
You can use sc.binaryFiles to read binary files and do whatever you like with the content bytes.
As for tar.gz, see Read whole text files from a compression in Spark.
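A minimal sketch of that approach (the path and the assumption that each archive member is wanted as raw bytes are mine, not from the answer): sc.binaryFiles yields (path, bytes) pairs, and each tar.gz can be unpacked in memory with tarfile.

import io
import tarfile

# Each element is a (path, file_content_as_bytes) pair; one archive per element.
archives = sc.binaryFiles("hdfs:///data/*.tar.gz")

def extract_members(content):
    # Open the archive from memory and yield (member_name, member_bytes) pairs.
    with tarfile.open(fileobj=io.BytesIO(content), mode="r:gz") as tar:
        for member in tar.getmembers():
            f = tar.extractfile(member)
            if f is not None:
                yield member.name, f.read()

files = archives.flatMap(lambda kv: extract_members(kv[1]))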
Answer 1 (score: 0)
This is what I did:
1. Read the binary data: data = sc.binaryFiles(path)
2. Extract the content:
import gzip
import io

def ungzip(df):
    # Gunzip the raw bytes of one file entirely in memory.
    compressed_file = io.BytesIO(df)
    decompressed_file = gzip.GzipFile(fileobj=compressed_file)
    return decompressed_file.read()

data = data.map(lambda x: (x[0], ungzip(x[1])))
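As a quick local sanity check (my addition, not part of the original answer), ungzip simply reverses gzip.compress on a byte string:

original = b"hello spark"
assert ungzip(gzip.compress(original)) == original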
def _VarintDecoder(mask):
    # Build a decoder for base-128 varints (the length prefixes used between protobuf messages).
    def DecodeVarint(buffer, pos):
        result = 0
        shift = 0
        while True:
            if pos > len(buffer) - 1:
                raise IndexError("Not enough data to decode varint")
            b = buffer[pos]  # indexing bytes yields an int on Python 3
            result |= (b & 0x7f) << shift
            pos += 1
            if not (b & 0x80):
                result &= mask
                return result, pos
            shift += 7
            if shift >= 64:
                raise ValueError("Too many bytes when decoding varint.")
    return DecodeVarint
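To see what the decoder does, here is a small worked example of my own: a varint carries 7 payload bits per byte, low bits first, with the high bit as a continuation flag, so the bytes 0xAC 0x02 decode to 0x2C + (0x02 << 7) = 300, and the next read position is 2.

decode = _VarintDecoder((1 << 64) - 1)
value, new_pos = decode(b"\xac\x02", 0)
assert (value, new_pos) == (300, 2)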
def parse_binary(data):
    # Split one ungzipped blob into individual length-delimited protobuf payloads.
    decoder = _VarintDecoder((1 << 64) - 1)
    next_pos, pos = 0, 0
    messages = []
    try:
        while True:
            # Each message is prefixed with a varint giving its length in bytes.
            next_pos, pos = decoder(data[1], pos)
            messages.append((data[0], data[1][pos:pos + next_pos]))
            pos += next_pos
    except IndexError:
        # Raised by the decoder once the buffer is exhausted.
        return messages

data = data.flatMap(parse_binary)
After this you have one protobuf message per row, and you can apply your protobuf parsing function in parallel.
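A hedged sketch of that last step; MyMessage and the generated module name are placeholders for whatever your compiled .proto provides:

# Hypothetical module generated by protoc from your .proto file.
# from my_messages_pb2 import MyMessage

def parse_message(record):
    path, raw = record
    msg = MyMessage()
    msg.ParseFromString(raw)  # raw is one length-delimited payload from parse_binary
    return path, msg

parsed = data.map(parse_message)

Depending on the message type, you may prefer to return just the fields you need instead of the whole message object, so Spark can serialize the records cheaply.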