In Kafka Connect HDFS we have the SequenceFileWriter.java class below, used to write Kafka messages in SequenceFileFormat.
import java.io.IOException;
import io.confluent.connect.avro.AvroData;
import io.confluent.connect.hdfs.RecordWriter;
import io.confluent.connect.hdfs.RecordWriterProvider;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.kafka.connect.sink.SinkRecord;
/**
* Provider of a Sequence File record writer.
*/
public class SequenceFileWriterProvider implements RecordWriterProvider {

  @Override
  public String getExtension() {
    return "";
  }

  @Override
  public RecordWriter<SinkRecord> getRecordWriter(Configuration conf, String fileName, SinkRecord record, AvroData avroData) throws IOException {
    Path path = new Path(fileName);
    SequenceFile.Writer.Option optPath = SequenceFile.Writer.file(path);
    SequenceFile.Writer.Option optKey = SequenceFile.Writer.keyClass(LongWritable.class);
    SequenceFile.Writer.Option optVal = SequenceFile.Writer.valueClass(Text.class);
    SequenceFile.Writer.Option optCodec = SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new BZip2Codec());
    final SequenceFile.Writer writer = SequenceFile.createWriter(conf, optPath, optKey, optVal, optCodec);

    return new RecordWriter<SinkRecord>() {
      @Override
      public void write(SinkRecord record) throws IOException {
        writer.append(
            new LongWritable(System.currentTimeMillis()),
            new Text((byte[]) record.value())
        );
      }

      @Override
      public void close() throws IOException {
        writer.close();
      }
    };
  }
}
We run Confluent 5.0.0 in Docker containers managed by Kubernetes. We have observed that when we delete the replication controller in k8s that runs the Kafka connector and then recreate it, some sequence files become corrupted. We have a Spark job that reads this data with a SequenceFileReader and hits the EOFException below. We also see two extra bytes at the end of the corrupted files. We suspect a problem with the SequenceFileWriter and need help validating the writer. Any help would be appreciated. Thanks.
java.io.EOFException
at java.io.DataInputStream.readByte(DataInputStream.java:267)
at org.apache.hadoop.io.WritableUtils.readVLong(WritableUtils.java:308)
at org.apache.hadoop.io.WritableUtils.readVInt(WritableUtils.java:329)
at org.apache.hadoop.io.SequenceFile$Reader.readBuffer(SequenceFile.java:2160)
at org.apache.hadoop.io.SequenceFile$Reader.seekToCurrentValue(SequenceFile.java:2227)
at org.apache.hadoop.io.SequenceFile$Reader.getCurrentValue(SequenceFile.java:2263)
at org.apache.hadoop.io.SequenceFile$Reader.next(SequenceFile.java:2394)
at badSequenceFile.readSequenceFile(badSequenceFile.java:27)
at badSequenceFile.main(badSequenceFile.java:345)
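For context, a stripped-down reader along the lines of that Spark-side job would look roughly like the sketch below (the class name, path handling, and output are illustrative assumptions, not the actual badSequenceFile code). It reads the same LongWritable/Text pairs the writer emits, and next() is where the EOFException surfaces on a truncated file:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileDump {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path(args[0]); // path to a suspect sequence file
    try (SequenceFile.Reader reader =
             new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
      LongWritable key = new LongWritable();
      Text value = new Text();
      long count = 0;
      // next() throws EOFException when it hits the truncated/corrupted tail
      // described above, instead of returning false.
      while (reader.next(key, value)) {
        count++;
      }
      System.out.println("Read " + count + " records from " + path);
    }
  }
}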
Note: when we delete the connector temporary files (+tmp) before starting the k8s replication controller, the connector starts cleanly and does not produce bad files.
Answer 0 (score: 0)
Modifying writer.append to handle exceptions appears to have resolved the problem of writing bad sequence files with an incorrect end-of-file (EOF) marker. In addition, the record value is now converted from a byte array to a String before being written.
// Assumes an SLF4J-style logger field is defined on the provider class, e.g.:
// private static final Logger logger = LoggerFactory.getLogger(SequenceFileWriterProvider.class);
return new RecordWriter<SinkRecord>() {
  @Override
  public void write(SinkRecord record) {
    if (record != null) {
      byte[] text = (byte[]) record.value();
      try {
        writer.append(
            new LongWritable(System.currentTimeMillis()),
            new Text(new String(text))
        );
      } catch (Exception e) {
        logger.error("Exception encountered: " + e + " for text: " + new String(text));
      }
    }
  }

  @Override
  public void close() throws IOException {
    writer.close();
  }
};
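A note on this approach: because the exception is caught and only logged, a record whose append fails is dropped silently. If dropping records is not acceptable, a variant could rethrow the failure (for example as a ConnectException) so it is surfaced to the Connect framework instead of being lost. A rough sketch of that alternative write method, assuming the same writer and record handling as above:

@Override
public void write(SinkRecord record) {
  if (record == null) {
    return;
  }
  byte[] text = (byte[]) record.value();
  try {
    writer.append(new LongWritable(System.currentTimeMillis()), new Text(new String(text)));
  } catch (IOException e) {
    // Propagate instead of swallowing so a failed append is surfaced to the framework.
    throw new org.apache.kafka.connect.errors.ConnectException("Failed to append record", e);
  }
}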