Hortonworks Schema Registry + Nifi + Java:反序列化Nifi记录

时间:2020-02-19 13:33:26

标签: java apache-kafka apache-nifi avro hortonworks-dataflow

我正在尝试使用Hortonworks Schema Registry反序列化由Nifi序列化的一些Kafka消息

  • 在Nifi端用作RecordWritter的处理器:AvroRecordSetWriter
  • 模式编写策略:HWX内容编码模式参考

我能够在其他Nifi kafka消费者中反序列化这些消息。但是,我正在尝试在Flink应用程序中使用Java代码来反序列化它们。

我的Flink应用程序的Kafka反序列化器(deserializer)处理程序中包含以下内容:

// Resolve the Hortonworks SchemaRegistryClient configuration key names
// (cache sizes / expiry intervals and the registry URL) from the client's
// Configuration enum, so the property names stay in sync with the library.
final String SCHEMA_REGISTRY_CACHE_SIZE_KEY = SchemaRegistryClient.Configuration.CLASSLOADER_CACHE_SIZE.name();
final String SCHEMA_REGISTRY_CACHE_EXPIRY_INTERVAL_SECS_KEY = SchemaRegistryClient.Configuration.CLASSLOADER_CACHE_EXPIRY_INTERVAL_SECS.name();
final String SCHEMA_REGISTRY_SCHEMA_VERSION_CACHE_SIZE_KEY = SchemaRegistryClient.Configuration.SCHEMA_VERSION_CACHE_SIZE.name();
final String SCHEMA_REGISTRY_SCHEMA_VERSION_CACHE_EXPIRY_INTERVAL_SECS_KEY = SchemaRegistryClient.Configuration.SCHEMA_VERSION_CACHE_EXPIRY_INTERVAL_SECS.name();
final String SCHEMA_REGISTRY_URL_KEY = SchemaRegistryClient.Configuration.SCHEMA_REGISTRY_URL.name();

// Build the client configuration; values are cache tuning knobs plus the
// registry endpoint that serialized messages' schema ids are resolved against.
Properties schemaRegistryProperties = new Properties();
schemaRegistryProperties.put(SCHEMA_REGISTRY_CACHE_SIZE_KEY, 10L);
schemaRegistryProperties.put(SCHEMA_REGISTRY_CACHE_EXPIRY_INTERVAL_SECS_KEY, 5000L);
schemaRegistryProperties.put(SCHEMA_REGISTRY_SCHEMA_VERSION_CACHE_SIZE_KEY, 1000L);
schemaRegistryProperties.put(SCHEMA_REGISTRY_SCHEMA_VERSION_CACHE_EXPIRY_INTERVAL_SECS_KEY, 60 * 60 * 1000L);
schemaRegistryProperties.put(SCHEMA_REGISTRY_URL_KEY, "http://schema_registry_server:7788/api/v1");
// Deserialize the raw Kafka record bytes through the singleton wrapper.
// NOTE(review): the unchecked cast assumes the deserializer yields a Map —
// AvroSnapshotDeserializer typically returns a GenericRecord; verify at runtime.
return (Map<String, Object>) HWXSchemaRegistry.getInstance(schemaRegistryProperties).deserialize(message);

下面是用于反序列化消息的 HWXSchemaRegistry 代码:

import com.hortonworks.registries.schemaregistry.avro.AvroSchemaProvider;
import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient;
import com.hortonworks.registries.schemaregistry.errors.SchemaNotFoundException;
import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotDeserializer;

/**
 * Thin singleton wrapper around the Hortonworks {@link SchemaRegistryClient}
 * that exposes a registry-backed Avro snapshot deserializer for Kafka payloads.
 */
public class HWXSchemaRegistry {

    // Lazily-created shared instance; guarded by the synchronized factory below.
    private static HWXSchemaRegistry hwxSRInstance = null;

    private final SchemaRegistryClient client;
    private final Map<String, Object> config;
    private final AvroSnapshotDeserializer deserializer;

    /**
     * Returns the shared instance, creating it on first use.
     *
     * Synchronized so two threads racing on the first call (e.g. parallel
     * Flink deserializer subtasks) cannot each build a registry client.
     *
     * @param schemaRegistryConfig SchemaRegistryClient configuration properties
     * @return the singleton HWXSchemaRegistry
     */
    public static synchronized HWXSchemaRegistry getInstance(Properties schemaRegistryConfig) {
        if (hwxSRInstance == null) {
            hwxSRInstance = new HWXSchemaRegistry(schemaRegistryConfig);
        }
        return hwxSRInstance;
    }

    /**
     * Deserializes a serialized Kafka message using the registry deserializer.
     *
     * @param message raw record bytes as produced by the Nifi writer
     * @return the deserialized payload (typically an Avro GenericRecord)
     * @throws IOException on deserialization failure
     */
    public Object deserialize(byte[] message) throws IOException {
        // Use this instance's deserializer rather than reaching through the
        // static field: the original hwxSRInstance.deserializer would NPE if
        // the method were ever invoked on an instance created another way.
        return this.deserializer.deserialize(new ByteArrayInputStream(message), null);
    }

    /** Copies java.util.Properties entries into the Map the client API expects. */
    private static Map<String, Object> properties2Map(Properties config) {
        Map<String, Object> configMap = new HashMap<>();
        for (Map.Entry<Object, Object> entry : config.entrySet()) {
            configMap.put(entry.getKey().toString(), entry.getValue());
        }
        return configMap;
    }

    /** Builds the registry client and initializes the default Avro deserializer. */
    private HWXSchemaRegistry(Properties schemaRegistryConfig) {
        // NOTE(review): _log is not declared in this snippet — it must be
        // defined elsewhere in the original class for this to compile.
        _log.debug("Init SchemaRegistry Client");
        this.config = HWXSchemaRegistry.properties2Map(schemaRegistryConfig);
        this.client = new SchemaRegistryClient(this.config);
        this.deserializer = this.client.getDefaultDeserializer(AvroSchemaProvider.TYPE);
        this.deserializer.init(this.config);
    }
}

但是我收到404 HTTP错误代码(找不到模式)。我认为这是由于Nifi配置与HWX Schema Registry Client实现之间不兼容的“协议”所致,因此服务器上不存在客户端正在寻找的模式标识符字节,或者类似的东西。

有人可以帮忙吗?

谢谢。

原因:javax.ws.rs.NotFoundException:找不到HTTP 404 在org.glassfish.jersey.client.JerseyInvocation.convertToException(JerseyInvocation.java:1069) 在org.glassfish.jersey.client.JerseyInvocation.translate(JerseyInvocation.java:866) 在org.glassfish.jersey.client.JerseyInvocation.lambda $ invoke $ 1(JerseyInvocation.java:750)中 在org.glassfish.jersey.internal.Errors.process(Errors.java:292) 在org.glassfish.jersey.internal.Errors.process(Errors.java:274) 在org.glassfish.jersey.internal.Errors.process(Errors.java:205) 在org.glassfish.jersey.process.internal.RequestScope.runInScope(RequestScope.java:390) 在org.glassfish.jersey.client.JerseyInvocation.invoke(JerseyInvocation.java:748) 在org.glassfish.jersey.client.JerseyInvocation $ Builder.method(JerseyInvocation.java:404) 在org.glassfish.jersey.client.JerseyInvocation $ Builder.get(JerseyInvocation.java:300) 在com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient $ 14.run(SchemaRegistryClient.java:1054) 在com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient $ 14.run(SchemaRegistryClient.java:1051) 在java.security.AccessController.doPrivileged(本机方法) 在javax.security.auth.Subject.doAs(Subject.java:360) 在com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient.getEntities(SchemaRegistryClient.java:1051) 在com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient.getAllVersions(SchemaRegistryClient.java:872) 在com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient.getAllVersions(SchemaRegistryClient.java:676) 在HWXSchemaRegistry。(HWXSchemaRegistry.java:56) 在HWXSchemaRegistry.getInstance(HWXSchemaRegistry.java:26) 在SchemaService.deserialize(SchemaService.java:70) 在SchemaService.deserialize(SchemaService.java:26) 在org.apache.flink.streaming.connectors.kafka.internals.KafkaDeserializationSchemaWrapper.deserialize(KafkaDeserializationSchemaWrapper.java:45) 在org.apache.flink.streaming.connectors.kafka.internal.KafkaFetcher.runFetchLoop(KafkaFetcher.java:140) 
在org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumerBase.run(FlinkKafkaConsumerBase.java:712) 在org.apache.flink.streaming.api.operators.StreamSource.run(StreamSource.java:93) 在org.apache.flink.streaming.api.operators.StreamSource.run(StreamSource.java:57) 在org.apache.flink.streaming.runtime.tasks.SourceStreamTask.run(SourceStreamTask.java:97) 在org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:302) 在org.apache.flink.runtime.taskmanager.Task.run(Task.java:711) 在java.lang.Thread.run(Thread.java:745)

1 个答案:

答案 0 :(得分:0)

我找到了解决方法。由于我无法使原有方式正常工作,我改用字节数组开头的若干字节对架构注册表进行多次调用来获取avro架构,以便稍后反序列化该字节数组的其余部分。

  • 第一个字节(0)是协议版本(我发现这是Nifi专用字节,因为我不需要它)。
  • 接下来的8个字节是架构ID
  • 接下来的4个字节是架构版本
  • 其余字节是消息本身:

    import com.hortonworks.registries.schemaregistry.SchemaMetadataInfo;
    import com.hortonworks.registries.schemaregistry.SchemaVersionInfo;
    import com.hortonworks.registries.schemaregistry.SchemaVersionKey;
    import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient;
    
    // Byte layout written by Nifi's "HWX Content-Encoded Schema Reference" strategy:
    //   [0]      protocol version (Nifi-specific; ignored here)
    //   [1..8]   schema id   (big-endian long)
    //   [9..12]  schema version (big-endian int)
    //   [13..]   Avro-encoded payload
    try(SchemaRegistryClient client = new SchemaRegistryClient(this.schemaRegistryConfig)) {
        try {
            Long schemaId = ByteBuffer.wrap(Arrays.copyOfRange(message, 1, 9)).getLong();
            Integer schemaVersion = ByteBuffer.wrap(Arrays.copyOfRange(message, 9, 13)).getInt();

            // Resolve the schema name from its id, then fetch the exact version text.
            SchemaMetadataInfo schemaInfo = client.getSchemaMetadataInfo(schemaId);
            String schemaName = schemaInfo.getSchemaMetadata().getName();

            SchemaVersionInfo schemaVersionInfo = client.getSchemaVersionInfo(
                    new SchemaVersionKey(schemaName, schemaVersion));

            String avroSchema = schemaVersionInfo.getSchemaText();
            // BUG FIX: the original redeclared `byte[] message`, which clashes with
            // the in-scope `message` array and does not compile; copy the Avro
            // payload into a distinct variable instead.
            byte[] payload = Arrays.copyOfRange(message, 13, message.length);
            // Deserialize [...]
        }
        catch (Exception e)
        {
            // Wrap with the full cause chain instead of only e.getMessage(),
            // so the underlying stack trace is preserved for debugging.
            throw new IOException(e);
        }
    }
    

我还认为也许我必须在问题代码中调用hwxSRInstance.deserializer.deserialize之前删除第一个字节,因为该字节似乎是Nifi特定的字节,可以在Nifi处理器之间进行通信,但是它没有用

下一步是使用架构文本构建缓存,以避免多次调用架构注册表API。

新信息:我将扩大回答,以包括avro反序列化部分,因为这对我来说是一些故障排除,因此我必须检查Nifi Avro Reader源代码以弄清楚这一部分(当我遇到无效的Avro数据异常时,尝试使用基本的Avro反序列化代码):

import org.apache.avro.Schema;
import org.apache.avro.file.SeekableByteArrayInput;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DecoderFactory;

/**
 * Deserializes a raw Avro binary payload (schema-reference header already
 * stripped) into a GenericRecord using the given writer-schema text.
 *
 * @param message    Avro binary-encoded bytes (no header, no Avro container file)
 * @param schemaText the Avro schema JSON the bytes were written with
 * @return the decoded record
 * @throws IOException if the bytes do not match the schema or parsing fails
 */
private static GenericRecord deserializeMessage(byte[] message, String schemaText) throws IOException {
    Schema schema = new Schema.Parser().parse(schemaText);
    DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(schema);
    // try-with-resources closes the stream even when read() throws;
    // the original only closed it on the success path.
    try (InputStream in = new SeekableByteArrayInput(message)) {
        BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(in, null);
        return datumReader.read(null, decoder);
    }
}

如果要将GenericRecord转换为Map,请注意字符串值并不是String对象(而是Avro的Utf8),因此需要对键以及类型为string的值进行转换:

/**
 * Flattens an Avro GenericRecord into a field-name -> value map.
 *
 * Avro decodes string-typed fields as org.apache.avro.util.Utf8 rather than
 * java.lang.String, so every Utf8 value is converted to a plain String here
 * (the original only converted the "value" key — and NPE'd when the record
 * had no such field).
 *
 * @param record the decoded Avro record
 * @return a map of field names to values, with Utf8 values as Strings
 */
private static Map<String, Object> avroGenericRecordToMap(GenericRecord record)
{
    Map<String, Object> map = new HashMap<>();
    record.getSchema().getFields().forEach(field -> {
        Object value = record.get(field.name());
        if (value instanceof org.apache.avro.util.Utf8) {
            value = value.toString();
        }
        map.put(String.valueOf(field.name()), value);
    });
    return map;
}