在C#中读取事件中心归档文件

时间:2017-05-16 06:08:54

标签: c# azure avro azure-eventhub

C#中是否有用于读取Azure事件中心归档文件(Avro格式)的示例代码?

我正在尝试使用Microsoft.Hadoop.Avro库。我用Java版的avro-tools工具导出(dump)了文件中的模式,得到如下结果:

{

                ""type"":""record"",
                ""name"":""EventData"",
                ""namespace"":""Microsoft.ServiceBus.Messaging"",
                ""fields"":[
                             {""name"":""SequenceNumber"",""type"":""long""},
                             {""name"":""Offset"",""type"":""string""},
                             {""name"":""EnqueuedTimeUtc"",""type"":""string""},
                             {""name"":""SystemProperties"",""type"":{ ""type"":""map"",""values"":[""long"",""double"",""string"",""bytes""]}},
                             {""name"":""Properties"",""type"":{ ""type"":""map"",""values"":[""long"",""double"",""string"",""bytes"", ""null""]}},
                             {""name"":""Body"",""type"":[""null"",""bytes""]}
                         ]
                }

但是,当尝试反序列化文件以重新读取数据时:

// Open the Avro container as a typed EventData reader, wrap it in a
// SequentialReader, and visit each object it yields. Both readers are
// disposed when the block exits (inner one first).
using (var container = AvroContainer.CreateReader<EventData>(stream))
using (var sequential = new SequentialReader<EventData>(container))
{
    foreach (EventData evt in sequential.Objects)
    {
        //stuff here
    }
}

在传递Producer端使用的实际EventData类型时它不起作用,所以我尝试创建一个标记有DataContract属性的特殊类,如下所示:

/// <summary>
/// Hand-written data contract meant to stand in for the archived EventData
/// Avro record (contract namespace "Microsoft.ServiceBus.Messaging") when
/// reading the file through a typed Avro reader.
/// </summary>
[DataContract(Namespace = "Microsoft.ServiceBus.Messaging")]
public class EventData
{
    /// <summary>Mapped to the "SequenceNumber" data member.</summary>
    [DataMember(Name = "SequenceNumber")]
    public long SequenceNumber { get; set; }

    /// <summary>Mapped to the "Offset" data member.</summary>
    [DataMember(Name = "Offset")]
    public string Offset { get; set; }

    /// <summary>Mapped to the "EnqueuedTimeUtc" data member; kept as the raw string.</summary>
    [DataMember(Name = "EnqueuedTimeUtc")]
    public string EnqueuedTimeUtc { get; set; }

    /// <summary>Mapped to the "Body" data member.</summary>
    // NOTE(review): the dumped schema declares Body as a ["null","bytes"] union;
    // ArraySegment<byte> may not map onto that union, which could be what triggers
    // "Cannot match the union schema" - confirm against the serializer's rules.
    [DataMember(Name = "Body")]
    public ArraySegment<byte> Body { get; set; }

    // The two map-typed fields of the schema are intentionally left unmapped here;
    // the attempted members are kept below, commented out.
    //[DataMember(Name = "SystemProperties")]
    //public SystemPropertiesCollection SystemProperties { get; set; }

    //[DataMember(Name = "Properties")]
    //public IDictionary<string, object> Properties { get; set; }
}

错误如下:

System.Runtime.Serialization.SerializationException occurred
Message=Cannot match the union schema.

对于使用C#读取Avro归档文件这一用例,微软没有提供示例代码,是否有什么原因?

2 个答案:

答案 0 :(得分:1)

如果您尝试使用Microsoft.Hadoop.Avro库读取Avro文件,则可以使用以下类:

/// <summary>
/// Plain holder for one record of an Event Hubs archive (Avro) file, populated
/// from a dynamically-typed record produced by a generic Avro reader.
/// </summary>
[DataContract(Name = "EventData", Namespace = "Microsoft.ServiceBus.Messaging")]
class EventData
{
    /// <summary>Sequence number copied from the record.</summary>
    [DataMember(Name = "SequenceNumber")]
    public long SequenceNumber { get; set; }

    /// <summary>Offset copied from the record.</summary>
    [DataMember(Name = "Offset")]
    public string Offset { get; set; }

    /// <summary>
    /// Enqueue time parsed from the record's string field, expressed in UTC.
    /// Left at <see cref="DateTime.MinValue"/> when the string cannot be parsed.
    /// </summary>
    [DataMember(Name = "EnqueuedTimeUtc")]
    public DateTime EnqueuedTimeUtc { get; set; }

    /// <summary>System property map copied from the record.</summary>
    [DataMember(Name = "SystemProperties")]
    public Dictionary<string, object> SystemProperties { get; set; }

    /// <summary>User property map copied from the record.</summary>
    [DataMember(Name = "Properties")]
    public Dictionary<string, object> Properties { get; set; }

    /// <summary>Raw payload bytes; may be null when the record carries no body.</summary>
    [DataMember(Name = "Body")]
    public byte[] Body { get; set; }

    /// <summary>
    /// Copies the fields of a dynamic Avro record into this instance.
    /// </summary>
    /// <param name="record">
    /// Dynamic record exposing SequenceNumber, Offset, EnqueuedTimeUtc,
    /// SystemProperties, Properties and Body members.
    /// </param>
    public EventData(dynamic record)
    {
        SequenceNumber = (long)record.SequenceNumber;
        Offset = (string)record.Offset;

        // The timestamp is machine-written text, not user input, so it must be parsed
        // with the invariant culture: with the current culture a value such as
        // "5/16/2017 6:08:54 AM" fails on day-first locales and silently becomes
        // DateTime.MinValue. AssumeUniversal/AdjustToUniversal make the result
        // DateTimeKind.Utc instead of Unspecified/Local.
        // A failed parse still leaves the default value, as before.
        DateTime.TryParse(
            (string)record.EnqueuedTimeUtc,
            System.Globalization.CultureInfo.InvariantCulture,
            System.Globalization.DateTimeStyles.AssumeUniversal
                | System.Globalization.DateTimeStyles.AdjustToUniversal,
            out var enqueuedTimeUtc);
        EnqueuedTimeUtc = enqueuedTimeUtc;

        SystemProperties = (Dictionary<string, object>)record.SystemProperties;
        Properties = (Dictionary<string, object>)record.Properties;
        Body = (byte[])record.Body;
    }

}

读取Avro文件时,可以先将每条记录作为动态对象读取,然后再将其转换(序列化)为上面的类。下面是一个例子:

// Read the container with the generic (schema-driven) reader, block by block,
// and turn each dynamic record into an EventData instance.
// The reader is wrapped in `using` so it is released even if a record fails to convert.
// NOTE(review): disposing the reader may also close `stream` unless a leave-open
// overload is used - confirm if the stream is needed afterwards.
using (var reader = AvroContainer.CreateGenericReader(stream))
{
    while (reader.MoveNext())
    {
        foreach (dynamic record in reader.Current.Objects)
        {
            var eventData = new EventData(record);
            var sequenceNumber = eventData.SequenceNumber;
            // The schema declares Body as ["null","bytes"], so guard before decoding:
            // Encoding.UTF8.GetString throws on a null array.
            var bodyText = eventData.Body == null ? null : Encoding.UTF8.GetString(eventData.Body);
            var properties = eventData.Properties;
            var sysProperties = eventData.SystemProperties;
        }
    }
}

您可以参考this answer了解更多详情。

答案 1 :(得分:0)

我同时试用了Microsoft.Hadoop.Avro和Apache Avro的C#库,它们似乎存在同样的问题:即使只是尝试读取序列号、偏移量和EnqueuedTimeUtc,两者得到的也都是同样的乱码数据,这些数据看起来像是编解码器和模式定义的内容。以下是我的发现:我先把blob下载到内存流中,然后尝试从内存流反序列化。问题在于反序列化器没有考虑文件中的头部和模式,而是试图从流的最开头开始反序列化。

要解决这个问题,可以使用Apache Avro C#库,先用它的代码生成(gen)工具根据导出的JSON格式模式生成C#类,然后使用能够直接从流中读取的DataFileReader。

// Open an Apache Avro DataFileReader<EventData> over the stream, passing the
// generated class's schema (evtSample.Schema) as the second argument.
// NOTE(review): only the using-header is shown; the body that iterates the reader is omitted.
using (var dataFileReader = Avro.File.DataFileReader<EventData>.OpenReader(stream, evtSample.Schema))

其中evtSample是EventData类的一个实例,evtSample.Schema则是该类所包含的架构。

接下来我要看看能否用Microsoft.Hadoop.Avro库实现同样的效果。

BTW,这是Apache AVRO gen工具生成的C#类输出:

/// <summary>
/// Specific-record class for the archived EventData Avro schema, as emitted by the
/// Apache Avro gen tool. Fields are addressed by position in <see cref="Get"/> and
/// <see cref="Put"/>, in the order they appear in the schema:
/// 0 SequenceNumber, 1 Offset, 2 EnqueuedTimeUtc, 3 SystemProperties, 4 Properties, 5 Body.
/// </summary>
public partial class EventData : ISpecificRecord
{
    /// <summary>Parsed Avro schema describing this record.</summary>
    public static Schema _SCHEMA = Avro.Schema.Parse(@"{""type"":""record"",""name"":""EventData"",""namespace"":""Microsoft.ServiceBus.Messaging"",""fields"":[{""name"":""SequenceNumber"",""type"":""long""},{""name"":""Offset"",""type"":""string""},{""name"":""EnqueuedTimeUtc"",""type"":""string""},{""name"":""SystemProperties"",""type"":{""type"":""map"",""values"":[""long"",""double"",""string"",""bytes""]}},{""name"":""Properties"",""type"":{""type"":""map"",""values"":[""long"",""double"",""string"",""bytes"",""null""]}},{""name"":""Body"",""type"":[""null"",""bytes""]}]}");

    // Backing fields are kept (rather than auto-properties) because the class is partial.
    private long _SequenceNumber;
    private string _Offset;
    private string _EnqueuedTimeUtc;
    private IDictionary<string, object> _SystemProperties;
    private IDictionary<string, object> _Properties;
    private byte[] _Body;

    /// <summary>Schema of this record; returns the shared static instance.</summary>
    public virtual Schema Schema => EventData._SCHEMA;

    /// <summary>Field 0 of the schema.</summary>
    public long SequenceNumber
    {
        get => _SequenceNumber;
        set => _SequenceNumber = value;
    }

    /// <summary>Field 1 of the schema.</summary>
    public string Offset
    {
        get => _Offset;
        set => _Offset = value;
    }

    /// <summary>Field 2 of the schema, kept as the raw string.</summary>
    public string EnqueuedTimeUtc
    {
        get => _EnqueuedTimeUtc;
        set => _EnqueuedTimeUtc = value;
    }

    /// <summary>Field 3 of the schema.</summary>
    public IDictionary<string, object> SystemProperties
    {
        get => _SystemProperties;
        set => _SystemProperties = value;
    }

    /// <summary>Field 4 of the schema.</summary>
    public IDictionary<string, object> Properties
    {
        get => _Properties;
        set => _Properties = value;
    }

    /// <summary>Field 5 of the schema.</summary>
    public byte[] Body
    {
        get => _Body;
        set => _Body = value;
    }

    /// <summary>Returns the value of the field at the given schema position.</summary>
    /// <param name="fieldPos">Zero-based field position (0 to 5).</param>
    /// <returns>The current value of that field.</returns>
    /// <exception cref="AvroRuntimeException">The position is not 0 to 5.</exception>
    public virtual object Get(int fieldPos)
    {
        switch (fieldPos)
        {
            case 0:
                return SequenceNumber;
            case 1:
                return Offset;
            case 2:
                return EnqueuedTimeUtc;
            case 3:
                return SystemProperties;
            case 4:
                return Properties;
            case 5:
                return Body;
            default:
                throw new AvroRuntimeException("Bad index " + fieldPos + " in Get()");
        }
    }

    /// <summary>Assigns the field at the given schema position, casting to its declared type.</summary>
    /// <param name="fieldPos">Zero-based field position (0 to 5).</param>
    /// <param name="fieldValue">Value to store; must be castable to the field's type.</param>
    /// <exception cref="AvroRuntimeException">The position is not 0 to 5.</exception>
    public virtual void Put(int fieldPos, object fieldValue)
    {
        switch (fieldPos)
        {
            case 0:
                SequenceNumber = (long)fieldValue;
                break;
            case 1:
                Offset = (string)fieldValue;
                break;
            case 2:
                EnqueuedTimeUtc = (string)fieldValue;
                break;
            case 3:
                SystemProperties = (IDictionary<string, object>)fieldValue;
                break;
            case 4:
                Properties = (IDictionary<string, object>)fieldValue;
                break;
            case 5:
                Body = (byte[])fieldValue;
                break;
            default:
                throw new AvroRuntimeException("Bad index " + fieldPos + " in Put()");
        }
    }
}

}