使用U-SQL高效地将IoT JSON数据扁平化为CSV

时间:2018-12-11 12:46:39

标签: json azure query-performance azure-data-lake u-sql

我目前正在探索使用Azure Data Lake Analytics对IoT数据进行批处理。JSON文件很小(20-30 KB),存储在类似“deviceModel/deviceGeneration/deviceId/packageVersion/year/month/day/file.json”的路径中。

JSON的结构可能会随着更新而变化(传感器数量增加或减少),所以我想创建一个通用脚本。

我已经写了2个版本,它们分别基于2种JSON结构实现了我想要的功能,但是提取一个文件需要5秒,效率不高。

版本1

JSON文件

JSON文件由数组构成。其中的data数组通常包含24个对象,但由于字数限制,这里我只保留了2个。

{
"device_id": "2",
"timestamp": "1543622400",
"packageVersion": "pkg1",
"board": [
    {
        "name": "carteHaute",
        "firmware": "a26584re4r5er",
        "captors": [
            {
                "name": "capteur1",
                "id": "hcaptor1_2",
                "mesures": [
                    {
                        "name": "temperature",
                        "data": [
                            {
                                "value": 3,
                                "timestamp": 1543622400
                            },
                            {
                                "value": 4,
                                "timestamp": 1543626000
                            }
                        ]
                    }
                ]
            },
            {
                "name": "capteur2",
                "id": "hcaptor2_2",
                "mesures": [
                    {
                        "name": "pression",
                        "data": [
                            {
                                "value": 528,
                                "timestamp": 1543622400
                            },
                            {
                                "value": 561,
                                "timestamp": 1543626000
                            }
                        ]
                    },
                    {
                        "name": "temperature",
                        "data": [
                            {
                                "value": 42,
                                "timestamp": 1543622400
                            },
                            {
                                "value": 32,
                                "timestamp": 1543626000
                            }
                        ]
                    }
                ]
            }
        ]
    },
    {
        "name": "carteBasse",
        "firmware": "serser48df4e",
        "captors": [
            {
                "name": "capteur1",
                "id": "bcaptor1_2",
                "mesures": [
                    {
                        "name": "temperature",
                        "data": [
                            {
                                "value": 88,
                                "timestamp": 1543622400
                            },
                            {
                                "value": 106,
                                "timestamp": 1543626000
                            }
                        ]
                    }
                ]
            },
            {
                "name": "capteur2",
                "id": "bcaptor2_2",
                "mesures": [
                    {
                        "name": "co2",
                        "data": [
                            {
                                "value": 1374,
                                "timestamp": 1543622400
                            },
                            {
                                "value": 1417,
                                "timestamp": 1543626000
                            }
                        ]
                    }
                ]
            }
        ]
    }
]
}

提取脚本

REFERENCE ASSEMBLY [Newtonsoft.Json];
REFERENCE ASSEMBLY [Microsoft.Analytics.Samples.Formats];
USING Microsoft.Analytics.Samples.Formats.Json;
// Flattens the array-based JSON layout (board[] -> captors[] -> mesures[] -> data[])
// into one CSV row per measurement sample.
//
// model, generation and date are virtual columns taken from the file-set path pattern.
DECLARE @InputFile string = @"/BHTests/{model}/{generation}/{*}/{*}/{date:yyyy}/{date:MM}/{date:dd}/{*}.json";
// Destination of the OUTPUT statement at the end of the script.
DECLARE @OutputFile string = @"/Output/BHTests/dayGen2.csv";

// Top-level properties of each document; "board" is kept as a raw JSON string.
@json =
    EXTRACT device_id string,
            timestamp string,
            packageVersion string,
            board string,
            model string,
            generation string,
            date DateTime
    FROM @InputFile
    USING new JsonExtractor();

// Keep only the requested model/generation/date range. The date column is only
// needed for this predicate, so it is not carried any further.
@devices =
    SELECT device_id,
           timestamp,
           packageVersion,
           board,
           model,
           generation
    FROM @json
    WHERE model == "BH20" AND generation == "Gen4"
          AND date < DateTime.ParseExact("01/01/2019", "dd/MM/yyyy", null);

// One row per board. Every exploded JSON fragment is handed to JsonTuple exactly
// once and kept as a map; individual fields are read from that map afterwards
// instead of calling JsonTuple again for each field.
@boards =
    SELECT device_id,
           timestamp,
           packageVersion,
           model,
           generation,
           JsonFunctions.JsonTuple(boardJson) AS boardFields
    FROM @devices
         CROSS APPLY
             EXPLODE(JsonFunctions.JsonTuple(board).Values) AS b(boardJson);

// One row per captor of each board.
@captors =
    SELECT device_id,
           timestamp,
           packageVersion,
           model,
           generation,
           boardFields["name"] AS boardName,
           boardFields["firmware"] AS boardFirmware,
           JsonFunctions.JsonTuple(captorJson) AS captorFields
    FROM @boards
         CROSS APPLY
             EXPLODE(JsonFunctions.JsonTuple(boardFields["captors"]).Values) AS c(captorJson);

// One row per measure type of each captor.
@mesures =
    SELECT device_id,
           timestamp,
           packageVersion,
           model,
           generation,
           boardName,
           boardFirmware,
           captorFields["name"] AS captorName,
           captorFields["id"] AS captorId,
           JsonFunctions.JsonTuple(mesureJson) AS mesureFields
    FROM @captors
         CROSS APPLY
             EXPLODE(JsonFunctions.JsonTuple(captorFields["mesures"]).Values) AS m(mesureJson);

// One row per {value, timestamp} sample of each measure.
@samples =
    SELECT device_id,
           timestamp,
           packageVersion,
           model,
           generation,
           boardName,
           boardFirmware,
           captorName,
           captorId,
           mesureFields["name"] AS mesureName,
           JsonFunctions.JsonTuple(sampleJson) AS sampleFields
    FROM @mesures
         CROSS APPLY
             EXPLODE(JsonFunctions.JsonTuple(mesureFields["data"]).Values) AS d(sampleJson);

// Final projection; column order defines the CSV layout.
@data =
    SELECT device_id,
           timestamp,
           model,
           generation,
           packageVersion,
           boardName,
           boardFirmware,
           captorName,
           captorId,
           mesureName,
           sampleFields["value"] AS dataValue,
           sampleFields["timestamp"] AS dataTimestamp
    FROM @samples;

OUTPUT @data
TO @OutputFile
ORDER BY device_id,
         timestamp,
         packageVersion,
         boardName,
         boardFirmware,
         captorName,
         captorId,
         mesureName,
         dataTimestamp
USING Outputters.Csv(outputHeader : true, quoting : false);

版本2

Json文件

{
    "device_id": "1",
    "timestamp": "1543622400",
    "packageVersion": "pkg1",
    "data": {
        "carteHaute": {
            "firmware": "a26584re4r5er",
            "capteur1": {
                "id": "hcaptor1_1",
                "data": {
                    "temperature": [
                        {
                            "value": 34,
                            "timestamp": 1543622400
                        },
                        {
                            "value": 40,
                            "timestamp": 1543626000
                        }
                    ]
                }
            },
            "capteur3": {
                "id": "hcaptor2_1",
                "data": {
                    "pression": [
                        {
                            "value": 688,
                            "timestamp": 1543622400
                        },
                        {
                            "value": 700,
                            "timestamp": 1543626000
                        }
                    ],
                    "temperature": [
                        {
                            "value": 40,
                            "timestamp": 1543622400
                        },
                        {
                            "value": 33,
                            "timestamp": 1543626000
                        }
                    ]
                }
            }
        },
        "carteBasse": {
            "firmware": "serser48df4e",
            "capteur1": {
                "id": "bcaptor1_1",
                "data": {
                    "temperature": [
                        {
                            "value": 80,
                            "timestamp": 1543622400
                        },
                        {
                            "value": 85,
                            "timestamp": 1543626000
                        }
                    ]
                }
            },
            "capteur2": {
                "id": "bcaptor1_1",
                "data": {
                    "co2": [
                        {
                            "value": 1251,
                            "timestamp": 1543622400
                        },
                        {
                            "value": 1345,
                            "timestamp": 1543626000
                        }
                    ]
                }
            }
        }
    }
}

提取脚本

REFERENCE ASSEMBLY [Newtonsoft.Json];
REFERENCE ASSEMBLY [Microsoft.Analytics.Samples.Formats];
USING Microsoft.Analytics.Samples.Formats.Json;
// Flattens the key-based JSON layout (data.{board}.{captor}.data.{measure}[])
// into one CSV row per measurement sample.
//
// model, generation and date are virtual columns taken from the file-set path pattern.
DECLARE @InputFile string = @"/BHTests/{model}/{generation}/{*}/{*}/{date:yyyy}/{date:MM}/{date:dd}/{*}.json";
// Destination of the OUTPUT statement at the end of the script.
DECLARE @OutputFile string = @"/Output/BHTests/dayGen1ChangeresBoard.csv";

// Top-level properties of each document; "data" is kept as a raw JSON string.
@json =
    EXTRACT device_id string,
            timestamp string,
            packageVersion string,
            data string,
            model string,
            generation string,
            date DateTime
    FROM @InputFile
    USING new JsonExtractor();

// Keep only the requested model/generation/date range. The date column is only
// needed for this predicate, so it is not carried any further.
@devices =
    SELECT device_id,
           timestamp,
           packageVersion,
           data,
           model,
           generation
    FROM @json
    WHERE model == "BH20" AND generation == "Gen1"
          AND date < DateTime.ParseExact("01/01/2019", "dd/MM/yyyy", null);

// One row per board: the property name is the board name, the property value is
// the board's JSON, which is parsed once into a map.
@boards =
    SELECT device_id,
           timestamp,
           packageVersion,
           model,
           generation,
           boardName,
           JsonFunctions.JsonTuple(boardJson) AS boardFields
    FROM @devices
         CROSS APPLY
             EXPLODE(JsonFunctions.JsonTuple(data)) AS b(boardName, boardJson);

// One row per captor of each board. "firmware" sits next to the captors in the
// board object, so it is read as a column and excluded from the exploded entries.
// captorJson is deliberately NOT parsed in this statement: the firmware entry's
// value is a plain string rather than JSON, so parsing is left to the next step,
// which only sees rows that passed the filter.
@captorEntries =
    SELECT device_id,
           timestamp,
           packageVersion,
           model,
           generation,
           boardName,
           boardFields["firmware"] AS boardFirmware,
           captor,
           captorJson
    FROM @boards
         CROSS APPLY
             EXPLODE(boardFields) AS c(captor, captorJson)
    WHERE captor != "firmware";

// Parse each captor object once; "id" and "data" are both read from this map.
@captors =
    SELECT device_id,
           timestamp,
           packageVersion,
           model,
           generation,
           boardName,
           boardFirmware,
           captor,
           JsonFunctions.JsonTuple(captorJson) AS captorFields
    FROM @captorEntries;

// One row per measure type: the property name inside the captor's "data" object
// is the measure name, the property value is the JSON array of samples.
@mesures =
    SELECT device_id,
           timestamp,
           packageVersion,
           model,
           generation,
           boardName,
           boardFirmware,
           captor,
           captorFields["id"] AS captorId,
           mesureName,
           mesureJson
    FROM @captors
         CROSS APPLY
             EXPLODE(JsonFunctions.JsonTuple(captorFields["data"])) AS m(mesureName, mesureJson);

// One row per {value, timestamp} sample of each measure.
@samples =
    SELECT device_id,
           timestamp,
           packageVersion,
           model,
           generation,
           boardName,
           boardFirmware,
           captor,
           captorId,
           mesureName,
           JsonFunctions.JsonTuple(sampleJson) AS sampleFields
    FROM @mesures
         CROSS APPLY
             EXPLODE(JsonFunctions.JsonTuple(mesureJson).Values) AS s(sampleJson);

// Final projection; column order defines the CSV layout.
@serie =
    SELECT device_id,
           model,
           generation,
           timestamp,
           packageVersion,
           boardName,
           boardFirmware,
           captor,
           captorId,
           mesureName,
           sampleFields["value"] AS mesureValue,
           sampleFields["timestamp"] AS mesureTimestamp
    FROM @samples;

OUTPUT @serie
TO @OutputFile
ORDER BY device_id,
         timestamp,
         packageVersion,
         boardName,
         boardFirmware,
         captor,
         captorId,
         mesureName,
         mesureTimestamp
USING Outputters.Csv(outputHeader : true, quoting : false);

两个脚本的结果

device_id,model,generation,timestamp,packageVersion,boardName,boardFirmware,captor,captorId,mesureName,mesureValue,mesureTimestamp
1,BH20,Gen1,1543622400,pkg1,carteBasse,serser48df4e,capteur1,bcaptor1_1,temperature,80,1543622400
1,BH20,Gen1,1543622400,pkg1,carteBasse,serser48df4e,capteur1,bcaptor1_1,temperature,85,1543626000

我想问的是如何改善脚本的性能?因为我认为我调用JsonFunctions和使用CROSS APPLY EXPLODE的次数太多了。

我应该继续使用“通用”脚本还是应该编写特定的脚本?

0 个答案:

没有答案