我目前正在探索使用 Azure Data Lake Analytics 对 IoT 数据进行批处理。JSON 文件很小(20-30 KB),存储在类似“deviceModel/deviceGeneration/deviceId/packageVersion/year/month/day/file.json”的路径中。
JSON 的结构可能会随着更新而变化(传感器可能增多或减少),所以我想编写一个通用脚本。
我已经针对 2 种 JSON 结构分别写了一个版本的脚本,它们都能得到我想要的结果,但提取一个文件需要 5 秒,效率不高。
JSON 文件由数组组成。data 数组通常包含 24 个对象,但由于字符数限制,我在示例中只保留了 2 个。
{
"device_id": "2",
"timestamp": "1543622400",
"packageVersion": "pkg1",
"board": [
{
"name": "carteHaute",
"firmware": "a26584re4r5er",
"captors": [
{
"name": "capteur1",
"id": "hcaptor1_2",
"mesures": [
{
"name": "temperature",
"data": [
{
"value": 3,
"timestamp": 1543622400
},
{
"value": 4,
"timestamp": 1543626000
}
]
}
]
},
{
"name": "capteur2",
"id": "hcaptor2_2",
"mesures": [
{
"name": "pression",
"data": [
{
"value": 528,
"timestamp": 1543622400
},
{
"value": 561,
"timestamp": 1543626000
}
]
},
{
"name": "temperature",
"data": [
{
"value": 42,
"timestamp": 1543622400
},
{
"value": 32,
"timestamp": 1543626000
}
]
}
]
}
]
},
{
"name": "carteBasse",
"firmware": "serser48df4e",
"captors": [
{
"name": "capteur1",
"id": "bcaptor1_2",
"mesures": [
{
"name": "temperature",
"data": [
{
"value": 88,
"timestamp": 1543622400
},
{
"value": 106,
"timestamp": 1543626000
}
]
}
]
},
{
"name": "capteur2",
"id": "bcaptor2_2",
"mesures": [
{
"name": "co2",
"data": [
{
"value": 1374,
"timestamp": 1543622400
},
{
"value": 1417,
"timestamp": 1543626000
}
]
}
]
}
]
}
]
}
REFERENCE ASSEMBLY [Newtonsoft.Json];
REFERENCE ASSEMBLY [Microsoft.Analytics.Samples.Formats];
USING Microsoft.Analytics.Samples.Formats.Json;
// Flattens the array-based ("Gen4") device JSON files into one CSV row per
// data point: board -> captor -> mesure -> data point.
//
// Each nested JSON fragment is parsed with JsonTuple exactly once per level and
// the resulting map is then indexed for every key that is needed, instead of
// re-parsing the same fragment once per extracted key.

// The {model}, {generation} and {date:...} segments of the path pattern are
// surfaced as the model / generation / date columns of the EXTRACT below.
DECLARE @InputFile string = @"/BHTests/{model}/{generation}/{*}/{*}/{date:yyyy}/{date:MM}/{date:dd}/{*}.json";
// Single place that defines where the result is written (used by OUTPUT).
DECLARE @OutputFile string = @"/Output/BHTests/dayGen2.csv";

// One row per input file; "board" holds the still-serialized board array.
@files =
    EXTRACT device_id string,
            timestamp string,
            packageVersion string,
            board string,
            model string,
            generation string,
            date DateTime
    FROM @InputFile
    USING new JsonExtractor();

// Keep only the wanted model/generation/date range and split the board array
// into its serialized elements. "date" is only needed for this filter.
@board_arrays =
    SELECT device_id,
           timestamp,
           packageVersion,
           JsonFunctions.JsonTuple(board).Values AS board_array,
           model,
           generation
    FROM @files
    WHERE model == "BH20"
          AND generation == "Gen4"
          AND date < DateTime.ParseExact("01/01/2019", "dd/MM/yyyy", null);

// One row per board; the board object is parsed a single time into board_map.
@boards =
    SELECT device_id,
           timestamp,
           packageVersion,
           JsonFunctions.JsonTuple(board_json) AS board_map,
           model,
           generation
    FROM @board_arrays
         CROSS APPLY
         EXPLODE(board_array) AS b(board_json);

// Read the board attributes from the parsed map and split its captor array.
@captor_arrays =
    SELECT device_id,
           timestamp,
           packageVersion,
           board_map["name"] AS boardName,
           board_map["firmware"] AS boardFirmware,
           JsonFunctions.JsonTuple(board_map["captors"]).Values AS captor_array,
           model,
           generation
    FROM @boards;

// One row per captor; the captor object is parsed a single time.
@captors =
    SELECT device_id,
           timestamp,
           packageVersion,
           boardName,
           boardFirmware,
           JsonFunctions.JsonTuple(captor_json) AS captor_map,
           model,
           generation
    FROM @captor_arrays
         CROSS APPLY
         EXPLODE(captor_array) AS c(captor_json);

// Read the captor attributes and split its "mesures" array.
@mesure_arrays =
    SELECT device_id,
           timestamp,
           packageVersion,
           boardName,
           boardFirmware,
           captor_map["name"] AS captorName,
           captor_map["id"] AS captorId,
           JsonFunctions.JsonTuple(captor_map["mesures"]).Values AS mesure_array,
           model,
           generation
    FROM @captors;

// One row per mesure; the mesure object is parsed a single time.
@mesures =
    SELECT device_id,
           timestamp,
           packageVersion,
           boardName,
           boardFirmware,
           captorName,
           captorId,
           JsonFunctions.JsonTuple(mesure_json) AS mesure_map,
           model,
           generation
    FROM @mesure_arrays
         CROSS APPLY
         EXPLODE(mesure_array) AS m(mesure_json);

// Read the mesure name and split its "data" array.
@data_arrays =
    SELECT device_id,
           timestamp,
           packageVersion,
           boardName,
           boardFirmware,
           captorName,
           captorId,
           mesure_map["name"] AS mesureName,
           JsonFunctions.JsonTuple(mesure_map["data"]).Values AS data_array,
           model,
           generation
    FROM @mesures;

// One row per data point; the point object is parsed a single time.
@points =
    SELECT device_id,
           timestamp,
           packageVersion,
           boardName,
           boardFirmware,
           captorName,
           captorId,
           mesureName,
           JsonFunctions.JsonTuple(point_json) AS point_map,
           model,
           generation
    FROM @data_arrays
         CROSS APPLY
         EXPLODE(data_array) AS d(point_json);

// Final shape of the CSV (column order matters for the output file).
@data =
    SELECT device_id,
           timestamp,
           model,
           generation,
           packageVersion,
           boardName,
           boardFirmware,
           captorName,
           captorId,
           mesureName,
           point_map["value"] AS dataValue,
           point_map["timestamp"] AS dataTimestamp
    FROM @points;

OUTPUT @data
TO @OutputFile
ORDER BY device_id,
         timestamp,
         packageVersion,
         boardName,
         boardFirmware,
         captorName,
         captorId,
         mesureName,
         dataTimestamp
USING Outputters.Csv(outputHeader : true, quoting : false);
{
"device_id": "1",
"timestamp": "1543622400",
"packageVersion": "pkg1",
"data": {
"carteHaute": {
"firmware": "a26584re4r5er",
"capteur1": {
"id": "hcaptor1_1",
"data": {
"temperature": [
{
"value": 34,
"timestamp": 1543622400
},
{
"value": 40,
"timestamp": 1543626000
}
]
}
},
"capteur3": {
"id": "hcaptor2_1",
"data": {
"pression": [
{
"value": 688,
"timestamp": 1543622400
},
{
"value": 700,
"timestamp": 1543626000
}
],
"temperature": [
{
"value": 40,
"timestamp": 1543622400
},
{
"value": 33,
"timestamp": 1543626000
}
]
}
}
},
"carteBasse": {
"firmware": "serser48df4e",
"capteur1": {
"id": "bcaptor1_1",
"data": {
"temperature": [
{
"value": 80,
"timestamp": 1543622400
},
{
"value": 85,
"timestamp": 1543626000
}
]
}
},
"capteur2": {
"id": "bcaptor1_1",
"data": {
"co2": [
{
"value": 1251,
"timestamp": 1543622400
},
{
"value": 1345,
"timestamp": 1543626000
}
]
}
}
}
}
}
REFERENCE ASSEMBLY [Newtonsoft.Json];
REFERENCE ASSEMBLY [Microsoft.Analytics.Samples.Formats];
USING Microsoft.Analytics.Samples.Formats.Json;
// Flattens the map-based ("Gen1") device JSON files into one CSV row per data
// point. In this layout boards, captors and mesures are keyed by name:
//   data -> { boardName: { firmware, captorName: { id, data: { mesureName: [points] } } } }
//
// Each JSON fragment is parsed with JsonTuple once and the resulting map is
// indexed for the keys that are needed, rather than re-parsing the fragment
// per key and reading position 0 of the result.

// The {model}, {generation} and {date:...} segments of the path pattern are
// surfaced as the model / generation / date columns of the EXTRACT below.
DECLARE @InputFile string = @"/BHTests/{model}/{generation}/{*}/{*}/{date:yyyy}/{date:MM}/{date:dd}/{*}.json";
// Single place that defines where the result is written (used by OUTPUT).
DECLARE @OutputFile string = @"/Output/BHTests/dayGen1ChangeresBoard.csv";

// One row per input file; "data" holds the still-serialized board map.
@files =
    EXTRACT device_id string,
            timestamp string,
            packageVersion string,
            data string,
            model string,
            generation string,
            date DateTime
    FROM @InputFile
    USING new JsonExtractor();

// Keep only the wanted model/generation/date range and parse the top-level
// "data" object into a boardName -> serialized board map.
// "date" is only needed for this filter, so it is not carried any further.
@board_maps =
    SELECT device_id,
           timestamp,
           packageVersion,
           JsonFunctions.JsonTuple(data) AS boards_by_name,
           model,
           generation
    FROM @files
    WHERE model == "BH20"
          AND generation == "Gen1"
          AND date < DateTime.ParseExact("01/01/2019", "dd/MM/yyyy", null);

// One row per board; the board object is parsed a single time into board_map.
@boards =
    SELECT device_id,
           timestamp,
           packageVersion,
           board_name AS boardName,
           JsonFunctions.JsonTuple(board_json) AS board_map,
           model,
           generation
    FROM @board_maps
         CROSS APPLY
         EXPLODE(boards_by_name) AS b(board_name, board_json);

// One row per captor. Every key of the board object other than "firmware" is
// treated as a captor. The captor JSON is deliberately NOT parsed in this
// statement so the "firmware" entry (a plain string, not a JSON object) is
// filtered out before anything tries to parse it.
@captor_entries =
    SELECT device_id,
           timestamp,
           packageVersion,
           boardName,
           board_map["firmware"] AS boardFirmware,
           captor_name AS captor,
           captor_json,
           model,
           generation
    FROM @boards
         CROSS APPLY
         EXPLODE(board_map) AS c(captor_name, captor_json)
    WHERE captor_name != "firmware";

// Parse each captor object a single time.
@captors =
    SELECT device_id,
           timestamp,
           packageVersion,
           boardName,
           boardFirmware,
           captor,
           JsonFunctions.JsonTuple(captor_json) AS captor_map,
           model,
           generation
    FROM @captor_entries;

// Read the captor id and parse its "data" object into a
// mesureName -> serialized point array map.
// NOTE(review): assumes JsonTuple yields an empty map for a null input when a
// captor has no "data" key, so such captors produce no rows -- confirm against
// the deployed Microsoft.Analytics.Samples.Formats assembly.
@mesure_maps =
    SELECT device_id,
           timestamp,
           packageVersion,
           boardName,
           boardFirmware,
           captor,
           captor_map["id"] AS captorId,
           JsonFunctions.JsonTuple(captor_map["data"]) AS mesure_map,
           model,
           generation
    FROM @captors;

// One row per mesure; split its point array into serialized elements.
@series =
    SELECT device_id,
           timestamp,
           packageVersion,
           boardName,
           boardFirmware,
           captor,
           captorId,
           mesure_name AS mesureName,
           JsonFunctions.JsonTuple(serie_json).Values AS serieArray,
           model,
           generation
    FROM @mesure_maps
         CROSS APPLY
         EXPLODE(mesure_map) AS m(mesure_name, serie_json);

// One row per data point; the point object is parsed a single time.
@points =
    SELECT device_id,
           timestamp,
           packageVersion,
           boardName,
           boardFirmware,
           captor,
           captorId,
           mesureName,
           JsonFunctions.JsonTuple(point_json) AS point_map,
           model,
           generation
    FROM @series
         CROSS APPLY
         EXPLODE(serieArray) AS s(point_json);

// Final shape of the CSV (column order matters for the output file).
@serie =
    SELECT device_id,
           model,
           generation,
           timestamp,
           packageVersion,
           boardName,
           boardFirmware,
           captor,
           captorId,
           mesureName,
           point_map["value"] AS mesureValue,
           point_map["timestamp"] AS mesureTimestamp
    FROM @points;

OUTPUT @serie
TO @OutputFile
ORDER BY device_id,
         timestamp,
         packageVersion,
         boardName,
         boardFirmware,
         captor,
         captorId,
         mesureName,
         mesureTimestamp
USING Outputters.Csv(outputHeader : true, quoting : false);
device_id,model,generation,timestamp,packageVersion,boardName,boardFirmware,captor,captorId,mesureName,mesureValue,mesureTimestamp
1,BH20,Gen1,1543622400,pkg1,carteBasse,serser48df4e,capteur1,bcaptor1_1,temperature,80,1543622400
1,BH20,Gen1,1543622400,pkg1,carteBasse,serser48df4e,capteur1,bcaptor1_1,temperature,85,1543626000
我想问的是:如何改善脚本的性能?因为我认为我调用 JsonFunctions 和执行 CROSS APPLY EXPLODE 的次数太多了。
我应该继续使用“通用”脚本还是应该编写特定的脚本?