我正在努力将多个JSON文件拉入R
我的JSON数据看起来像这样(我只显示了2个样本,但有近800个样本)
{
"ID": 9,
"BCOUNT": 800,
"MEASUREMENT": [
{
"MEAS_ID": 1,
"PDATETIME": "2017-01-14 16:00:59",
"STATUS": "Pass",
"PROCESS_SAMPLES": [
{
"NUMBER": 1,
"LENGTH": 31.5,
"HEIGHT": 30.9,
"WIDTH": 80.91,
"BREADTH": 54
},
{
"NUMBER": 2,
"LENGTH": 41.5,
"HEIGHT": 40.9,
"WIDTH": 60.91,
"BREADTH": 74
}
]
}
]
}
我有接近100个文件,每个文件超过~15 MB。我试图在R中转换它并做一些分析。
目标是根据PROCESS_SAMPLES中的测量值来预测STATUS。
我试图通过这种方式将JSON格式转换为R数据帧,以便获得时间序列数据集。
set.seed(12345)
path <- "~/data"
packages <- c("jsonlite", "dplyr", "purrr", "tidyjson", "tidyr", "data.table")
purrr::walk(packages, library, character.only = TRUE, warn.conflicts = FALSE)

# list.files() already returns a character vector, so there is no need to wrap
# it in a data.frame and repair the auto-generated column name afterwards.
# Note the pattern is a regex, not a glob: "*.json" matches the wrong things,
# "\\.json$" matches files ending in .json.
json_files <- list.files(path, pattern = "\\.json$", full.names = TRUE)

# Preallocated result list. Named `parsed` rather than `ls`, which would
# shadow base::ls().
parsed <- vector("list", length(json_files))
for (j in seq_along(json_files)) {
  cat(j, "\n")  # progress indicator
  # tidyjson parses JSON *text*, not file paths — piping the filename string
  # into spread_values() never reads the file. Read the contents first.
  json_text <- paste(readLines(json_files[j], warn = FALSE), collapse = "\n")
  df <- json_text %>%
    spread_values(ID = jstring("ID")) %>%
    enter_object("MEASUREMENT") %>% gather_array %>%
    spread_values(MEAS_ID = jnumber("MEAS_ID"),
                  STATUS = jstring("STATUS"),
                  PDATETIME = jstring("PDATETIME")) %>%
    enter_object("PROCESS_SAMPLES") %>% gather_array %>%
    spread_values(NUMBER = jnumber("NUMBER"), LENGTH = jnumber("LENGTH"),
                  HEIGHT = jnumber("HEIGHT"), WIDTH = jnumber("WIDTH"),
                  BREADTH = jnumber("BREADTH")) %>%
    select(ID, MEAS_ID, STATUS, PDATETIME, NUMBER, LENGTH, HEIGHT, WIDTH, BREADTH)
  parsed[[j]] <- unique(df)
}
# rbind.fill lives in plyr, which is never loaded here; dplyr::bind_rows
# (already attached) handles differing column sets the same way and is faster
# than repeated rbind-style growth.
df_samples <- dplyr::bind_rows(parsed)
此代码处理约100个文件需要很长时间。我怎样才能加快这个过程？以及如何对STATUS进行预测？
有人能指出我正确的方向吗?
答案 0（得分：0）
# Four sample JSON documents inlined as strings so the example is
# self-contained; in practice each element would be the text of one file
# (e.g. from readLines()). IDs 9-10 have STATUS "Pass", IDs 11-12 "Fail",
# and the files deliberately differ in how many PROCESS_SAMPLES they carry.
content <- list( ' { "ID": 9, "BCOUNT": 800, "MEASUREMENT": [ { "MEAS_ID": 1, "PDATETIME": "2017-01-14 16:00:59", "STATUS": "Pass", "PROCESS_SAMPLES": [ { "NUMBER": 1, "LENGTH": 31.5, "HEIGHT": 30.9, "WIDTH": 80.91, "BREADTH": 54 }, { "NUMBER": 2, "LENGTH": 41.5, "HEIGHT": 40.9, "WIDTH": 60.91, "BREADTH": 74 } ] } ] } '
,
' { "ID": 10, "BCOUNT": 900, "MEASUREMENT": [ { "MEAS_ID": 1, "PDATETIME": "2017-01-15 16:00:59", "STATUS": "Pass", "PROCESS_SAMPLES": [ { "NUMBER": 1, "LENGTH": 31.5, "HEIGHT": 30.9, "WIDTH": 80.91, "BREADTH": 54 }, { "NUMBER": 1, "LENGTH": 33.5, "HEIGHT": 34.9, "WIDTH": 92.91, "BREADTH": 12 }, { "NUMBER": 2, "LENGTH": 41.5, "HEIGHT": 40.9, "WIDTH": 60.91, "BREADTH": 74 } ] } ] } '
,
' { "ID": 11, "BCOUNT": 900, "MEASUREMENT": [ { "MEAS_ID": 1, "PDATETIME": "2017-01-16 16:00:59", "STATUS": "Fail", "PROCESS_SAMPLES": [ { "NUMBER": 1, "LENGTH": 100, "HEIGHT": 30.9, "WIDTH": 80.91, "BREADTH": 54 }, { "NUMBER": 1, "LENGTH": 120, "HEIGHT": 34.9, "WIDTH": 92.91, "BREADTH": 12 }, { "NUMBER": 2, "LENGTH": 130, "HEIGHT": 40.9, "WIDTH": 60.91, "BREADTH": 74 } ] } ] } '
,
' { "ID": 12, "BCOUNT": 900, "MEASUREMENT": [ { "MEAS_ID": 1, "PDATETIME": "2017-01-17 16:00:59", "STATUS": "Fail", "PROCESS_SAMPLES": [ { "NUMBER": 1, "LENGTH": 220, "HEIGHT": 30.9, "WIDTH": 80.91, "BREADTH": 54 }, { "NUMBER": 1, "LENGTH": 200, "HEIGHT": 34.9, "WIDTH": 92.91, "BREADTH": 12 } ] } ] } '
)
# Flatten each JSON document into one data frame row per PROCESS_SAMPLE,
# carrying the file-level (ID, BCOUNT) and measurement-level
# (MEAS_ID, PDATETIME, STATUS) fields down onto every sample row.
#
# The original version used dat$MEASUREMENT[["PROCESS_SAMPLES"]][[1]],
# which only keeps the FIRST measurement's samples, and relied on cbind()
# recycling that silently misaligns STATUS/PDATETIME with the sample rows
# whenever a file contains more than one MEASUREMENT entry. Iterating the
# measurements explicitly is correct for any number of them.
result <- do.call(plyr::rbind.fill, lapply(content, function(js) {
  dat <- jsonlite::fromJSON(js)
  meas <- dat$MEASUREMENT  # data.frame: one row per measurement
  do.call(rbind, lapply(seq_len(nrow(meas)), function(i) {
    samples <- meas$PROCESS_SAMPLES[[i]]  # data.frame of this measurement's samples
    # data.frame() recycles the scalar fields across all sample rows and
    # preserves the sample column names (NUMBER, LENGTH, HEIGHT, ...).
    data.frame(
      ID = dat$ID,
      BCOUNT = dat$BCOUNT,
      MEAS_ID = meas$MEAS_ID[i],
      PDATETIME = meas$PDATETIME[i],
      STATUS = meas$STATUS[i],
      samples,
      row.names = NULL
    )
  }))
}))
# Convert types for modelling: a real timestamp and a 0/1 outcome.
result$PDATETIME <- as.POSIXct(result$PDATETIME)
# Encode the binary outcome as 0 (Pass) / 1 (Fail). The original
# as.numeric(factor(...)) produced an odd 1/2 coding that a gaussian glm
# would then treat as a continuous response.
result$STATUS <- as.integer(result$STATUS == "Fail")
# STATUS is binary, so fit a logistic regression (binomial family), not an
# ordinary least-squares glm. Don't actually use this as-is! You have to
# experiment with different models to find out what works.
fit <- glm(STATUS ~ ., data = result, family = binomial)
# type = "response" returns the predicted probability of "Fail".
# Don't actually do this either -- you have to create training, tuning and
# testing sets.
predict(fit, result[c(1, 10), ], type = "response")