导入多个JSON文件并在R中进行处理

时间:2017-09-06 18:46:09

标签: json r tidyr jsonlite

我正在努力将多个JSON文件拉入R

我的JSON数据看起来像这样(我只显示了2个样本,但有近800个样本)

{
  "ID": 9, 
  "BCOUNT": 800,
  "MEASUREMENT": [ 
    {
      "MEAS_ID": 1,
      "PDATETIME": "2017-01-14 16:00:59", 
      "STATUS": "Pass",
      "PROCESS_SAMPLES": [ 
        {
          "NUMBER": 1, 
          "LENGTH": 31.5, 
          "HEIGHT": 30.9, 
          "WIDTH": 80.91,  
          "BREADTH": 54 
        },
        {
          "NUMBER": 2, 
          "LENGTH": 41.5, 
          "HEIGHT": 40.9, 
          "WIDTH": 60.91,  
          "BREADTH": 74 
        }
      ]
    }
  ]
}

我有接近100个文件,每个文件超过~15 MB。我试图在R中转换它并做一些分析。

目标是根据PROCESS SAMPLES中的时间序列数据预测STATUS。

我试图通过这种方式将JSON格式转换为R数据帧,以便获得时间序列数据集。

# Import every JSON file found under `path`, flatten each one to one row per
# process sample, and stack the per-file tables into `df_samples`.
set.seed(12345)
path <- "~/data"

packages <- c("jsonlite", "dplyr", "purrr", "tidyjson", "tidyr", "data.table")
purrr::walk(packages, library, character.only = TRUE, warn.conflicts = FALSE)

# `pattern` is a regular expression, not a shell glob: anchor on the extension
# so only names that really end in ".json" are selected. Naming the column at
# construction avoids renaming an auto-generated column afterwards.
temp <- data.frame(
  filename = list.files(path, pattern = "\\.json$", full.names = TRUE),
  stringsAsFactors = FALSE
)

# Preallocate one slot per file instead of growing the list in the loop.
ls <- vector("list", nrow(temp))

# seq_len() yields an empty sequence when no files were found, so the loop is
# skipped instead of iterating over c(1, 0).
for (j in seq_len(nrow(temp))) {
  cat(j, "\n")

  # NOTE(review): the file path string is piped straight into tidyjson;
  # presumably the underlying JSON parser resolves it to the file's contents
  # -- confirm against the installed tidyjson/jsonlite versions.
  df <- temp$filename[[j]] %>%
    spread_values(ID = jstring("ID")) %>%
    enter_object("MEASUREMENT") %>%
    gather_array %>%
    spread_values(
      MEAS_ID = jnumber("MEAS_ID"),
      STATUS = jstring("STATUS"),
      PDATETIME = jstring("PDATETIME")
    ) %>%
    enter_object("PROCESS_SAMPLES") %>%
    gather_array %>%
    spread_values(
      NUMBER = jnumber("NUMBER"),
      LENGTH = jnumber("LENGTH"),
      HEIGHT = jnumber("HEIGHT"),
      WIDTH = jnumber("WIDTH"),
      BREADTH = jnumber("BREADTH")
    ) %>%
    select(ID, MEAS_ID, STATUS, PDATETIME, NUMBER, LENGTH, HEIGHT, WIDTH, BREADTH)

  # Drop duplicated rows within a file before storing it.
  ls[[j]] <- unique(df)
}

# rbind.fill() is provided by plyr, which is not among the attached packages;
# call it through its namespace so the lookup cannot fail.
df_samples <- do.call(plyr::rbind.fill, ls)

此代码需要很长时间才能处理约100个文件。我怎样才能加快这个过程?以及如何进行预测STATUS的分析?

有人能指出我正确的方向吗?

1 个答案:

答案 0 :(得分:0)

# Four inline sample documents (one JSON string per record), standing in for
# the contents of the real files: two "Pass" records followed by two "Fail".
content <- list(
  ' {   "ID": 9,   "BCOUNT": 800,   "MEASUREMENT": [   {   "MEAS_ID": 1,   "PDATETIME": "2017-01-14 16:00:59",   "STATUS": "Pass",   "PROCESS_SAMPLES": [   {   "NUMBER": 1,   "LENGTH": 31.5,   "HEIGHT": 30.9,   "WIDTH": 80.91,    "BREADTH": 54   },   {   "NUMBER": 2,   "LENGTH": 41.5,   "HEIGHT": 40.9,   "WIDTH": 60.91,    "BREADTH": 74   }   ]   }   ]   } ',
  ' {   "ID": 10,   "BCOUNT": 900,   "MEASUREMENT": [   {   "MEAS_ID": 1,   "PDATETIME": "2017-01-15 16:00:59",   "STATUS": "Pass",   "PROCESS_SAMPLES": [   {   "NUMBER": 1,   "LENGTH": 31.5,   "HEIGHT": 30.9,   "WIDTH": 80.91,    "BREADTH": 54   },   {   "NUMBER": 1,   "LENGTH": 33.5,   "HEIGHT": 34.9,   "WIDTH": 92.91,    "BREADTH": 12   },   {   "NUMBER": 2,   "LENGTH": 41.5,   "HEIGHT": 40.9,   "WIDTH": 60.91,    "BREADTH": 74   }   ]   }   ]   } ',
  ' {   "ID": 11,   "BCOUNT": 900,   "MEASUREMENT": [   {   "MEAS_ID": 1,   "PDATETIME": "2017-01-16 16:00:59",   "STATUS": "Fail",   "PROCESS_SAMPLES": [   {   "NUMBER": 1,   "LENGTH": 100,   "HEIGHT": 30.9,   "WIDTH": 80.91,    "BREADTH": 54   },   {   "NUMBER": 1,   "LENGTH": 120,   "HEIGHT": 34.9,   "WIDTH": 92.91,    "BREADTH": 12   },   {   "NUMBER": 2,   "LENGTH": 130,   "HEIGHT": 40.9,   "WIDTH": 60.91,    "BREADTH": 74   }   ]   }   ]   } ',
  ' {   "ID": 12,   "BCOUNT": 900,   "MEASUREMENT": [   {   "MEAS_ID": 1,   "PDATETIME": "2017-01-17 16:00:59",   "STATUS": "Fail",   "PROCESS_SAMPLES": [   {   "NUMBER": 1,   "LENGTH": 220,   "HEIGHT": 30.9,   "WIDTH": 80.91,    "BREADTH": 54   },   {   "NUMBER": 1,   "LENGTH": 200,   "HEIGHT": 34.9,   "WIDTH": 92.91,    "BREADTH": 12   }   ]   }   ]   } '
)
# Flatten one JSON document into a data frame with one row per process sample.
#
# js: a single JSON string (or anything jsonlite::fromJSON accepts) holding
#     ID, BCOUNT and a MEASUREMENT array whose entries each carry MEAS_ID,
#     PDATETIME, STATUS and a PROCESS_SAMPLES array.
# Returns a data frame with ID, BCOUNT, the measurement fields and the sample
# fields. Every measurement is paired with its own samples, not only the
# first one; a measurement without samples contributes a single row.
flatten_record <- function(js) {
  dat <- jsonlite::fromJSON(js)
  meas <- dat$MEASUREMENT

  # Nothing to expand: keep the record-level fields only.
  if (!is.data.frame(meas) || nrow(meas) == 0L) {
    return(data.frame(ID = dat$ID, BCOUNT = dat$BCOUNT))
  }

  sample_sets <- meas[["PROCESS_SAMPLES"]]

  per_measurement <- lapply(seq_len(nrow(meas)), function(i) {
    header <- meas[i, c("MEAS_ID", "PDATETIME", "STATUS"), drop = FALSE]
    # Reset row names so cbind() recycles the one-row header against the
    # samples instead of trying to reuse the header's row names.
    rownames(header) <- NULL

    samples <- if (is.null(sample_sets)) NULL else sample_sets[[i]]
    if (!is.data.frame(samples) || nrow(samples) == 0L) {
      return(cbind(ID = dat$ID, BCOUNT = dat$BCOUNT, header))
    }
    rownames(samples) <- NULL

    cbind(ID = dat$ID, BCOUNT = dat$BCOUNT, header, samples)
  })

  # rbind.fill() pads columns that are absent from some pieces with NA.
  do.call(plyr::rbind.fill, per_measurement)
}

# Stack the flattened records of all documents into one table.
result <- do.call(plyr::rbind.fill, lapply(content, flatten_record))
# Turn the timestamp strings into date-times.
result[["PDATETIME"]] <- as.POSIXct(result[["PDATETIME"]])
# Encode STATUS by its position in c("Pass", "Fail"): Pass = 1, Fail = 2;
# any other value becomes NA.
result[["STATUS"]] <- as.numeric(match(result[["STATUS"]], c("Pass", "Fail")))
# Placeholder model: STATUS regressed on every other column.
fit <- glm(formula = STATUS ~ ., data = result) #Don't actually use this! You have to experiment with different models to find out what works..
# Placeholder prediction for rows 1 and 10 of the data the model was fitted on.
predict(fit, newdata = result[c(1, 10), ]) #Don't actually do this either -- you have to create  training, tuning and testing sets.