Question

目录中有大量文件，每行中都有json格式的条目。文件大小从5k到200MB不等。我有这个代码来遍历每个文件，解析我在json中寻找的数据，最后形成一个数据框。这个脚本需要很长时间才能完成，实际上它永远不会完成。

有没有办法加快速度，以便我能更快地读取文件？

代码：

library(jsonlite)
library(data.table) 

# Work relative to the directory that contains the "Json_files" folder.
setwd("C:/Files/")

#data <- lapply(readLines("test.txt"), fromJSON)

# Empty accumulator that the loop below appends to.
# NOTE(review): the first column is spelled `Timestamp` here but `TimeStamp`
# in the per-file frames built inside the loop -- confirm which is intended.
df<-data.frame(Timestamp=factor(),Source=factor(),Host=factor(),Status=factor())
# Full paths of the input files found under "Json_files".
filenames <- list.files("Json_files", pattern="*.txt", full.names=TRUE)

# Process the files one at a time; `i` is a file path.
for(i in filenames){
  # Progress output: show which file is being read.
  print(i)
  # Every line of the file is parsed as its own JSON document, i.e. one
  # fromJSON() call per line.
  data <- lapply(readLines(i), fromJSON)
  # Build one data.frame per parsed record from four payloadData fields,
  # then bind all of them into a single per-file data frame.
  myDf <- do.call("rbind", lapply(data, function(d) { 
    data.frame(TimeStamp = d$payloadData$timestamp, 
               Source = d$payloadData$source, 
               Host = d$payloadData$host, 
               Status = d$payloadData$status)}))

  # Append this file's rows to the accumulated result. Growing `df` with
  # rbind() inside the loop re-copies the accumulated rows on every
  # iteration, so the cost rises as more files are processed.
  df<-rbind(df,myDf)

}

这是一个示例条目，但文件中有数千个这样的条目：

{"senderDateTimeStamp":"2016/04/08 10:53:18","senderHost":null,"senderAppcode":"app","senderUsecase":"appinternalstats_prod","destinationTopic":"app_appinternalstats_realtimedata_topic","correlatedRecord":false,"needCorrelationCacheCleanup":false,"needCorrelation":false,"correlationAttributes":null,"correlationRecordCount":0,"correlateTimeWindowInMills":0,"lastCorrelationRecord":false,"realtimeESStorage":true,"receiverDateTimeStamp":1460127623591,"payloadData":{"timestamp":"2016-04-08T10:53:18.169","status":"get","source":"STREAM","fund":"JVV","client":"","region":"","evetid":"","osareqid":"","basis":"","pricingdate":"","content":"","msgname":"","recipient":"","objid":"","idlreqno":"","host":"WEB01","servermember":"test"},"payloadDataText":"","key":"app:appinternalstats_prod","destinationTopicName":"app_appinternalstats_realtimedata_topic","hdfsPath":"app/appinternalstats_prod","esindex":"app","estype":"appinternalstats_prod","useCase":"appinternalstats_prod","appCode":"app"}

{"senderDateTimeStamp":"2016/04/08 10:54:18","senderHost":null,"senderAppcode":"app","senderUsecase":"appinternalstats_prod","destinationTopic":"app_appinternalstats_realtimedata_topic","correlatedRecord":false,"needCorrelationCacheCleanup":false,"needCorrelation":false,"correlationAttributes":null,"correlationRecordCount":0,"correlateTimeWindowInMills":0,"lastCorrelationRecord":false,"realtimeESStorage":true,"receiverDateTimeStamp":1460127623591,"payloadData":{"timestamp":"2016-04-08T10:53:18.169","status":"get","source":"STREAM","fund":"JVV","client":"","region":"","evetid":"","osareqid":"","basis":"","pricingdate":"","content":"","msgname":"","recipient":"","objid":"","idlreqno":"","host":"WEB02","servermember":""},"payloadDataText":"","key":"app:appinternalstats_prod","destinationTopicName":"app_appinternalstats_realtimedata_topic","hdfsPath":"app/appinternalstats_prod","esindex":"app","estype":"appinternalstats_prod","useCase":"appinternalstats_prod","appCode":"app"}

{"senderDateTimeStamp":"2016/04/08 10:55:18","senderHost":null,"senderAppcode":"app","senderUsecase":"appinternalstats_prod","destinationTopic":"app_appinternalstats_realtimedata_topic","correlatedRecord":false,"needCorrelationCacheCleanup":false,"needCorrelation":false,"correlationAttributes":null,"correlationRecordCount":0,"correlateTimeWindowInMills":0,"lastCorrelationRecord":false,"realtimeESStorage":true,"receiverDateTimeStamp":1460127623591,"payloadData":{"timestamp":"2016-04-08T10:53:18.169","status":"get","source":"STREAM","fund":"JVV","client":"","region":"","evetid":"","osareqid":"","basis":"","pricingdate":"","content":"","msgname":"","recipient":"","objid":"","idlreqno":"","host":"WEB02","servermember":""},"payloadDataText":"","key":"app:appinternalstats_prod","destinationTopicName":"app_appinternalstats_realtimedata_topic","hdfsPath":"app/appinternalstats_prod","esindex":"app","estype":"appinternalstats_prod","useCase":"appinternalstats_prod","appCode":"app"}

Answer 1

将您的示例数据保存在 "c:/tmp.txt" 中：

> df <- jsonlite::fromJSON(paste0("[",paste0(readLines("c:/tmp.txt"),collapse=","),"]"))$payloadData[c("timestamp","source","host","status")]
> df
                timestamp source  host status
1 2016-04-08T10:53:18.169 STREAM WEB01    get
2 2016-04-08T10:53:18.169 STREAM WEB02    get
3 2016-04-08T10:53:18.169 STREAM WEB02    get

因此，要调整代码以获取数据帧列表：

# Read every file in `filenames` (one JSON object per line) and return a list
# with one data frame per file, holding the payloadData columns timestamp,
# source, host and status.
#
# The lines of a file are joined into a single JSON array string so that
# fromJSON() is called once per file instead of once per line.
dflist <- lapply(filenames, function(i) {
  lines <- readLines(i)
  # Drop blank / whitespace-only lines: joining them with "," would produce
  # ",," and make the resulting array invalid JSON.
  lines <- lines[nzchar(trimws(lines))]
  jsonlite::fromJSON(
    paste0("[", paste0(lines, collapse = ","), "]")
  )$payloadData[c("timestamp", "source", "host", "status")]
})

我们的想法是将您的行（从readLines）转换为大型json数组，然后通过将其解析为json来创建数据帧。

正如 lmo 已经展示的那样，在您的 filenames 列表上使用 lapply 会得到一个数据框列表；如果您确实只想要一个数据框，可以加载 data.table 包，然后对 dflist 使用 rbindlist，将其合并为单个数据框。

或者，如果内存不足，这个帖子（this thread）可能对您有所帮助。

Answer 2

一种加速方法是将 for 循环替换为 lapply，然后去掉最后的 rbind。这样做之所以更快，是因为 R 不必在处理这“一批”文件的过程中反复复制越来越大的对象 df。结果将存储在一个方便的列表中，您可以按原样使用它，也可以一次性转换为 data.frame：

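# Wrap the original loop body in a function that handles a single file
getData <- function(i) {
  data <- lapply(readLines(i), fromJSON)
  do.call("rbind", lapply(data, function(d) {
    data.frame(TimeStamp = d$payloadData$timestamp,
               Source = d$payloadData$source,
               Host = d$payloadData$host,
               Status = d$payloadData$status)
  }))
}

# One data frame per file, collected in a list
myDataList <- lapply(filenames, getData)

# Combine into a single data.frame in one go
df <- do.call(rbind, myDataList)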

是否可以在R中处理文件读取和解析

2 个答案: