Converting a JSON file to a CSV file using R

Date: 2016-06-23 16:12:58

Tags: json r csv

I have JSON data in a .txt file that I am trying to load into R, but I keep getting an error. My JSON file looks like the following:

/* 0 */
{
  "_id" : "93ccbdb6-8947",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1KKP",
    "queryId" : "93ccbdb6-8947",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 0,
  "requestDate" : 20151205,
  "totalRecords" : 0,
  "status" : "SUCCESS"
}

/* 1 */
{
  "_id" : "b736c374-b8ae",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1KKP",
    "queryId" : "b736c374-b8ae",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 0,
  "requestDate" : 20151205,
  "totalRecords" : 0,
  "status" : "SUCCESS"
}

/* 2 */
{
  "_id" : "3312605f-8304",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1SXE",
    "queryId" : "3312605f-8304",
    "subRequests" : [{
        "origin" : "LON",
        "destination" : "IAD",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 2,
  "requestDate" : 20151205,
  "totalRecords" : 0,
  "status" : "SUCCESS"
}

/* 3 */
{
  "_id" : "6b668cfa-9b79",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1NXA",
    "queryId" : "6b668cfa-9b79",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 1,
  "requestDate" : 20151205,
  "totalRecords" : 1388,
  "status" : "SUCCESS"
}

/* 4 */
{
  "_id" : "41c373a1-e4cb",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP6CXS",
    "queryId" : "41c373a1-e4cb",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 0,
  "requestDate" : 20151205,
  "totalRecords" : 1388,
  "status" : "SUCCESS"
}

/* 5 */
{
  "_id" : "2c8331c4-21ca",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1KKP",
    "queryId" : "2c8331c4-21ca",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 0,
  "requestDate" : 20151205,
  "totalRecords" : 1388,
  "status" : "SUCCESS"
}

/* 6 */
{
  "_id" : "71a09900-1c13",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP6CXS",
    "queryId" : "71a09900-1c13",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AF",
        "fareClasses" : "",
        "owrt" : "1,2"
      }, {
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }, {
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "DL",
        "fareClasses" : "",
        "owrt" : "1,2"
      }, {
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "LH",
        "fareClasses" : "",
        "owrt" : "1,2"
      }, {
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "BA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 0,
  "requestDate" : 20151205,
  "totalRecords" : 6941,
  "status" : "SUCCESS"
}

/* 7 */
{
  "_id" : "a036a42a-918b",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1MMM",
    "queryId" : "a036a42a-918b",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 0,
  "requestDate" : 20151205,
  "totalRecords" : 1388,
  "status" : "SUCCESS"
}

/* 8 */
{
  "_id" : "c547be36-805c",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1SXB",
    "queryId" : "c547be36-805c",
    "subRequests" : [{
        "origin" : "CHI",
        "destination" : "LON",
        "carrier" : "BA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 2,
  "requestDate" : 20151205,
  "totalRecords" : 1072,
  "status" : "SUCCESS"
}

I'm assuming the error is caused by the multiple instances of /* (number) */, and I can't remove them manually because my file has around 10k of them. Is there a way to remove these before loading the data into R?

My code is below (though I haven't gotten very far with it):

library(jsonlite)
library(RJSONIO)
json_data_raw<-fromJSON("mydata.txt")

json_file <- lapply(json_data_raw, function(x) {
  x[sapply(x, is.null)] <- NA
  unlist(x)
})

output <- do.call("rbind", json_file)
write.csv(output, file = "json.csv", row.names = FALSE)
file.show("json.csv")

I'm trying to get the output into a CSV file like below.

2 Answers:

Answer 0 (score: 5)

There are several problems with your text file. As you have already noticed, you need to remove the lines of the form /* 0 */. What remains is still not valid JSON, though: if you want to have several JSON objects in one file, they need to be stored inside an array. A JSON object is a part enclosed in curly braces, e.g.,

{
  "_id" : "93ccbdb6-8947-4687-8e12-edf4e40d6650",
  ...
  "totalRecords" : 0,
  "status" : "SUCCESS"
}

An array of objects has the following structure:

[
  {
    ...
  },
  {
    ...
  }
]

To get your file into this shape, you need to add commas between the objects and wrap everything in square brackets. You can do this as follows:

raw <- readLines("mydata.txt")

# get rid of the "/* 0 */" lines
json <- grep("^/\\* [0-9]* \\*/", raw, value = TRUE, invert = TRUE)

# add missing comma after }
n <- length(json)
json[-n] <- gsub("^}$", "},", json[-n])

# add brackets at the beginning and end
json <- c("[", json, "]")

This can now be read by fromJSON(), so it should be valid JSON:

library(jsonlite)
table <- fromJSON(json)

The resulting table is nested, that is, some of the table cells themselves contain a data frame or a list. For example,

table[1,2]
##   travelDate travelDuration shopperDuration oneWay  userId                              queryId
## 1   20151206              7              30  FALSE ATP1KKP 93ccbdb6-8947-4687-8e12-edf4e40d6650
##               subRequests
##     1 WAS, LON, AA, , 1,2

You can use flatten() from the jsonlite package to get a table with one level of nesting less:

flatten(table)[1:3, c(1, 6, 12)]
##                                    _id uiSearchRequest.travelDate uiSearchRequest.subRequests
## 1 93ccbdb6-8947-4687-8e12-edf4e40d6650                   20151206         WAS, LON, AA, , 1,2
## 2 b736c374-b8ae-4e99-8073-9c54517fecd5                   20151206         WAS, LON, AA, , 1,2
## 3 3312605f-8304-4ab8-96d6-6e1a03cfbd9e                   20151206         LON, IAD, AA, , 1,2

The last column is still a list. There are many ways to deal with this. One possibility is to create one row per sub-request, where the contents of all the other columns (X_id, downloadCount, etc.) are repeated. (This is almost the form you gave in your question; the only difference is that you left blanks in the repeated columns, while here the contents are actually repeated.) This is how it can be done:

table <- flatten(fromJSON(json))
tab_list <- lapply(1:nrow(table),
                  function(i) data.frame(table[i, -12], table[i, 12],
                              stringsAsFactors = FALSE))
library(dplyr)
flat_table <- bind_rows(tab_list)

The second line creates a list of data frames. These are combined into a single data frame using bind_rows() from dplyr. (More precisely, flat_table will be a tbl_df, but the difference from a data.frame is minor.) It can then be written to a csv file in the usual way:

write.csv(flat_table, file = "mydata.csv")
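
As an aside, roughly the same flattening can be done with tidyr's unnest(), which expands a list column of data frames into one row per element (a sketch, assuming tidyr 1.0 or later and the column name uiSearchRequest.subRequests produced by flatten() above):

library(jsonlite)
library(tidyr)

table <- flatten(fromJSON(json))
# expand the list column into one row per sub-request; the other columns are repeated
flat_table2 <- unnest(table, cols = `uiSearchRequest.subRequests`)
write.csv(flat_table2, file = "mydata.csv", row.names = FALSE)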

Answer 1 (score: 0)

This is very simple in Python:

import pandas as pd
data = pd.read_json(path_to_input_file)
data.to_csv(path_to_csv_output_file)