从CSV文件中提取JSON数据

时间:2017-12-06 13:46:58

标签: json r csv

我正在尝试提取JSON数据,它是CSV文件中的一列。到目前为止,我已经能够以正确的格式提取该列,但只有当变量类型为因子(factor)时格式才是正确的,而我无法使用jsonlite包将因子转换为JSON。

[1] {"id":509746197991998767,"visibility":{"percentage":100,"time":149797,"visible1":true,"visible2":false,"visible3":false,"activetab":true},"interaction":{"mouseovercount":1,"mouseovertime":1426,"videoplaytime":0,"engagementtime":0,"expandtime":0,"exposuretime":35192}}

另一种方法是在导入时使用stringsAsFactors = F,但这样我很难得到正确的格式,每个条目看起来像这样:

[1] "{\"id\":509746197991998767,\"visibility\":{\"percentage\":100,\"time\":149797,\"visible1\":true,\"visible2\":false,\"visible3\":false,\"activetab\":true},\"interaction\":{\"mouseovercount\":1,\"mouseovertime\":1426,\"videoplaytime\":0,\"engagementtime\":0,\"expandtime\":0,\"exposuretime\":35192}}"

我错过了一些明显的东西吗?我只是想要提取CSV文件中的JSON文件。

下面是CSV文件的一个小例子:

"","CookieID","UnloadVars"
"1",-8857188784608690176,"{""id"":509746197991998767,""visibility"":{""percentage"":100,""time"":149797,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":1,""mouseovertime"":1426,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":35192}}"
"2",-1695626857458244096,"{""id"":2917654329769114342,""visibility"":{""percentage"":46,""time"":0,""visible1"":false,""visible2"":false,""visible3"":false,""activetab"":true}}"
"3",437299165071669184,"{""id"":2252707957388071809,""visibility"":{""percentage"":99,""time"":10168,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":0,""mouseovertime"":0,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":542},""clicks"":[{""x"":105,""y"":449}]}"
"4",292660729552227520,""
"5",7036383942916227072,"{""id"":2299674593327687292,""visibility"":{""percentage"":76,""time"":1145,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":0,""mouseovertime"":0,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":74},""clicks"":[{""x"":197,""y"":135},{""x"":197,""y"":135}]}"

此致

弗雷德里克。

3 个答案:

答案 0 :(得分:1)

# Build the example data frame from the sample CSV, passed inline as literal
# text instead of a file path.
# col_types = "-cc" is readr's compact column spec: "-" skips the first
# (row-number) column, and the two "c"s read CookieID and UnloadVars as
# character. Reading CookieID as text keeps the 19-digit ids exact; a double
# cannot represent every integer of that size.
df <- readr::read_csv('"","CookieID","UnloadVars"
"1",-8857188784608690176,"{""id"":509746197991998767,""visibility"":{""percentage"":100,""time"":149797,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":1,""mouseovertime"":1426,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":35192}}"
"2",-1695626857458244096,"{""id"":2917654329769114342,""visibility"":{""percentage"":46,""time"":0,""visible1"":false,""visible2"":false,""visible3"":false,""activetab"":true}}"
"3",437299165071669184,"{""id"":2252707957388071809,""visibility"":{""percentage"":99,""time"":10168,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":0,""mouseovertime"":0,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":542},""clicks"":[{""x"":105,""y"":449}]}"
"4",292660729552227520,""
"5",7036383942916227072,"{""id"":2299674593327687292,""visibility"":{""percentage"":76,""time"":1145,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":0,""mouseovertime"":0,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":74},""clicks"":[{""x"":197,""y"":135},{""x"":197,""y"":135}]}"',
col_types = "-cc")

在每个单独的值上使用jsonlite::fromJSON,然后tidyr::unnest

library(dplyr)

# Convert one UnloadVars cell into a data frame.
#
# .x: a single JSON string, "" or NA.
# Returns: for NA or "", a data frame with one row and no columns, so the
#   row is kept (filled with NA) when the results are unnested; otherwise the
#   parsed JSON flattened by as.data.frame().
#
# The ids in this data have 18-19 digits, more than a double can hold exactly,
# so a plain fromJSON() rounds them (509746197991998767 comes back as
# 509746197991998784). bigint_as_char = TRUE makes jsonlite return such
# integers as strings instead.
f <- function(.x) {
  if (is.na(.x) || .x == "") {
    return(data.frame()[1, ])
  }

  parsed <- jsonlite::fromJSON(.x, bigint_as_char = TRUE)

  # Ids small enough to fit a double are still returned as numbers; render
  # them as plain digit strings so the id column has one type in every row.
  id <- if (is.list(parsed)) parsed[["id"]] else NULL
  if (is.numeric(id)) {
    parsed[["id"]] <- format(id, scientific = FALSE, trim = TRUE)
  }

  # stringsAsFactors = FALSE keeps the id strings as character on R < 4.0.
  as.data.frame(parsed, stringsAsFactors = FALSE)
}

# Parse every UnloadVars cell with f(), expand the resulting data frames into
# regular columns, then convert every column whose name ends in "id" to
# character. The result is not assigned, so it is printed.
mutate_at(
  tidyr::unnest(df, UnloadVars = lapply(UnloadVars, f)),
  vars(ends_with("id")),
  as.character
)

# A tibble: 6 x 16
#               CookieID                  id visibility.percentage visibility.time visibility.visible1 visibility.visible2 visibility.visible3 visibility.activetab interaction.mouseovercount interaction.mouseovertime interaction.videoplaytime interaction.engagementtime interaction.expandtime interaction.exposuretime clicks.x clicks.y
#                  <chr>               <chr>                 <int>           <int>               <lgl>               <lgl>               <lgl>                <lgl>                      <int>                     <int>                     <int>                      <int>                  <int>                    <int>    <int>    <int>
# 1 -8857188784608690176  509746197991998784                   100          149797                TRUE               FALSE               FALSE                 TRUE                          1                      1426                         0                          0                      0                    35192       NA       NA
# 2 -1695626857458244096 2917654329769114112                    46               0               FALSE               FALSE               FALSE                 TRUE                         NA                        NA                        NA                         NA                     NA                       NA       NA       NA
# 3   437299165071669184 2252707957388071936                    99           10168                TRUE               FALSE               FALSE                 TRUE                          0                         0                         0                          0                      0                      542      105      449
# 4   292660729552227520                <NA>                    NA              NA                  NA                  NA                  NA                   NA                         NA                        NA                        NA                         NA                     NA                       NA       NA       NA
# 5  7036383942916227072 2299674593327687168                    76            1145                TRUE               FALSE               FALSE                 TRUE                          0                         0                         0                          0                      0                       74      197      135
# 6  7036383942916227072 2299674593327687168                    76            1145                TRUE               FALSE               FALSE                 TRUE                          0                         0                         0                          0                      0                       74      197      135

答案 1 :(得分:0)

我使用readr::read_csv读取您的示例数据集。

> df <- readr::read_csv('~/sample.csv')
Parsed with column specification:
cols(
  CookieID = col_double(),
  UnloadVars = col_character()
)

正如您所看到的,UnloadVars是作为字符而非因子读入的。如果我现在检查UnloadVars列中的第一个值,会看到以下内容,与您得到的内容一致:

> df$UnloadVars[1]
[1] "{\"id\":509746197991998767,\"visibility\":{\"percentage\":100,\"time\":149797,\"visible1\":true,\"visible2\":false,\"visible3\":false,\"activetab\":true},\"interaction\":{\"mouseovercount\":1,\"mouseovertime\":1426,\"videoplaytime\":0,\"engagementtime\":0,\"expandtime\":0,\"exposuretime\":35192}}"

现在,我使用jsonlite::fromJSON,

> j <- jsonlite::fromJSON(df$UnloadVars[1])
> j
$id
[1] 5.097462e+17

$visibility
$visibility$percentage
[1] 100

$visibility$time
[1] 149797

$visibility$visible1
[1] TRUE

$visibility$visible2
[1] FALSE

$visibility$visible3
[1] FALSE

$visibility$activetab
[1] TRUE


$interaction
$interaction$mouseovercount
[1] 1

$interaction$mouseovertime
[1] 1426

$interaction$videoplaytime
[1] 0

$interaction$engagementtime
[1] 0

$interaction$expandtime
[1] 0

$interaction$exposuretime
[1] 35192

我认为这就是您需要的,因为JSON已被解析为R中的列表。

答案 2 :(得分:0)

处理JSON数据可能非常棘手。作为一般指导,您应始终努力将数据放在数据框中。然而,这并非总是可行的。在这个具体的例子中,我没有找到在一个格式良好的数据框中同时容纳visibility和interaction值的方法。

接下来我要做的是将interaction中的信息提取到数据框中。

加载所需的包并读取数据

library(purrr)
library(dplyr)
library(tidyr)
# Load the sample file from the working directory; text columns stay as
# character vectors rather than being converted to factors.
df <- read.csv(file = "sample.csv", stringsAsFactors = FALSE)

然后删除无效的JSON

# Keep only the rows whose UnloadVars holds some text; rows with an empty
# string (the 4th row of the sample) carry no JSON to parse.
df <- dplyr::filter(df, UnloadVars != "")

将每个JSON转换为列表并将其放入UnloadVars列。如果您不知道,可以在数据框中包含列表列。这非常有用。

# Parse every JSON string in one pass and store the results in a list column.
#
# - lapply() over the whole column replaces the row-by-row loop, so the data
#   frame is no longer re-bound on every iteration, and a zero-row `df` gives
#   a zero-row `out` (`1:nrow(df)` would have iterated over c(1, 0)).
# - bigint_as_char = TRUE makes jsonlite return integers that are too large
#   for a double as strings, so the 18-19 digit ids are not rounded.
# - tibble() replaces the deprecated data_frame().
# - as.numeric() keeps CookieID numeric, as the original prototype column was.
out <- tibble(
  CookieID = as.numeric(df$CookieID),
  UnloadVars = lapply(
    df$UnloadVars,
    jsonlite::fromJSON,
    bigint_as_char = TRUE
  )
)

out

我们现在可以从UnloadVars中的列表中提取ID。这很简单,因为每个列表只有一个ID。

# Add the id of each parsed JSON object as a character column.
# map_chr() must be handed a string: relying on it to coerce a number is
# deprecated in purrr 1.0.0, and older versions rendered the value with a
# trailing ".000000". format(scientific = FALSE) writes a numeric id as plain
# digits and leaves an id that is already a string unchanged.
out <- out %>%
  mutate(
    id = map_chr(
      UnloadVars,
      function(vars) format(vars$id, scientific = FALSE, trim = TRUE)
    )
  )

这最后一部分看起来有点令人生畏。但我在这里做的是从UnloadVars列中取出interaction部分并将其放入interaction列。然后,我将interaction(列表)中的每一行转换为包含两列的数据框:key和value。key包含交互指标的名称,value是该指标的值。最后我对它执行unnest,这样就去掉了列表列,最终得到一个格式良好的数据框。

# Reshape a named list of metrics into a long two-column data frame.
#
# obj: a list (or NULL) that as.data.frame() can convert.
# key_name: accepted for compatibility with existing callers but never used;
#   because it is never evaluated, callers may pass anything.
# Returns: the result of gather(), with one row per element of `obj` and the
#   columns `key` (element name) and `value`.
unpack_list <- function(obj, key_name) {
  wide <- as.data.frame(obj)
  gather(wide, key)
}

# Build a long table of interaction metrics, one row per CookieID and metric.
df_interaction <- out %>%
  mutate(
    # Step 1: pull the "interaction" element out of each parsed JSON list.
    interaction = map(UnloadVars, function(vars) vars$interaction),
    # Step 2: turn each of those lists into a key/value data frame.
    interaction = map(interaction, function(metrics) unpack_list(metrics, key))
  ) %>%
  # Step 3: expand the list column of data frames into ordinary rows.
  unnest(interaction)

df_interaction

解决方案不是很优雅,但可以完成工作。您可以应用相同的逻辑从可见性中提取信息。