我正在尝试提取一段JSON数据,它是CSV文件中的一列。到目前为止,我已经能够以正确的格式提取该列,但只有当变量类型为因子时格式才是正确的。但我无法使用jsonlite
包将因子转换为json文件。
[1] {"id":509746197991998767,"visibility":{"percentage":100,"time":149797,"visible1":true,"visible2":false,"visible3":false,"activetab":true},"interaction":{"mouseovercount":1,"mouseovertime":1426,"videoplaytime":0,"engagementtime":0,"expandtime":0,"exposuretime":35192}}
另一种方法是在导入时使用stringsAsFactors = F
,但这样我很难得到正确的格式,每个条目看起来像这样:
[1] "{\"id\":509746197991998767,\"visibility\":{\"percentage\":100,\"time\":149797,\"visible1\":true,\"visible2\":false,\"visible3\":false,\"activetab\":true},\"interaction\":{\"mouseovercount\":1,\"mouseovertime\":1426,\"videoplaytime\":0,\"engagementtime\":0,\"expandtime\":0,\"exposuretime\":35192}}"
我错过了一些明显的东西吗?我只是想要提取CSV文件中的JSON
文件。
下面是CSV文件的一个小例子:
"","CookieID","UnloadVars"
"1",-8857188784608690176,"{""id"":509746197991998767,""visibility"":{""percentage"":100,""time"":149797,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":1,""mouseovertime"":1426,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":35192}}"
"2",-1695626857458244096,"{""id"":2917654329769114342,""visibility"":{""percentage"":46,""time"":0,""visible1"":false,""visible2"":false,""visible3"":false,""activetab"":true}}"
"3",437299165071669184,"{""id"":2252707957388071809,""visibility"":{""percentage"":99,""time"":10168,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":0,""mouseovertime"":0,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":542},""clicks"":[{""x"":105,""y"":449}]}"
"4",292660729552227520,""
"5",7036383942916227072,"{""id"":2299674593327687292,""visibility"":{""percentage"":76,""time"":1145,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":0,""mouseovertime"":0,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":74},""clicks"":[{""x"":197,""y"":135},{""x"":197,""y"":135}]}"
此致
弗雷德里克。
答案 0 :(得分:1)
# Build the example data frame directly from the CSV text shown in the
# question (the single-quoted string below is the literal file content).
df <- readr::read_csv('"","CookieID","UnloadVars"
"1",-8857188784608690176,"{""id"":509746197991998767,""visibility"":{""percentage"":100,""time"":149797,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":1,""mouseovertime"":1426,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":35192}}"
"2",-1695626857458244096,"{""id"":2917654329769114342,""visibility"":{""percentage"":46,""time"":0,""visible1"":false,""visible2"":false,""visible3"":false,""activetab"":true}}"
"3",437299165071669184,"{""id"":2252707957388071809,""visibility"":{""percentage"":99,""time"":10168,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":0,""mouseovertime"":0,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":542},""clicks"":[{""x"":105,""y"":449}]}"
"4",292660729552227520,""
"5",7036383942916227072,"{""id"":2299674593327687292,""visibility"":{""percentage"":76,""time"":1145,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":0,""mouseovertime"":0,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":74},""clicks"":[{""x"":197,""y"":135},{""x"":197,""y"":135}]}"',
# Compact column spec: "-" skips the unnamed row-number column, and "cc" reads
# CookieID and UnloadVars as character, so the 19-digit CookieID values are
# kept as text instead of being converted to numbers.
col_types = "-cc")
在每个单独的值上使用jsonlite::fromJSON
,然后tidyr::unnest
library(dplyr)
# Convert one UnloadVars cell into a data frame.
#
# .x: a single value from the UnloadVars column (a JSON string, "" or NA).
# Returns: for NA or "" a data frame with one row and no columns
#   (presumably so the record is kept, filled with NA, once the results are
#   unnested); otherwise the parsed JSON coerced with as.data.frame().
f <- function(.x) {
  is_blank <- is.na(.x) || .x == ""
  if (is_blank) {
    return(data.frame()[1, ])
  }
  parsed <- jsonlite::fromJSON(.x)
  as.data.frame(parsed)
}
# Parse every UnloadVars string with f() into a list column of data frames,
# then expand that list column into regular columns (one output row per row
# of each parsed data frame).
# The list column is created with mutate() first and unnested by name:
# the older `unnest(col = expr)` form is deprecated since tidyr 1.0.0, while
# this two-step form works with both old and new tidyr versions.
df %>%
  mutate(UnloadVars = lapply(UnloadVars, f)) %>%
  tidyr::unnest(UnloadVars) %>%
  # Store every column whose name ends in "id" as character.
  mutate_at(vars(ends_with("id")), as.character)
# A tibble: 6 x 16
# CookieID id visibility.percentage visibility.time visibility.visible1 visibility.visible2 visibility.visible3 visibility.activetab interaction.mouseovercount interaction.mouseovertime interaction.videoplaytime interaction.engagementtime interaction.expandtime interaction.exposuretime clicks.x clicks.y
# <chr> <chr> <int> <int> <lgl> <lgl> <lgl> <lgl> <int> <int> <int> <int> <int> <int> <int> <int>
# 1 -8857188784608690176 509746197991998784 100 149797 TRUE FALSE FALSE TRUE 1 1426 0 0 0 35192 NA NA
# 2 -1695626857458244096 2917654329769114112 46 0 FALSE FALSE FALSE TRUE NA NA NA NA NA NA NA NA
# 3 437299165071669184 2252707957388071936 99 10168 TRUE FALSE FALSE TRUE 0 0 0 0 0 542 105 449
# 4 292660729552227520 <NA> NA NA NA NA NA NA NA NA NA NA NA NA NA NA
# 5 7036383942916227072 2299674593327687168 76 1145 TRUE FALSE FALSE TRUE 0 0 0 0 0 74 197 135
# 6 7036383942916227072 2299674593327687168 76 1145 TRUE FALSE FALSE TRUE 0 0 0 0 0 74 197 135
答案 1 :(得分:0)
我使用readr :: read_csv读取您的示例数据集。
> df <- readr::read_csv('~/sample.csv')
Parsed with column specification:
cols(
CookieID = col_double(),
UnloadVars = col_character()
)
正如您所看到的,UnloadVars是作为字符而非因子读入的。如果我现在检查UnloadVars列中的第一个值,会看到以下内容,与您得到的结果一致:
> df$UnloadVars[1]
[1] "{\"id\":509746197991998767,\"visibility\":{\"percentage\":100,\"time\":149797,\"visible1\":true,\"visible2\":false,\"visible3\":false,\"activetab\":true},\"interaction\":{\"mouseovercount\":1,\"mouseovertime\":1426,\"videoplaytime\":0,\"engagementtime\":0,\"expandtime\":0,\"exposuretime\":35192}}"
现在,我使用jsonlite :: fromJSON,
> j <- jsonlite::fromJSON(df$UnloadVars[1])
> j
$id
[1] 5.097462e+17
$visibility
$visibility$percentage
[1] 100
$visibility$time
[1] 149797
$visibility$visible1
[1] TRUE
$visibility$visible2
[1] FALSE
$visibility$visible3
[1] FALSE
$visibility$activetab
[1] TRUE
$interaction
$interaction$mouseovercount
[1] 1
$interaction$mouseovertime
[1] 1426
$interaction$videoplaytime
[1] 0
$interaction$engagementtime
[1] 0
$interaction$expandtime
[1] 0
$interaction$exposuretime
[1] 35192
我认为这正是你需要的,因为JSON已被解析为R中的列表。
答案 2 :(得分:0)
处理JSON数据可能非常棘手。作为一般指导,您应始终努力将数据放在数据框中。然而,这并非总是可行的。在特定情况下,我没有看到在格式良好的数据框中同时拥有visibility
和interaction
值的方法。
接下来我要做的是将interaction
中的信息提取到数据框中。
加载所需的包并读取数据
library(purrr)
library(dplyr)
library(tidyr)
# Read the sample CSV; stringsAsFactors = FALSE keeps the JSON column as
# plain character strings rather than factors.
# NOTE(review): CookieID is presumably read as a double here, so 19-digit
# values may not be represented exactly — confirm if exact IDs matter.
df <- read.csv("sample.csv", stringsAsFactors = FALSE)
然后删除无效的JSON
# Keep only the rows whose UnloadVars holds a JSON string; rows with an empty
# string (the 4th row of the sample) are dropped.
df <- dplyr::filter(df, UnloadVars != "")
将每个JSON转换为列表并将其放入UnloadVars
列。如果您不知道,可以在数据框中包含列表列。这非常有用。
# Parse every JSON string into an R list and keep the results in a list
# column, next to the CookieID each one belongs to.
# The table is built in one step instead of being grown row by row:
#   * a `1:nrow(df)` loop counts down (1, 0) when df has no rows,
#   * calling bind_rows() on every iteration re-copies all accumulated rows,
#   * data_frame() is deprecated in favour of tibble().
out <- tibble(
  CookieID = df$CookieID,
  UnloadVars = lapply(df$UnloadVars, jsonlite::fromJSON)
)
out
我们现在可以从UnloadVars
中的列表中提取ID。这很简单,因为每个列表只有一个ID。
# Add the top-level `id` of each parsed list as a character column.
# The id is converted explicitly: fromJSON() parses it as a number, and
# letting map_chr() coerce a double to character implicitly is deprecated
# since purrr 1.0.0. scientific = FALSE writes the digits out in full
# instead of exponent notation such as 5.097462e+17.
# NOTE(review): the number was already parsed as a double, so the trailing
# digits of very large ids may differ from the original JSON text.
out <- out %>%
  mutate(id = map_chr(UnloadVars, ~ format(.x$id, scientific = FALSE)))
这最后一部分看起来有点令人生畏。但我在这里做的是从UnloadVars
列中获取交互部分并将其放入interaction
列。然后,我将interaction
(列表)中的每一行转换为包含两列的数据框:key
和value
。 key
包含互动指标的名称及其value
的值。最后我对它调用unnest(展开),这样就去掉了列表列,得到一个格式良好的数据框。
# Reshape a (possibly nested) list into a long two-column data frame.
#
# obj:      the list to reshape; it is first coerced with as.data.frame().
# key_name: not used by the body; kept so existing calls keep working.
# Returns:  the result of gather(): a `key` column holding the former column
#           names and a `value` column holding their values.
unpack_list <- function(obj, key_name) {
  wide <- as.data.frame(obj)
  gather(wide, key)
}
# Build a long table of interaction metrics: take the `interaction` element
# of each parsed JSON list, reshape it into key/value rows with
# unpack_list(), and expand the resulting list column with unnest().
# The second argument of unpack_list() is passed as the string "key": the
# previous bare symbol `key` was not defined anywhere and only worked
# because unpack_list() never evaluates that argument.
df_interaction <- out %>%
  mutate(
    interaction = map(UnloadVars, ~ unpack_list(.x$interaction, "key"))
  ) %>%
  unnest(interaction)
df_interaction
解决方案不是很优雅,但可以完成工作。您可以应用相同的逻辑从可见性中提取信息。