我知道这个主题已被广泛覆盖,但我没有找到适合我案例的代码......我有一个这种类型的数据框:
V1 V2 V3
1: label1 alias_fr alias_fr
2: label1 triplet triplet
3: label1 Q9327 Q3122270
4: label2 NULL NULL
5: label3 alias_fr NULL
6: label3 triplet NULL
7: label3 Q678 NULL
该数据框是把JSON输出与原始数据框input列中的查询逐一配对后生成的:
# Pair each input query with its API result by stacking the two with rbind.
# (Map(c, ...) was tried first, but its result seemed harder to reshape.)
df <- Map(rbind, originalDF$input, out)
# Transpose every pair into a data frame (keeping strings as strings), then
# stack all of them with rbind.fill.
df <- rbind.fill(
  lapply(df, function(pair) {
    as.data.frame(t(pair), stringsAsFactors = FALSE)
  })
)
class(df)
[1] "data.frame"
上面给出的示例是简化过的:实际数据不止3列,而且有些值是列表。不过,只要某个标签有非NULL值,它在每一列中的行数总是相同的(在我的例子中为3行:alias_fr、triplet、Qxx)。
我希望每个V1值都有V2和V3值:
V1 var1 var2 var3
label1 alias_fr triplet Q9327
label1 alias_fr triplet Q3122270
label2 NULL NULL NULL
label3 alias_fr triplet Q678
我先尝试了melt:melt(df,id="V1"),但之后就卡住了。
我也尝试过reshape、cast、dcast,都没有成功,而且我对这些重塑函数越来越糊涂......如果有重塑高手能帮忙,我会非常感激;)
[编辑] :真实对象以澄清我的问题
Ok so this is an extract of the real dataset I’m working with:
# Original dataset: a character vector of three query strings
# (in the real data this is one column of a larger dataset).
originalDF <- c("Guy de Maupassant", "J.-J. Goldman", "Poitou-Charentes")
# Output of the API query built from the text in originalDF.
# `out` is a list with one element per query, in the same order:
#   1. a 2-row data.frame (two hits, ids Q9327 and Q3122270),
#   2. an empty list() (no hit),
#   3. a 1-row data.frame (one hit, id Q17009).
# Some columns are list-columns (`sort`, `_source.types`, `_source.subTypes`)
# whose cells hold nested data.frames or NULL; the third element has no
# `_source.subTypes` column at all.
out <- list(structure(list(`_index` = c("alias_fr", "alias_fr"), `_type` = c("triplet",
"triplet"), `_id` = c("Q9327", "Q3122270"), `_score` = c(NA,
NA), sort = list(-4.95263021255079, -6.65910164747673), `_source.types` = list(
structure(list(id = c("Q5", "dbPedia.Person"), value = c("être humain",
"personne")), .Names = c("id", "value"), class = "data.frame", row.names = 1:2),
structure(list(id = c("Q11424", "dbPedia.Film"), value = c("film",
"film")), .Names = c("id", "value"), class = "data.frame", row.names = 1:2)),
`_source.pageRank` = c(-4.95263021255079, -6.65910164747673
), `_source.subTypes` = list(structure(list(id = c("Q1930187",
"Q36180", "Q15949613", "Q6625963", "Q214917"), value = c("journaliste",
"écrivain", "nouvelliste", "romancier", "dramaturge")), .Names = c("id",
"value"), class = "data.frame", row.names = c(NA, 5L)), NULL),
`_source.label` = c("Guy de Maupassant", "Guy de Maupassant"
), `_source.id` = c("Q9327", "Q3122270")), .Names = c("_index",
"_type", "_id", "_score", "sort", "_source.types", "_source.pageRank",
"_source.subTypes", "_source.label", "_source.id"), class = "data.frame", row.names = 1:2),
list(), structure(list(`_index` = "alias_fr", `_type` = "triplet",
`_id` = "Q17009", `_score` = NA, sort = list(-5.0448283638424),
`_source.types` = list(structure(list(id = "Q22670030",
value = "ancienne région française"), .Names = c("id",
"value"), class = "data.frame", row.names = 1L)), `_source.pageRank` = -5.0448283638424,
`_source.label` = "Poitou-Charentes", `_source.id` = "Q17009"), .Names = c("_index",
"_type", "_id", "_score", "sort", "_source.types", "_source.pageRank",
"_source.label", "_source.id"), class = "data.frame", row.names = 1L))
# df object (generated from Map, then rbind.fill), reproduced as a literal.
# It is a 20-row data.frame whose four columns V1..V4 are all list-columns:
#   V1 - the query label: rows 1-10 "Guy de Maupassant", row 11
#        "J.-J. Goldman", rows 12-20 "Poitou-Charentes".
#   V2 - field values for the first hit of each label (NULL for the label
#        without hits); cells mix strings, numbers and nested data.frames.
#   V3 - field values for the second hit; only "Guy de Maupassant" has one,
#        every other cell is NULL.
#   V4 - NULL in every row.
df <- structure(list(V1 = list("Guy de Maupassant", "Guy de Maupassant",
"Guy de Maupassant", "Guy de Maupassant", "Guy de Maupassant",
"Guy de Maupassant", "Guy de Maupassant", "Guy de Maupassant",
"Guy de Maupassant", "Guy de Maupassant", "J.-J. Goldman",
"Poitou-Charentes", "Poitou-Charentes", "Poitou-Charentes",
"Poitou-Charentes", "Poitou-Charentes", "Poitou-Charentes",
"Poitou-Charentes", "Poitou-Charentes", "Poitou-Charentes"),
V2 = list("alias_fr", "triplet", "Q9327", NA_character_,
-4.95263021255079, structure(list(id = c("Q5", "dbPedia.Person"
), value = c("être humain", "personne")), .Names = c("id",
"value"), class = "data.frame", row.names = 1:2), "-4.95263021255079",
structure(list(id = c("Q1930187", "Q36180", "Q15949613",
"Q6625963", "Q214917"), value = c("journaliste", "écrivain",
"nouvelliste", "romancier", "dramaturge")), .Names = c("id",
"value"), class = "data.frame", row.names = c(NA, 5L)),
"Guy de Maupassant", "Q9327", NULL, "alias_fr", "triplet",
"Q17009", NA_character_, -5.0448283638424, structure(list(
id = "Q22670030", value = "ancienne région française"), .Names = c("id",
"value"), class = "data.frame", row.names = 1L), "-5.0448283638424",
"Poitou-Charentes", "Q17009"), V3 = list("alias_fr",
"triplet", "Q3122270", NA_character_, -6.65910164747673,
structure(list(id = c("Q11424", "dbPedia.Film"), value = c("film",
"film")), .Names = c("id", "value"), class = "data.frame", row.names = 1:2),
"-6.65910164747673", NULL, "Guy de Maupassant", "Q3122270",
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL), V4 = list(NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL)), .Names = c("V1", "V2",
"V3", "V4"), row.names = c(NA, 20L), class = "data.frame")
答案 0 :(得分:0)
对于您的特定示例(每个标签最多包含三行),一种方法是
# library() stops with an error when dplyr is missing; require() would only
# return FALSE and let the script fail later with a less helpful message.
library(dplyr)

# Toy data: `label` identifies the group, `id1` and `id2` each hold up to
# three values per label. Column names differ from the question's, on the
# impression that each column belongs to a separate measurement.
df <- data.frame(
  label = c(rep("a", 3), "b", rep("c", 3)),
  id1 = c(1, 2, 3, NA, 1, 2, 4),
  id2 = c(1, 2, 5, NA, NA, NA, NA)
)

# Split df into one data frame per value column.
df1 <- select(df, label, id1)
df2 <- select(df, label, id2)

# Within each label, spread the first three values into var1..var3.
# Indexing past the end of a group's values yields NA, so labels with fewer
# than three rows are padded with NA instead of being dropped.
df1a <- df1 %>%
  group_by(label) %>%
  summarise(var1 = id1[1], var2 = id1[2], var3 = id1[3])
df2b <- df2 %>%
  group_by(label) %>%
  summarise(var1 = id2[1], var2 = id2[2], var3 = id2[3])

# Stack the per-column results: one row per label and original value column.
df_final <- rbind(df1a, df2b)
> df_final
# A tibble: 6 x 4
label var1 var2 var3
<fctr> <dbl> <dbl> <dbl>
1 a 1 2 3
2 b NA NA NA
3 c 1 2 4
4 a 1 2 5
5 b NA NA NA
6 c NA NA NA
我知道这种做法既不优雅也不通用,因为需要手动为每个标签中的每一行(无论df中是否存在)指定一个新列,但它应该适用于你的例子。