我已经搜索过并找到了许多解决方案,但最终从未完成过。对于有经验的人来说,这可能非常简单......
以下是我的数据片段。这是由包jsonlite从JSON导入自动创建的。数据结构非常好,但我无能为力。 Update2:我在下面添加了相关数据
structure(list(rightsize = c(42L, 50L, 52L, 49L, 41L, 41L, 41L,
41L, 41L, 45L, 47L, 42L, 45L, 46L, 42L, 44L, 44L, 37L, 44L, 41L
), hitlen = c("", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", ""), linegroup = c("_", "_", "_",
"_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_",
"_", "_", "_", "_"), leftsize = c(46L, 43L, 43L, 37L, 49L, 43L,
43L, 45L, 45L, 43L, 44L, 46L, 45L, 46L, 44L, 43L, 54L, 45L, 51L,
47L), leftspace = c(" ", " ", " ",
" ", " ", " ", " ", " ",
" ", " ", " ", " ", " ",
" ", " ", " ", "", " ", " ",
" "), Left = list(structure(list(class = c("", "coll",
""), str = c("patients with ", "chronic", " obstructive pulmonary"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c("respect to ",
"chronic", " obstructive pulmonary")), .Names = c("class", "str"
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c("While there is no cure for this ",
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "strc", "", "coll", ""), str = c(".",
"</p><p>", "When patients with ", "chronic", " liver")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 5L)), structure(list(
class = c("", "coll", ""), str = c("bronchitis , and ", "chronic",
" obstructive pulmonary")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c("offers the possibility that ",
"chronic", " lung")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" , such as ",
"chronic", " obstructive pulmonary")), .Names = c("class", "str"
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c("always as clear in other ",
"chronic", " incurable")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c("may have the potential to prevent ",
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" half the estimated cost of all ",
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c("is consistent with the tact that ",
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c("used to treat ",
"chronic", " obstructive pulmonary")), .Names = c("class", "str"
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c("ingredient for dietary therapy of ",
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c("patients with ",
"chronic", " obstructive pulmonary")), .Names = c("class", "str"
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c("greater for ", "chronic",
" obstructive pulmonary")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" departments , with schemes for ",
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c("postponement of death by means of managing ",
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c("certainly be ",
"chronic", " obstructive pulmonary")), .Names = c("class", "str"
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c("cardiovascular disease , cancer , other ",
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c("terminal illnesses are converted to ",
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L))), Right = list(structure(list(class = "", str = " who may be at risk of developing steroid"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " - plausibly related to exposure to environmental"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " , it can be treated , Black says . Antidepressants"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " ask what they can do to improve their condition"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " [ COPD ] ) was 15 % ( estimated within "), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " is part of the continuum of development"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " ( 70 , 71 ) and sleep apnea . Elevation"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " . Patients with heart failure highlight"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " other than heart disease , and helps us"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " in this country . Furthermore , the portion"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " are multigenic and multifactorial . Therefore"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " . Nasal corticosteroids are increasingly"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " such as diabetes mellitus or hyperlipidemia"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " ( COPD ) concluded exercise relieves dyspnea"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " than for any other disease. 5 The number"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " management in patients with COPD receiving"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " and disability is costly , and it is bound"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = c("", "strc", ""), str = c(" .", "</p><p>", "Much rarer condition , but people"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = "", str = " , and in fact those rates have been rising"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " . The panel 's report is negative about"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L)), Kwic = list(structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " diseases"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "diseases"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "diseases"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "diseases"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "diseases"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "diseases"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "diseases"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L)), toknum = c(580661252L,
585871494L, 572902309L, 596182644L, 611091300L, 604962106L, 605346237L,
585102838L, 575701411L, 616556239L, 548908661L, 604489309L, 548601059L,
617460845L, 585870185L, 591049175L, 581965276L, 592616458L, 592591831L,
599295354L), rightspace = c(" ", " ", "", " ", " ",
" ", " ", " ", " ", " ",
" ", " ", " ", " ", " ", " ",
" ", " ", " ", " "), Tbl_refs = list(
"11.99.0023.006", "11.99.0031.001", "11.99.0012.004", "11.99.0046.013",
"11.99.0069.003", "11.99.0059.007", "11.99.0060.003", "11.99.0030.001",
"11.99.0016.007", "11.99.0077.021", "11.01.0003.015", "11.99.0059.003",
"11.01.0003.006", "11.99.0078.034", "11.99.0031.001", "11.99.0038.005",
"11.99.0025.005", "11.99.0040.006", "11.99.0040.006", "11.99.0051.011"),
ref = c("11.99.0023.006", "11.99.0031.001", "11.99.0012.004",
"11.99.0046.013", "11.99.0069.003", "11.99.0059.007", "11.99.0060.003",
"11.99.0030.001", "11.99.0016.007", "11.99.0077.021", "11.01.0003.015",
"11.99.0059.003", "11.01.0003.006", "11.99.0078.034", "11.99.0031.001",
"11.99.0038.005", "11.99.0025.005", "11.99.0040.006", "11.99.0040.006",
"11.99.0051.011")), .Names = c("rightsize", "hitlen", "linegroup",
"leftsize", "leftspace", "Left", "Right", "Kwic", "toknum", "rightspace",
"Tbl_refs", "ref"), class = "data.frame", row.names = c(NA, 20L
))
我需要做的是1)转换这4个数据帧并将“class”中的值指定为列标题。注意,#1,列数可能不同。另请注意(#2)某些列名称将为“”。因此,the wonderful solution here导致数据帧中的某些列标题都被垃圾填充,使得下一步(数据帧合并)成为不可能,例如,
(被垃圾填充的标题似乎是除第一个之外的那些“”列。)
在该步骤之后,我需要合并这些数据帧,同时考虑缺失值。rbind.fill可以解决问题,但只有在数据足够统一的情况下才能实现。我到处搜索(high & low)解决方案,但尚未找到足以解决此问题的方案。
更新:我继续尝试熔化/铸造。以下内容非常接近可接受的最终解决方案:
# Load reshape2 with library() so a missing package fails here, not later.
library(reshape2)

# Flatten the list of data frames to long form; the L1 column used below
# identifies the list element each row came from.
docx <- melt(documentdata$Left, id.vars = "class")
# One row per list element, one column per distinct `class` value; values
# that share a class within an element are gathered into a list cell.
docx <- dcast(docx, L1 + variable ~ class, fun.aggregate = list)
唯一的问题是,如上所述,空白的“class”值导致结构在dcast时丢失:所有未命名的列最终被合并且顺序错乱,例如
L1 variable Var.3 coll strc
1 1 str patients with , obstructive pulmonary chronic
2 2 str respect to , obstructive pulmonary chronic
3 3 str While there is no cure for this , chronic
4 4 str ., When patients with , liver chronic </p><p>
5 5 str bronchitis , and , obstructive pulmonary chronic
原始数据中关键的“class”值是“coll”,它之前始终至少有一个空白项,之后也有一个空白项。一个解决方案可能是在dcast之前把这些空白项命名为“pre-coll”和“post-coll”?
更新#3:这是一个可能的,虽然丑陋的解决方案。任何“更清洁”的选择?
# Load reshape2 with library() so a missing package fails immediately.
library(reshape2)

# Long form: one row per token; L1 identifies the originating list element.
docx <- melt(documentdata$Left, id.vars = "class")

# Rows holding the collocate, and the rows directly before/after each one.
coll_rows <- which(docx$class %in% "coll")
pre <- coll_rows - 1L
post <- coll_rows + 1L

# A neighbour only counts if it exists and belongs to the same list element
# (same L1). Without this guard a "coll" at the edge of an element would
# relabel a row of the adjacent element, or index outside the table.
in_same_hit <- function(neighbour, anchor) {
  ok <- neighbour >= 1L & neighbour <= nrow(docx)
  ok[ok] <- docx$L1[neighbour[ok]] == docx$L1[anchor[ok]]
  ok
}
pre <- pre[in_same_hit(pre, coll_rows)]
post <- post[in_same_hit(post, coll_rows)]

# Give the blank-class neighbours distinct names so dcast() keeps them in
# separate columns instead of merging every "" class together.
docx$class[pre] <- "l.pre"
docx$class[post] <- "l.post"

docx <- dcast(docx, L1 + variable ~ class, fun.aggregate = list)
docx.left <- docx[, c("l.pre", "coll", "l.post")]
提前感谢您的帮助。
答案 0 :(得分:3)
让我们用dplyr
:
library(dplyr)

# Stack every data frame in the list into one long table, then pick the
# string just before, at, and just after each "coll" row.
documentdata$Left %>%
  do.call(rbind, .) %>%
  do({
    # Row positions of the "coll" entries, computed once and reused.
    coll_at <- which(.[["class"]] == "coll")
    data.frame(
      pre = .[["str"]][coll_at - 1],
      coll = .[["str"]][coll_at],
      post = .[["str"]][coll_at + 1]
    )
  })
pre coll post
1 patients with chronic obstructive pulmonary
2 respect to chronic obstructive pulmonary
3 While there is no cure for this chronic
4 When patients with chronic liver
5 bronchitis , and chronic obstructive pulmonary
6 offers the possibility that chronic lung
....
18 certainly be chronic obstructive pulmonary
19 cardiovascular disease , cancer , other chronic
20 terminal illnesses are converted to chronic
编辑:解释:
dplyr
有一种奇怪的语法。请参阅dplyr vignette或data wrangling cheat sheet。 %>%
是来自magrittr
包的管道:如果右侧是一个函数调用,它会把管道左侧的结果作为该函数的第一个参数传入:
# The left-hand side is inserted as the first argument of the call on the right.
5 %>% c(1)
# same as
c(5, 1)
如果您想在其他地方使用它,可以使用.
来表示左侧的内容。如果您愿意,可以对.
进行子集化(例如.[["str"]]
):
# A `.` argument marks where the left-hand side is placed instead.
5 %>% c(1, .)
# same as
c(1, 5)
do
允许我们进行任何我们想要的计算,而不用担心标准的dplyr
动词 - 它是一个包装器。请参阅?do
。
所以答案采用documentdata$Left
,将其管道化为do.call(rbind, .)
,这会折叠列表(到目前为止,这与do.call(rbind, documentdata$Left)
相同)。我们将其传输到do
,由它创建一个新数据框,其中包含从.
中选择的相关列。