我正在使用RStudio 2.15.0,并使用XLConnect创建了一个包含3000多行和12列的对象。我想把某一列中用分隔符连接的值拆分成多行,但不知道这是否可行,也不知道该如何实现。下面是一个只包含相关3列的数据示例。非常感谢任何这方面的帮助。
适用于其中两列的代码如下。
# For each Description, join that group's PolId strings with ";" and run a
# regex substitution that is intended to drop an ID (plus its ";") whenever the
# same ID appears again later in the joined string. The result `v1` holds one
# ";"-separated string per Description, named by Description.
v1 <- with(df, tapply(PolId, Description, FUN= function(x) {
x1 <- paste(x, collapse=";")
gsub('(\\b\\S+\\b)(?=.*\\b\\1\\b.*);', '', x1, perl=TRUE)}))
library(stringr)
# Repeat each Description name once per '\\w+' match found in its string.
Description <- rep(names(v1), str_count(v1, '\\w+'))
# Replace runs of ";" with a space and read the whitespace-separated tokens
# back in as character values.
PolId <- scan(text=gsub(';+', ' ', v1), what='', quiet=TRUE)
# Combine the two vectors into the long-format result (auto-printed).
data.frame(PolId, Description)
示例数据
PolId Description Document.Type
ABC123;ABC456;ABC789; TEST1 Pol1
ABC123;ABC456;ABC789; TEST1 Pol1
ABC123;ABC456;ABC789; TEST1 Pol1
AAA123; TEST1 End1
AAA123; TEST2 End2
ABB123;ABC123; TEST3 End1
ABB123;ABC123; TEST3 End1
我希望输出像这样(替换重复的Polid)
PolId Description Document.Type
ABC123 TEST1 Pol1
ABC456 TEST1 Pol1
ABC789 TEST1 Pol1
AAA123 TEST1 End1
AAA123 TEST2 End2
ABB123 TEST3 End1
ABC123 TEST3 End1
答案 0 :(得分:7)
这是一个基础R的解决方案。使用strsplit拆分PolId字段,并用cbind把每个拆分结果与相应的Description组合起来。这样会得到一个矩阵列表,再用rbind把它们合并在一起。最后设置列名。
# Split every PolId on ";" and column-bind each vector of pieces with the
# Description of the same row (the single Description is recycled), giving one
# character matrix per input row.
per_row <- mapply(cbind, strsplit(DF$PolId, ";"), DF$Description,
                  SIMPLIFY = FALSE)
# Stack the per-row matrices into one matrix and reuse DF's column names.
out <- do.call(rbind, per_row)
colnames(out) <- names(DF)
结果如下:
> out
PolId Description
[1,] "ABC123" "TEST1"
[2,] "ABC456" "TEST1"
[3,] "ABC789" "TEST1"
[4,] "ABC123" "TEST1"
[5,] "ABC456" "TEST1"
[6,] "ABC789" "TEST1"
[7,] "ABC123" "TEST1"
[8,] "ABC456" "TEST1"
[9,] "ABC789" "TEST1"
[10,] "AAA123" "TEST1"
[11,] "AAA123" "TEST2"
[12,] "ABB123" "TEST3"
[13,] "ABC123" "TEST3"
[14,] "ABB123" "TEST3"
[15,] "ABC123" "TEST3"
注意:我们将此作为输入:
# Example input: seven rows whose PolId values are ";"-terminated lists of IDs,
# each paired with a Description label. Both columns are kept as character
# vectors (stringsAsFactors = FALSE makes that explicit on older R versions).
DF <- data.frame(
  PolId = c(
    "ABC123;ABC456;ABC789;", "ABC123;ABC456;ABC789;", "ABC123;ABC456;ABC789;",
    "AAA123;", "AAA123;", "ABB123;ABC123;", "ABB123;ABC123;"
  ),
  Description = c(
    "TEST1", "TEST1", "TEST1", "TEST1", "TEST2", "TEST3", "TEST3"
  ),
  stringsAsFactors = FALSE
)
答案 1 :(得分:5)
下面是一个使用data.table的快速解决方案
library(data.table)
# Convert df to a data.table by reference, split each PolId on ";" within its
# Description group, flatten the pieces into a single PolId column, and keep
# only the unique rows of the result.
unique(
  setDT(df)[
    ,
    list(PolId = unlist(strsplit(as.character(PolId), ";"))),
    by = Description
  ]
)
# Description PolId
# 1: TEST1 ABC123
# 2: TEST1 ABC456
# 3: TEST1 ABC789
# 4: TEST1 AAA123
# 5: TEST2 AAA123
# 6: TEST3 ABB123
# 7: TEST3 ABC123
根据您的编辑 - 另一个选项(如果您有两列以上)
library(splitstackshape)
# Split the ";"-delimited PolId column into one row per ID (long format),
# carrying the remaining columns along, then drop duplicated rows.
unique(cSplit(df, splitCols = "PolId", sep = ";", direction = "long"))
# PolId Description Document.Type
# 1: ABC123 TEST1 Pol1
# 2: ABC456 TEST1 Pol1
# 3: ABC789 TEST1 Pol1
# 4: AAA123 TEST1 End1
# 5: AAA123 TEST2 End2
# 6: ABB123 TEST3 End1
# 7: ABC123 TEST3 End1
答案 2 :(得分:3)
您可以在拆分"PolId"列之后,尝试使用tidyr中的unnest,然后用unique获取不重复的行
library(dplyr)
library(tidyr)
# unnest() in current tidyr operates on list-columns of a data frame and no
# longer accepts a bare named list, so PolId is first turned into a list-column.
# Base strsplit() drops the empty piece after each trailing ";", and
# as.character() guards against PolId having been read in as a factor.
# Any other columns of df are carried along unchanged.
df %>%
  mutate(PolId = strsplit(as.character(PolId), ";")) %>%
  unnest(PolId) %>%
  unique()
或者使用base R中的stack/strsplit/duplicated。按分隔符(;)拆分"PolId"列(strsplit),用"Description"列为输出列表的各个元素命名,再用stack把该列表转换为'data.frame',最后用duplicated删除重复的行。
# Split each PolId on ";" and label every resulting vector of pieces with its
# row's Description, so that stack() yields one (value, label) row per ID.
df1 <- local({
  pieces <- strsplit(df$PolId, ';')
  names(pieces) <- df$Description
  stack(pieces)
})
# Drop duplicated rows and rename the columns after df's columns (auto-printed).
setNames(df1[!duplicated(df1), ], names(df))
# PolId Description
#1 ABC123 TEST1
#2 ABC456 TEST1
#3 ABC789 TEST1
#10 AAA123 TEST1
#11 AAA123 TEST2
#12 ABB123 TEST3
#13 ABC123 TEST3
或不使用strsplit
# For each Description, join that group's PolId strings with ";" and run a
# regex substitution that is intended to drop an ID (plus its ";") whenever the
# same ID appears again later in the joined string. The result `v1` holds one
# ";"-separated string per Description, named by Description.
v1 <- with(df, tapply(PolId, Description, FUN= function(x) {
x1 <- paste(x, collapse=";")
gsub('(\\b\\S+\\b)(?=.*\\b\\1\\b.*);', '', x1, perl=TRUE)}))
library(stringr)
# Repeat each Description name once per '\\w+' match found in its string.
Description <- rep(names(v1), str_count(v1, '\\w+'))
# Replace runs of ";" with a space and read the whitespace-separated tokens
# back in as character values.
PolId <- scan(text=gsub(';+', ' ', v1), what='', quiet=TRUE)
# Combine the two vectors into the long-format result (auto-printed).
data.frame(PolId, Description)
# PolId Description
#1 ABC123 TEST1
#2 ABC456 TEST1
#3 ABC789 TEST1
#4 AAA123 TEST1
#5 AAA123 TEST2
#6 ABB123 TEST3
#7 ABC123 TEST3