这是我的数据集的一个示例:
# Example data: one row per case. Several allegations and their dispositions
# are packed into single strings, separated by "::".
ID <- c(1, 2, 3, 4)
Allegation <- c("A::B::C::V", "A::C", "A::D", "D::E::D")
Disposition <- c(
  "Open::Closed::Open", "Closed::Closed", "Open::Open", "Closed::Open"
)
# Column names are taken from the variable names.
df <- data.frame(ID, Allegation, Disposition)
ID Allegation Disposition
1 A::B::C::V Open::Closed::Open
2 A::C Closed::Closed
3 A::D Open::Open
4 D::E::D Closed::Open
我想要以下结果:
ID Allegation Disposition Allegation_detail Disposition_detail
1 A::B::C::V Open::Closed::Open A Open
1 A::B::C::V Open::Closed::Open B Closed
1 A::B::C::V Open::Closed::Open C Open
1 A::B::C::V Open::Closed::Open V NA
2 A::C Closed::Closed A Closed
我尝试先用 melt 把数据从宽格式转换为长格式,然后再合并,但没有得到想要的输出。
到目前为止,这是我的方法:
# Number of allegations per row: count the "::" separators and add one.
df$num_allegations <- 1 + str_count(as.character(df$Allegation), "::")
# Largest number of allegations found in any row (4 for the sample data).
max(df$num_allegations)

# Split each packed column once, then store the k-th piece of every row in its
# own column; rows with fewer than k pieces get NA.
allegation_parts <- strsplit(as.character(df$Allegation), "::", fixed = TRUE)
for (k in 1:4) {
  df[[paste0("Allegation", k)]] <- sapply(allegation_parts, `[`, k)
}
disposition_parts <- strsplit(as.character(df$Disposition), "::", fixed = TRUE)
for (k in 1:4) {
  df[[paste0("Disposition", k)]] <- sapply(disposition_parts, `[`, k)
}

# Reshape the allegation columns (5-8) and the disposition columns (9-12) to
# long format separately, keeping the first four columns as identifiers.
id_cols <- c("ID", "Allegation", "Disposition", "num_allegations")
dfmelt1 <- melt(df[, 1:8], id.vars = id_cols)
dfmelt2 <- melt(df[, c(1:4, 9:12)], id.vars = id_cols)
# Rename the melted columns of the second table so they do not clash with the
# first table's "variable"/"value" columns.
colnames(dfmelt2) <- c(id_cols, "variable2", "value2")
但是当我合并数据时,我得到了这个结果,这不是我想要的结果:
# Join the two long tables on the shared identifier columns. merge() pairs every
# dfmelt1 row with every dfmelt2 row that has the same key, so each allegation
# ends up combined with every disposition of the same ID.
merge(
  x = dfmelt1,
  y = dfmelt2,
  by = c("ID", "Allegation", "Disposition", "num_allegations")
)
ID Allegation Disposition num_allegations variable value variable2 value2
1 A::B::C::V Open::Closed::Open 4 Allegation1 A Disposition1 Open
1 A::B::C::V Open::Closed::Open 4 Allegation1 A Disposition2 Closed
1 A::B::C::V Open::Closed::Open 4 Allegation1 A Disposition3 Open
1 A::B::C::V Open::Closed::Open 4 Allegation1 A Disposition4 <NA>
1 A::B::C::V Open::Closed::Open 4 Allegation2 B Disposition1 Open
1 A::B::C::V Open::Closed::Open 4 Allegation2 B Disposition2 Closed
1 A::B::C::V Open::Closed::Open 4 Allegation2 B Disposition3 Open
1 A::B::C::V Open::Closed::Open 4 Allegation2 B Disposition4 <NA>
1 A::B::C::V Open::Closed::Open 4 Allegation3 C Disposition1 Open
1 A::B::C::V Open::Closed::Open 4 Allegation3 C Disposition2 Closed
1 A::B::C::V Open::Closed::Open 4 Allegation3 C Disposition3 Open
1 A::B::C::V Open::Closed::Open 4 Allegation3 C Disposition4 <NA>
1 A::B::C::V Open::Closed::Open 4 Allegation4 V Disposition1 Open
1 A::B::C::V Open::Closed::Open 4 Allegation4 V Disposition2 Closed
1 A::B::C::V Open::Closed::Open 4 Allegation4 V Disposition3 Open
1 A::B::C::V Open::Closed::Open 4 Allegation4 V Disposition4 <NA>
2 A::C Closed::Closed 2 Allegation1 A Disposition1 Closed
我该如何合并,才能让 Disposition1 只与 Allegation1 对应(Disposition2 只与 Allegation2 对应,依此类推)?
谢谢!
答案 0 :(得分:0)
这是一个想法,
# Expand df to one row per allegation and attach the allegation and the
# disposition found at the same position within the original row.
#
# Both packed columns are split once on the literal "::" separator
# (fixed = TRUE: the separator is not treated as a regular expression).
allegation_parts <- strsplit(as.character(df$Allegation), "::", fixed = TRUE)
disposition_parts <- strsplit(as.character(df$Disposition), "::", fixed = TRUE)

#get a vector with repeats for expanding the data.frame
# Counting the split pieces keeps the count correct even when an allegation
# contains spaces or punctuation.
ind <- lengths(allegation_parts)
new_df <- df[rep(seq_len(nrow(df)), ind), ]

#create vector with allegation details
v1 <- as.character(unlist(allegation_parts, use.names = FALSE))

#create vector with Disposition details
# Pairing is done row by row: for each original row take exactly as many
# dispositions as there are allegations. Indexing past the end yields NA, so
# missing dispositions are padded and surplus ones are dropped, and values can
# never be borrowed from a different row.
v2 <- as.character(unlist(
  Map(function(parts, n) parts[seq_len(n)], disposition_parts, ind),
  use.names = FALSE
))

#construct final data frame
final_df <- data.frame(new_df, Allegation_detail = v1, Disposition_detail = v2,
                       stringsAsFactors = FALSE, row.names = NULL)
final_df
# ID Allegation Disposition Allegation_detail Disposition_detail
#1 1 A::B::C::V Open::Closed::Open A Open
#2 1 A::B::C::V Open::Closed::Open B Closed
#3 1 A::B::C::V Open::Closed::Open C Open
#4 1 A::B::C::V Open::Closed::Open V <NA>
#5 2 A::C Closed::Closed A Closed
#6 2 A::C Closed::Closed C Closed
#7 3 A::D Open::Open A Open
#8 3 A::D Open::Open D Open
#9 4 D::E::D Closed::Open D Closed
#10 4 D::E::D Closed::Open E Open
#11 4 D::E::D Closed::Open D <NA>
答案 1 :(得分:0)
这是一个使用data.table的解决方案,但逻辑上它类似于你的算法
library(data.table)
library(stringi)
# Convert df to a data.table by reference so the data.table syntax below works.
setDT(df)
# Helper: split x on the literal "::" and flatten the resulting matrix of
# pieces into a plain vector.
splitter <- function(x) as.vector(stri_list2matrix(stri_split_fixed(x, "::")))
#find the max parts for padding NA at the end
#http://stackoverflow.com/questions/17804389/pad-each-element-in-a-list-to-specific-length-in-r
# For each ID, store the largest number of pieces found in any column as a new
# helper column Len.
df[, Len:=max(lengths(lapply(.SD, splitter))), by="ID"]
#split using ::
# Split every column per ID and pad each result with NA up to Len so all
# columns have equal length. .SD now also contains the helper column Len, which
# gets split as well and is removed again at the end of the chain.
# NOTE(review): `length(ans) <- Len` needs a single value, so this appears to
# assume each ID occurs in exactly one row - confirm for real data.
parsedDF <- df[, lapply(.SD, function(x) {
ans <- splitter(x)
length(ans) <- Len
ans
}), by="ID"][,
Len:=NULL]
# Suffix every column of the long table with "_detail" (ID becomes ID_detail).
setnames(parsedDF, names(parsedDF), paste0(names(parsedDF),"_detail"))
#join back with original data.table
# One output row per parsedDF row, matched to df on ID; the helper column Len
# is dropped from the joined result.
df[parsedDF, on=c("ID"="ID_detail")][,
Len:=NULL]
## ID Allegation Disposition Allegation_detail Disposition_detail
## 1: 1 A::B::C::V Open::Closed::Open A Open
## 2: 1 A::B::C::V Open::Closed::Open B Closed
## 3: 1 A::B::C::V Open::Closed::Open C Open
## 4: 1 A::B::C::V Open::Closed::Open V NA
## 5: 2 A::C Closed::Closed A Closed
## 6: 2 A::C Closed::Closed C Closed
## 7: 3 A::D Open::Open A Open
## 8: 3 A::D Open::Open D Open
## 9: 4 D::E::D Closed::Open D Closed
## 10: 4 D::E::D Closed::Open E Open
## 11: 4 D::E::D Closed::Open D NA