我有一个数据框。我在叫他麻烦
> head(trouble)
ID Category
1 1.NA.NA.NA.NA A
2 1.1.NA.NA.TOTAL Total under A
3 1.1.1.NA.TOTAL Of Which in 1s period
4 14.NA.NA.NA.NA B
5 14.1.NA.NA.TOTAL No as B1
6 14.10.NA.NA.NA And B2
我想使用隐藏在故障$ ID(变量)中的分层信息。 仔细看看!
> head(look[,c("ID.1", "Category", "Group")],6)
ID.1 Category Group
1 1.NA.NA.NA A <NA>
2 1.1.NA.NA Total under A TOTAL
3 1.1.1.NA Of Which in 1s period TOTAL
4 14.NA.NA.NA B <NA>
5 14.1.NA.NA No as B1 TOTAL
6 14.10.NA.NA And B2 <NA>
以上是最初的麻烦$ ID在最后一个分隔符(“。”)上被分开并重命名为ID&amp;基。
现在,我可以手动浏览trouble
列,将其转换为以下内容:
ID CategoryI CategoryII CategoryIII Group
1.NA.NA.NA A <NA> <NA> <NA>
1.1.NA.NA A Total under A <NA> TOTAL
1.1.1.NA A Total under A Of Which in 1s period TOTAL
So my question is:
我该如何自动执行此操作?
Samples:
以下是访问示例trouble
&amp;的链接。他的output
PS:这不仅仅是将单个列拆分为多个列。请不要混淆。
答案 0 :(得分:0)
library(magrittr)
trouble <- read.table(text="ID Category
1 1.NA.NA.NA.NA A
2 1.1.NA.NA.TOTAL 'Total under A'
3 1.1.1.NA.TOTAL 'Of Which in 1s period'
4 14.NA.NA.NA.NA B
5 14.1.NA.NA.TOTAL 'No as B1'
6 14.10.NA.NA.NA 'And B2'",stringsAsFactors = FALSE,header=TRUE)
look <-
trouble$ID %>%
strsplit("\\.") %>%
lapply(function(x){c(paste(x[1:4],collapse="."),x[5])}) %>%
do.call(rbind,.) %>%
as.data.frame %>%
setNames(c("ID.1","Group")) %>%
cbind(trouble,.)
# ID Category ID.1 Group
# 1 1.NA.NA.NA.NA A 1.NA.NA.NA NA
# 2 1.1.NA.NA.TOTAL Total under A 1.1.NA.NA TOTAL
# 3 1.1.1.NA.TOTAL Of Which in 1s period 1.1.1.NA TOTAL
# 4 14.NA.NA.NA.NA B 14.NA.NA.NA NA
# 5 14.1.NA.NA.TOTAL No as B1 14.1.NA.NA TOTAL
# 6 14.10.NA.NA.NA And B2 14.10.NA.NA NA
get_3_cat <- function(v){c(v[1],paste(v[1:2],collapse="."),paste(v[1:3],collapse="."))}
look_and_codes <- look[,1] %>%
strsplit("\\.") %>%
lapply(get_3_cat) %>%
do.call(rbind,.) %>%
as.data.frame %>%
setNames(paste0("code",1:3)) %>%
cbind(look,.)
look_and_codes$IDclean <- gsub("\\.NA","",look_and_codes$ID.1)
# ID Category ID.1 Group code1 code2 code3 IDclean
# 1 1.NA.NA.NA.NA A 1.NA.NA.NA NA 1 1.NA 1.NA.NA 1
# 2 1.1.NA.NA.TOTAL Total under A 1.1.NA.NA TOTAL 1 1.1 1.1.NA 1.1
# 3 1.1.1.NA.TOTAL Of Which in 1s period 1.1.1.NA TOTAL 1 1.1 1.1.1 1.1.1
# 4 14.NA.NA.NA.NA B 14.NA.NA.NA NA 14 14.NA 14.NA.NA 14
# 5 14.1.NA.NA.TOTAL No as B1 14.1.NA.NA TOTAL 14 14.1 14.1.NA 14.1
# 6 14.10.NA.NA.NA And B2 14.10.NA.NA NA 14 14.10 14.10.NA 14.10
output <- look_and_codes %>% merge(look_and_codes[,c("IDclean","Category")]
%>% setNames(c("code1","CategoryI")) %>% unique,all.x=TRUE) %>%
merge(look_and_codes[,c("IDclean","Category")] %>% setNames(c("code2","CategoryII")) %>% unique,all.x=TRUE) %>%
merge(look_and_codes[,c("IDclean","Category")] %>% setNames(c("code3","CategoryIII")) %>% unique,all.x=TRUE)
# code3 code2 code1 ID Category ID.1 Group IDclean CategoryI CategoryII CategoryIII
# 1 1.1.1 1.1 1 1.1.1.NA.TOTAL Of Which in 1s period 1.1.1.NA TOTAL 1.1.1 A Total under A Of Which in 1s period
# 2 1.1.NA 1.1 1 1.1.NA.NA.TOTAL Total under A 1.1.NA.NA TOTAL 1.1 A Total under A <NA>
# 3 1.NA.NA 1.NA 1 1.NA.NA.NA.NA A 1.NA.NA.NA NA 1 A <NA> <NA>
# 4 14.1.NA 14.1 14 14.1.NA.NA.TOTAL No as B1 14.1.NA.NA TOTAL 14.1 B No as B1 <NA>
# 5 14.10.NA 14.10 14 14.10.NA.NA.NA And B2 14.10.NA.NA NA 14.10 B And B2 <NA>
# 6 14.NA.NA 14.NA 14 14.NA.NA.NA.NA B 14.NA.NA.NA NA 14 B <NA> <NA>
clean_output <- output[,c("ID.1","CategoryI","CategoryII","CategoryIII","Group")]
clean_output <- clean_output[match(clean_output$ID.1,look_and_codes$ID.1),]
# ID.1 CategoryI CategoryII CategoryIII Group
# 3 1.NA.NA.NA A <NA> <NA> NA
# 2 1.1.NA.NA A Total under A <NA> TOTAL
# 1 1.1.1.NA A Total under A Of Which in 1s period TOTAL
# 5 14.10.NA.NA B And B2 <NA> NA
# 6 14.NA.NA.NA B <NA> <NA> NA
# 4 14.1.NA.NA B No as B1 <NA> TOTAL
答案 1 :(得分:0)
这是一个具有挑战性的问题。下面的解决方案使用zoo::na.locf()
(最后一次观察结转),并在使用data.table
melt()
从宽到长格式重新整理数据后进行分组。
解决方案应该很容易适应任意数量的列。只有两个地方Category
列的数量是硬编码的。
library(data.table) # CRAN version 1.10.4 used
# define column names
Cats <- paste0("Cat", 1:4)
# create new columns by splitting ID
setDT(trouble)[, (c(Cats, "Group")) := tstrsplit(ID, ".", fixed = TRUE)]
# amend ID as requested by OP:
# remove Group part from ID, keep only first 4 parts
trouble[, ID := stringr::str_extract(ID, "^(\\w+[.]){3}\\w+")]
# add row number
trouble[, rn := .I]
# reshape from wide to long
long <- melt(trouble, measure.vars = c(Cats, "Group"))
# replace "NA"
long[value == "NA", value := NA]
# find level of each row
long[variable %in% Cats & !is.na(value), level := last(variable), rn]
# create new category column, fill with known values
long[variable == level, new := Category]
long[variable == "Group", new := value]
# fill remaining NAs where appropriate, keep NAs at begin of each group
long[order(variable, rn), new := zoo::na.locf(new, na.rm = FALSE),
.(variable, rleid(value))]
# reshape from long to wide
result <- dcast(long, rn + ID ~ variable, value.var = "new")[, rn := NULL][]
result
返回:
ID Cat1 Cat2 Cat3 Cat4 Group
1: 1.NA.NA.NA A NA NA NA NA
2: 1.1.NA.NA A Total under A NA NA TOTAL
3: 1.1.1.NA A Total under A Of Which in 1s period NA TOTAL
4: 14.NA.NA.NA B NA NA NA NA
5: 14.1.NA.NA B No as B1 NA NA TOTAL
6: 14.10.NA.NA B And B2 NA NA NA
7: 14.10.1.NA B And B2 Then B2.1 NA NA
8: 14.10.1.1 B And B2 Then B2.1 Male Children
9: 14.10.1.1 B And B2 Then B2.1 Male Adults
10: 14.10.1.2 B And B2 Then B2.1 Female Children
11: 14.10.1.2 B And B2 Then B2.1 Female Adults
12: 14.10.1.3 B And B2 Then B2.1 Total {(9) to (12)} Children
13: 14.10.1.3 B And B2 Then B2.1 Total {(9) to (12)} Adults
14: 16.NA.NA.NA Month Positon NA NA NA NA
15: 16.1.NA.NA Month Positon Group1 NA NA NA
16: 16.1.1.NA Month Positon Group1 Group1 A NA Balance From Previous Month
17: 16.1.1.NA Month Positon Group1 Group1 A NA Stocks Received
18: 16.1.1.NA Month Positon Group1 Group1 A NA Unusable Stock
19: 16.1.1.NA Month Positon Group1 Group1 A NA Stock Distributed
20: 16.1.1.NA Month Positon Group1 Group1 A NA Total Stock
21: 16.1.2.NA Month Positon Group1 Group1 B NA Balance From Previous Month
22: 16.1.2.NA Month Positon Group1 Group1 B NA Stocks Received
23: 16.1.2.NA Month Positon Group1 Group1 B NA Unusable Stock
24: 16.1.2.NA Month Positon Group1 Group1 B NA Stock Distributed
25: 16.1.2.NA Month Positon Group1 Group1 B NA Total Stock
ID Cat1 Cat2 Cat3 Cat4 Group
OP通过下载链接提供了输入数据和预期结果。
trouble <- structure(list(ID = c("1.NA.NA.NA.NA", "1.1.NA.NA.TOTAL", "1.1.1.NA.TOTAL",
"14.NA.NA.NA.NA", "14.1.NA.NA.TOTAL", "14.10.NA.NA.NA", "14.10.1.NA.NA",
"14.10.1.1.Children", "14.10.1.1.Adults", "14.10.1.2.Children",
"14.10.1.2.Adults", "14.10.1.3.Children", "14.10.1.3.Adults",
"16.NA.NA.NA.NA", "16.1.NA.NA.NA", "16.1.1.NA.Balance From Previous Month",
"16.1.1.NA.Stocks Received", "16.1.1.NA.Unusable Stock", "16.1.1.NA.Stock Distributed",
"16.1.1.NA.Total Stock", "16.1.2.NA.Balance From Previous Month",
"16.1.2.NA.Stocks Received", "16.1.2.NA.Unusable Stock", "16.1.2.NA.Stock Distributed",
"16.1.2.NA.Total Stock"), Category = c("A", "Total under A",
"Of Which in 1s period", "B", "No as B1", "And B2", "Then B2.1",
"Male", "Male", "Female", "Female", "Total {(9) to (12)}", "Total {(9) to (12)}",
"Month Positon", "Group1", "Group1 A", "Group1 A", "Group1 A",
"Group1 A", "Group1 A", "Group1 B", "Group1 B", "Group1 B", "Group1 B",
"Group1 B")), .Names = c("ID", "Category"), row.names = c(NA,
-25L), class = "data.frame")
output <- structure(list(ID = c("1.NA.NA.NA", "1.1.NA.NA", "1.1.1.NA",
"14.NA.NA.NA", "14.1.NA.NA", "14.10.NA.NA", "14.10.1.NA", "14.10.1.1",
"14.10.1.1", "14.10.1.2", "14.10.1.2", "14.10.1.3", "14.10.1.3",
"16.NA.NA.NA", "16.1.NA.NA", "16.1.1.NA", "16.1.1.NA", "16.1.1.NA",
"16.1.1.NA", "16.1.1.NA", "16.1.2.NA", "16.1.2.NA", "16.1.2.NA",
"16.1.2.NA", "16.1.2.NA"), CategoryI = c("A", "A", "A", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "B", "Month Positon",
"Month Positon", "Month Positon", "Month Positon", "Month Positon",
"Month Positon", "Month Positon", "Month Positon", "Month Positon",
"Month Positon", "Month Positon", "Month Positon"), CategoryII = c(NA,
"Total under A", "Total under A", NA, "No as B1", "And B2", "And B2",
"And B2", "And B2", "And B2", "And B2", "And B2", "And B2", NA,
"Group1", "Group1", "Group1", "Group1", "Group1", "Group1", "Group1",
"Group1", "Group1", "Group1", "Group1"), CategoryIII = c(NA,
NA, NA, NA, NA, NA, "Then B2.1", "Then B2.1", "Then B2.1", "Then B2.1",
"Then B2.1", "Then B2.1", "Then B2.1", NA, NA, "Group1 A", "Group1 A",
"Group1 A", "Group1 A", "Group1 A", "Group1 B", "Group1 B", "Group1 B",
"Group1 B", "Group1 B"), CategoryIV = c(NA, NA, NA, NA, NA, NA,
NA, "Male", "Male", "Female", "Female", "Total {(9) to (12)}",
"Total {(9) to (12)}", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), Group = c(NA, "TOTAL", "TOTAL", NA, "TOTAL", NA, NA,
"Children", "Adults", "Children", "Adults", "Children", "Adults",
NA, NA, "Balance From Previous Month", "Stocks Received", "Unusable Stock",
"Stock Distributed", "Total Stock", "Balance From Previous Month",
"Stocks Received", "Unusable Stock", "Stock Distributed", "Total Stock"
)), .Names = c("ID", "CategoryI", "CategoryII", "CategoryIII",
"CategoryIV", "Group"), row.names = c(NA, -25L), class = "data.frame")