使用索引

时间:2017-06-16 11:46:08

标签: r dataframe

我有一个数据框。我在叫他麻烦

> head(trouble)
            ID              Category
1    1.NA.NA.NA.NA                     A
2  1.1.NA.NA.TOTAL         Total under A
3   1.1.1.NA.TOTAL Of Which in 1s period
4   14.NA.NA.NA.NA                     B
5 14.1.NA.NA.TOTAL              No as B1
6   14.10.NA.NA.NA                And B2

我想使用隐藏在故障$ ID(变量)中的分层信息。 仔细看看!

> head(look[,c("ID.1", "Category", "Group")],6)
         ID.1              Category Group
1  1.NA.NA.NA                     A  <NA>
2   1.1.NA.NA         Total under A TOTAL
3    1.1.1.NA Of Which in 1s period TOTAL
4 14.NA.NA.NA                     B  <NA>
5  14.1.NA.NA              No as B1 TOTAL
6 14.10.NA.NA                And B2  <NA>

以上是最初的麻烦$ ID在最后一个分隔符(“。”)上被分开并重命名为ID&amp;基。

现在,我可以手动浏览trouble列,将其转换为以下内容:

ID          CategoryI   CategoryII     CategoryIII             Group
1.NA.NA.NA  A           <NA>           <NA>                    <NA>
1.1.NA.NA   A           Total under A  <NA>                    TOTAL
1.1.1.NA    A           Total under A  Of Which in 1s period   TOTAL

So my question is:我该如何自动执行此操作?

Samples:以下是访问示例trouble&amp;的链接。他的output

PS:这不仅仅是将单个列拆分为多个列。请不要混淆。

2 个答案:

答案 0 :(得分:0)

library(magrittr)

trouble <- read.table(text="ID              Category
1    1.NA.NA.NA.NA                     A
2  1.1.NA.NA.TOTAL         'Total under A'
3   1.1.1.NA.TOTAL 'Of Which in 1s period'
4   14.NA.NA.NA.NA                     B
5 14.1.NA.NA.TOTAL              'No as B1'
6   14.10.NA.NA.NA                'And B2'",stringsAsFactors = FALSE,header=TRUE)

look <-
  trouble$ID %>%
  strsplit("\\.") %>%
  lapply(function(x){c(paste(x[1:4],collapse="."),x[5])}) %>%
  do.call(rbind,.) %>%
  as.data.frame %>%
  setNames(c("ID.1","Group")) %>%
  cbind(trouble,.)

# ID              Category        ID.1 Group
# 1    1.NA.NA.NA.NA                     A  1.NA.NA.NA    NA
# 2  1.1.NA.NA.TOTAL         Total under A   1.1.NA.NA TOTAL
# 3   1.1.1.NA.TOTAL Of Which in 1s period    1.1.1.NA TOTAL
# 4   14.NA.NA.NA.NA                     B 14.NA.NA.NA    NA
# 5 14.1.NA.NA.TOTAL              No as B1  14.1.NA.NA TOTAL
# 6   14.10.NA.NA.NA                And B2 14.10.NA.NA    NA

get_3_cat <- function(v){c(v[1],paste(v[1:2],collapse="."),paste(v[1:3],collapse="."))}

look_and_codes <- look[,1] %>% 
  strsplit("\\.") %>% 
  lapply(get_3_cat) %>%
  do.call(rbind,.) %>%
  as.data.frame %>%
  setNames(paste0("code",1:3)) %>%
  cbind(look,.)

look_and_codes$IDclean <- gsub("\\.NA","",look_and_codes$ID.1)

# ID              Category        ID.1 Group code1 code2    code3 IDclean
# 1    1.NA.NA.NA.NA                     A  1.NA.NA.NA    NA     1  1.NA  1.NA.NA       1
# 2  1.1.NA.NA.TOTAL         Total under A   1.1.NA.NA TOTAL     1   1.1   1.1.NA     1.1
# 3   1.1.1.NA.TOTAL Of Which in 1s period    1.1.1.NA TOTAL     1   1.1    1.1.1   1.1.1
# 4   14.NA.NA.NA.NA                     B 14.NA.NA.NA    NA    14 14.NA 14.NA.NA      14
# 5 14.1.NA.NA.TOTAL              No as B1  14.1.NA.NA TOTAL    14  14.1  14.1.NA    14.1
# 6   14.10.NA.NA.NA                And B2 14.10.NA.NA    NA    14 14.10 14.10.NA   14.10

output <- look_and_codes %>% merge(look_and_codes[,c("IDclean","Category")] 
%>% setNames(c("code1","CategoryI")) %>% unique,all.x=TRUE) %>%
  merge(look_and_codes[,c("IDclean","Category")] %>% setNames(c("code2","CategoryII")) %>% unique,all.x=TRUE) %>%
  merge(look_and_codes[,c("IDclean","Category")] %>% setNames(c("code3","CategoryIII")) %>% unique,all.x=TRUE)

#        code3 code2 code1               ID              Category        ID.1 Group IDclean CategoryI    CategoryII           CategoryIII
#   1    1.1.1   1.1     1   1.1.1.NA.TOTAL Of Which in 1s period    1.1.1.NA TOTAL   1.1.1         A Total under A Of Which in 1s period
#   2   1.1.NA   1.1     1  1.1.NA.NA.TOTAL         Total under A   1.1.NA.NA TOTAL     1.1         A Total under A                  <NA>
#   3  1.NA.NA  1.NA     1    1.NA.NA.NA.NA                     A  1.NA.NA.NA    NA       1         A          <NA>                  <NA>
#   4  14.1.NA  14.1    14 14.1.NA.NA.TOTAL              No as B1  14.1.NA.NA TOTAL    14.1         B      No as B1                  <NA>
#   5 14.10.NA 14.10    14   14.10.NA.NA.NA                And B2 14.10.NA.NA    NA   14.10         B        And B2                  <NA>
#   6 14.NA.NA 14.NA    14   14.NA.NA.NA.NA                     B 14.NA.NA.NA    NA      14         B          <NA>                  <NA>

clean_output <- output[,c("ID.1","CategoryI","CategoryII","CategoryIII","Group")]
clean_output <- clean_output[match(clean_output$ID.1,look_and_codes$ID.1),]

#          ID.1 CategoryI    CategoryII           CategoryIII Group
# 3  1.NA.NA.NA         A          <NA>                  <NA>    NA
# 2   1.1.NA.NA         A Total under A                  <NA> TOTAL
# 1    1.1.1.NA         A Total under A Of Which in 1s period TOTAL
# 5 14.10.NA.NA         B        And B2                  <NA>    NA
# 6 14.NA.NA.NA         B          <NA>                  <NA>    NA
# 4  14.1.NA.NA         B      No as B1                  <NA> TOTAL

答案 1 :(得分:0)

这是一个具有挑战性的问题。下面的解决方案使用zoo::na.locf()最后一次观察结转),并在使用data.table melt()从宽到长格式重新整理数据后进行分组。

解决方案应该很容易适应任意数量的列。只有两个地方Category列的数量是硬编码的。

library(data.table)   # CRAN version 1.10.4 used
# define column names
Cats <- paste0("Cat", 1:4)
# create new columns by splitting ID
setDT(trouble)[, (c(Cats, "Group")) := tstrsplit(ID, ".", fixed = TRUE)]
# amend ID as requested by OP: 
# remove Group part from ID, keep only first 4 parts
trouble[, ID := stringr::str_extract(ID, "^(\\w+[.]){3}\\w+")]
# add row number
trouble[, rn := .I]
# reshape from wide to long
long <- melt(trouble, measure.vars = c(Cats, "Group"))
# replace "NA"
long[value == "NA",  value := NA]
# find level of each row
long[variable %in% Cats & !is.na(value), level := last(variable), rn]
# create new category column, fill with known values
long[variable == level, new := Category]
long[variable == "Group", new := value]
# fill remaining NAs where appropriate, keep NAs at begin of each group
long[order(variable, rn), new := zoo::na.locf(new, na.rm = FALSE), 
     .(variable, rleid(value))]
# reshape from long to wide
result <- dcast(long, rn + ID ~ variable, value.var = "new")[, rn := NULL][]
result

返回:

             ID          Cat1          Cat2                  Cat3                Cat4                       Group
 1:  1.NA.NA.NA             A            NA                    NA                  NA                          NA
 2:   1.1.NA.NA             A Total under A                    NA                  NA                       TOTAL
 3:    1.1.1.NA             A Total under A Of Which in 1s period                  NA                       TOTAL
 4: 14.NA.NA.NA             B            NA                    NA                  NA                          NA
 5:  14.1.NA.NA             B      No as B1                    NA                  NA                       TOTAL
 6: 14.10.NA.NA             B        And B2                    NA                  NA                          NA
 7:  14.10.1.NA             B        And B2             Then B2.1                  NA                          NA
 8:   14.10.1.1             B        And B2             Then B2.1                Male                    Children
 9:   14.10.1.1             B        And B2             Then B2.1                Male                      Adults
10:   14.10.1.2             B        And B2             Then B2.1              Female                    Children
11:   14.10.1.2             B        And B2             Then B2.1              Female                      Adults
12:   14.10.1.3             B        And B2             Then B2.1 Total {(9) to (12)}                    Children
13:   14.10.1.3             B        And B2             Then B2.1 Total {(9) to (12)}                      Adults
14: 16.NA.NA.NA Month Positon            NA                    NA                  NA                          NA
15:  16.1.NA.NA Month Positon        Group1                    NA                  NA                          NA
16:   16.1.1.NA Month Positon        Group1              Group1 A                  NA Balance From Previous Month
17:   16.1.1.NA Month Positon        Group1              Group1 A                  NA             Stocks Received
18:   16.1.1.NA Month Positon        Group1              Group1 A                  NA              Unusable Stock
19:   16.1.1.NA Month Positon        Group1              Group1 A                  NA           Stock Distributed
20:   16.1.1.NA Month Positon        Group1              Group1 A                  NA                 Total Stock
21:   16.1.2.NA Month Positon        Group1              Group1 B                  NA Balance From Previous Month
22:   16.1.2.NA Month Positon        Group1              Group1 B                  NA             Stocks Received
23:   16.1.2.NA Month Positon        Group1              Group1 B                  NA              Unusable Stock
24:   16.1.2.NA Month Positon        Group1              Group1 B                  NA           Stock Distributed
25:   16.1.2.NA Month Positon        Group1              Group1 B                  NA                 Total Stock
             ID          Cat1          Cat2                  Cat3                Cat4                       Group

数据

OP通过下载链接提供了输入数据和预期结果。

trouble <- structure(list(ID = c("1.NA.NA.NA.NA", "1.1.NA.NA.TOTAL", "1.1.1.NA.TOTAL", 
"14.NA.NA.NA.NA", "14.1.NA.NA.TOTAL", "14.10.NA.NA.NA", "14.10.1.NA.NA", 
"14.10.1.1.Children", "14.10.1.1.Adults", "14.10.1.2.Children", 
"14.10.1.2.Adults", "14.10.1.3.Children", "14.10.1.3.Adults", 
"16.NA.NA.NA.NA", "16.1.NA.NA.NA", "16.1.1.NA.Balance From Previous Month", 
"16.1.1.NA.Stocks Received", "16.1.1.NA.Unusable Stock", "16.1.1.NA.Stock Distributed", 
"16.1.1.NA.Total Stock", "16.1.2.NA.Balance From Previous Month", 
"16.1.2.NA.Stocks Received", "16.1.2.NA.Unusable Stock", "16.1.2.NA.Stock Distributed", 
"16.1.2.NA.Total Stock"), Category = c("A", "Total under A", 
"Of Which in 1s period", "B", "No as B1", "And B2", "Then B2.1", 
"Male", "Male", "Female", "Female", "Total {(9) to (12)}", "Total {(9) to (12)}", 
"Month Positon", "Group1", "Group1 A", "Group1 A", "Group1 A", 
"Group1 A", "Group1 A", "Group1 B", "Group1 B", "Group1 B", "Group1 B", 
"Group1 B")), .Names = c("ID", "Category"), row.names = c(NA, 
-25L), class = "data.frame")

output <- structure(list(ID = c("1.NA.NA.NA", "1.1.NA.NA", "1.1.1.NA", 
"14.NA.NA.NA", "14.1.NA.NA", "14.10.NA.NA", "14.10.1.NA", "14.10.1.1", 
"14.10.1.1", "14.10.1.2", "14.10.1.2", "14.10.1.3", "14.10.1.3", 
"16.NA.NA.NA", "16.1.NA.NA", "16.1.1.NA", "16.1.1.NA", "16.1.1.NA", 
"16.1.1.NA", "16.1.1.NA", "16.1.2.NA", "16.1.2.NA", "16.1.2.NA", 
"16.1.2.NA", "16.1.2.NA"), CategoryI = c("A", "A", "A", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "B", "Month Positon", 
"Month Positon", "Month Positon", "Month Positon", "Month Positon", 
"Month Positon", "Month Positon", "Month Positon", "Month Positon", 
"Month Positon", "Month Positon", "Month Positon"), CategoryII = c(NA, 
"Total under A", "Total under A", NA, "No as B1", "And B2", "And B2", 
"And B2", "And B2", "And B2", "And B2", "And B2", "And B2", NA, 
"Group1", "Group1", "Group1", "Group1", "Group1", "Group1", "Group1", 
"Group1", "Group1", "Group1", "Group1"), CategoryIII = c(NA, 
NA, NA, NA, NA, NA, "Then B2.1", "Then B2.1", "Then B2.1", "Then B2.1", 
"Then B2.1", "Then B2.1", "Then B2.1", NA, NA, "Group1 A", "Group1 A", 
"Group1 A", "Group1 A", "Group1 A", "Group1 B", "Group1 B", "Group1 B", 
"Group1 B", "Group1 B"), CategoryIV = c(NA, NA, NA, NA, NA, NA, 
NA, "Male", "Male", "Female", "Female", "Total {(9) to (12)}", 
"Total {(9) to (12)}", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA), Group = c(NA, "TOTAL", "TOTAL", NA, "TOTAL", NA, NA, 
"Children", "Adults", "Children", "Adults", "Children", "Adults", 
NA, NA, "Balance From Previous Month", "Stocks Received", "Unusable Stock", 
"Stock Distributed", "Total Stock", "Balance From Previous Month", 
"Stocks Received", "Unusable Stock", "Stock Distributed", "Total Stock"
)), .Names = c("ID", "CategoryI", "CategoryII", "CategoryIII", 
"CategoryIV", "Group"), row.names = c(NA, -25L), class = "data.frame")