将普通R数据帧转换为参差不齐的格式(一个可用的)

时间:2016-12-25 19:56:57

标签: r dataframe tabular

R中,函数ftable()默认创建一个带有所谓的不规则外观的表:

data(UCBAdmissions)
ftable(UCBAdmissions)

...

                Dept   A   B   C   D   E   F
Admit    Gender                             
Admitted Male        512 353 120 138  53  22
         Female       89  17 202 131  94  24
Rejected Male        313 207 205 279 138 351
         Female       19   8 391 244 299 317

行和列是“参差不齐”的,因为标签仅在更改时显示,明显的惯例是从上到下读取行,从左到右读取列。 (https://cran.r-project.org/doc/manuals/r-devel/R-data.html#Flat-contingency-tables

问题:

我怎样才能得到同样的东西"衣衫褴褛的"正常data.frame对象的外观?

可重复示例:

before= data.frame(C1= c(rep("A", 5), rep("L", 2)),
                   C2= c("B", rep("E", 3), rep("K", 2), "L"),
                   C3= c("C", "F", rep("H", 5)),
                   C4= c("D", "G", "I", rep("J", 4)), 
                   stringsAsFactors = FALSE)

before

...

  C1 C2 C3 C4
1  A  B  C  D
2  A  E  F  G
3  A  E  H  I
4  A  E  H  J
5  A  K  H  J
6  L  K  H  J
7  L  L  H  J

一个函数如何将对象before转换为类after的新对象data.frame,该对象使用print(after)打印到控制台,如下所示。

  C1 C2 C3 C4
1  A  B  C  D
2     E  F  G
3        H  I
4           J
5     K  H  J
6  L  K  H  J
7     L  H  J

如果有必要,可以接受此演示文稿格式的遗漏数据丢失。

2 个答案:

答案 0 :(得分:0)

也许不是最优雅的解决方案(a。许多for循环,b。强制任何类型的列到字符,c。没有输入断言,d。慢等等),但是跟随函数{{ 1}}似乎基本上按照示例的要求工作:

rag_blank

...

## Task

before= data.frame(C1= c(rep("A", 5), rep("L", 2)),
                   C2= c("B", rep("E", 3), rep("K", 2), "L"),
                   C3= c("C", "F", rep("H", 5)),
                   C4= c("D", "G", "I", rep("J", 4)), 
                   stringsAsFactors = FALSE)

before


## Solution

library(dplyr)

rag_blank= function(x, cols= seq_along(x), blank= ":"){

  # Copy input
  res= x

  # 1st step: blank trailing cells
  for(df_col in cols){
    res[, df_col]= as.character(unlist(res[, df_col]))
    x[, df_col]= as.character(unlist(x[, df_col]))
    re= rle(unlist(res[, df_col]))
    re_df= data.frame(value= re$values,
                      length= re$lengths,
                      stringsAsFactors = F) %>%
      mutate(idx_start= cumsum(length) - length + 2,
             idx_end= idx_start + length -2)
    for(re_row in 1:nrow(re_df)){
      if(re_df$idx_start[re_row]<= re_df$idx_end[re_row]){
        res[(re_df$idx_start[re_row]:re_df$idx_end[re_row]), df_col]= blank
      }
    }
  }

  # 2nd step: restore value if blank, resp. changed from 1st step but left cell it is not blank
  for(df_col in cols[-1]){
    changed_before= res[, df_col]!= x[, df_col]
    left_not_changed= res[, df_col-1]== x[, df_col-1]
    to_change= changed_before & left_not_changed
    res[to_change, df_col]= x[to_change, df_col]
  }

  res
}

rag_blank(before)

在某些情况下应用空白是不合适的,那么这可能会有所帮助:

  C1 C2 C3 C4
1  A  B  C  D
2  :  E  F  G
3  :  :  H  I
4  :  :  :  J
5  :  K  H  J
6  L  K  H  J
7  :  L  H  J

...

rag_index= function(x){
  rag_blank(x) != x
}

rag_index(before)

更忙碌的例子:

        C1    C2    C3    C4
[1,] FALSE FALSE FALSE FALSE
[2,]  TRUE FALSE FALSE FALSE
[3,]  TRUE  TRUE FALSE FALSE
[4,]  TRUE  TRUE  TRUE FALSE
[5,]  TRUE FALSE FALSE FALSE
[6,] FALSE FALSE FALSE FALSE
[7,]  TRUE FALSE FALSE FALSE

...

data("diamonds", package = "ggplot2")
print(rag_blank(x= head(diamonds, 30)), n= 100)

如果有更优雅的解决方案,请感谢您的反馈。

答案 1 :(得分:0)

这是我为此提出的一组功能:

# The main function
ragged <- function(indt, keys, blank = "") {
  require(data.table)
  indt <- setkeyv(as.data.table(indt), keys)
  vals <- setdiff(names(indt), keys)
  nams <- paste0(keys, "_copy")
  for (i in seq_along(nams)) {
    indt[, (nams[i]) := c(as.character(get(key(indt)[i])[1]),
                          rep(blank, .N-1)), by = eval(keys[seq(i)])]
  }
  out <- cbind(indt[, ..nams], indt[, ..vals])
  out <- setnames(out, nams, keys)[]
  ## There has to be a better way to do this than to store the original object and the resulting object
  out <- list(indt = indt[, (nams) := NULL][], out = out, keys = keys, blank = blank)
  class(out) <- c("ragged", class(out))
  out
}

# The print method
print.ragged <- function(x, ...) {
  print(x$out)
}

# Allowing for extraction
`[.ragged` <- function(inragged, ...) {
  out <- inragged$indt[...]
  out <- ragged(out, keys = intersect(inragged$keys, names(out)), blank = inragged$blank)
  out
}

它使用data.table包,并首先使用setkey对数据进行排序。我认为,如果要进行这种分层显示,则对数据进行排序是有意义的。

以下是before数据集的一些示例。

# Nesting just the first two columns.
ragged(before, c("C1", "C2"))
##    C1 C2 C3 C4
## 1:  A  B  C  D
## 2:     E  F  G
## 3:        H  I
## 4:        H  J
## 5:     K  H  J
## 6:  L  K  H  J
## 7:     L  H  J

# Nesting with all the columns and inserting a marker
ragged(before, names(before), ":")
##    C1 C2 C3 C4
## 1:  A  B  C  D
## 2:  :  E  F  G
## 3:  :  :  H  I
## 4:  :  :  :  J
## 5:  :  K  H  J
## 6:  L  K  H  J
## 7:  :  L  H  J

请注意,由于数据是在使用ragged之前进行排序的,因此在head(diamonds, 30)上使用此函数的结果将不同于您的方法。

ragged(head(diamonds, 30), names(diamonds), ":")
##     carat       cut color clarity depth table price    x    y    z
##  1:   0.2   Premium     E     SI2  60.2    62   345 3.79 3.75 2.27
##  2:  0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
##  3:  0.22      Fair     E     VS2  65.1    61   337 3.87 3.78 2.49
##  4:     :   Premium     F     SI1  60.4    61   342 3.88 3.84 2.33
##  5:  0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
##  6:     : Very Good     D     VS2  60.5    61   357 3.96 3.97  2.4
##  7:     :         :     E     VS2  63.8    55   352 3.85 3.92 2.48
##  8:     :         :     F     VS1  60.9    57   357 3.96 3.99 2.42
##  9:     :         :     G    VVS2  60.4    58   354 3.97 4.01 2.41
## 10:     :         :     H     VS1  59.4    61   338    4 4.05 2.39
## 11:     :         :     :       :    61    57   353 3.94 3.96 2.41
## 12:     :     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
## 13:     :         :     J     VS1  62.8    56   340 3.93  3.9 2.46
## 14:  0.24 Very Good     I    VVS1  62.3    57   336 3.95 3.98 2.47
## 15:     :         :     J    VVS2  62.8    57   336 3.94 3.96 2.48
## 16:     :   Premium     I     VS1  62.5    57   355 3.97 3.94 2.47
## 17:  0.26 Very Good     H     SI1  61.9    55   337 4.07 4.11 2.53
## 18:  0.29   Premium     I     VS2  62.4    58   334  4.2 4.23 2.63
## 19:   0.3      Good     I     SI2  63.3    56   351 4.26  4.3 2.71
## 20:     :         :     J     SI1  63.4    54   351 4.23 4.29  2.7
## 21:     :         :     :       :  63.8    56   351 4.23 4.26 2.71
## 22:     :         :     :       :    64    55   339 4.25 4.28 2.73
## 23:     : Very Good     J     SI1  62.7    59   351 4.21 4.27 2.66
## 24:     :         :     :     VS2  62.2    57   357 4.28  4.3 2.67
## 25:     :     Ideal     I     SI2    62    54   348 4.31 4.34 2.68
## 26:  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
## 27:     : Very Good     J     SI1  58.1    62   353 4.44 4.47 2.59
## 28:     :         :     :       :  59.4    62   353 4.39 4.43 2.62
## 29:     :     Ideal     J     SI2  62.2    54   344 4.35 4.37 2.71
## 30:  0.32   Premium     E      I1  60.9    58   345 4.38 4.42 2.68
##     carat       cut color clarity depth table price    x    y    z

借助[.ragged函数,我们可以继续对ragged对象进行操作。例如:

ragged(head(diamonds, 30), c("cut", "color"), ":")[, mean(price), .(cut, color)]
##           cut color       V1
##  1:      Fair     E 337.0000
##  2:      Good     E 327.0000
##  3:         :     I 351.0000
##  4:         :     J 344.0000
##  5: Very Good     D 357.0000
##  6:         :     E 352.0000
##  7:         :     F 357.0000
##  8:         :     G 354.0000
##  9:         :     H 342.6667
## 10:         :     I 336.0000
## 11:         :     J 350.0000
## 12:   Premium     E 338.6667
## 13:         :     F 342.0000
## 14:         :     I 344.5000
## 15:     Ideal     E 326.0000
## 16:         :     I 348.0000
## 17:         :     J 342.0000