
时间:2018-10-17 05:41:43

标签: r data.table tidyverse janitor


# Example input: every combination of 2 continents x 3 countries x 4 states
# (24 rows), with the columns in the order Countries, States, Continents.
data <- tibble::tibble(
  # Each country label is repeated for its four states, once per continent.
  Countries  = rep(rep(paste("Country", 1:3), each = 4L), times = 2L),
  # States cycle 1..4 within every (continent, country) pair; integer column.
  States     = rep(1:4, times = 6L),
  # First 12 rows belong to "continent 1", the last 12 to "continent 2".
  Continents = rep(paste("continent", 1:2), each = 12L)
)


  1. 理解这类数据的一种粗略方法,是把不同取值个数最少的变量(这里是大洲 Continents)放在数据集的最左侧,把不同取值个数最多的变量放在最右侧。

  2. 更容易理清混乱数据的方法是画出某种树形图:粒度最粗的数据/节点位于顶部(这里是大洲 Continents),粒度最细的位于底部(这里是州 States,类似树叶)。


如果第 2 点很难做到,那么至少该如何实现第 1 点呢?……也许可以先统计任意混乱数据集中每个变量的不同取值个数,再按这个个数对变量(列)进行排序!任何其他附带 R 代码的方法也都会非常有帮助。


# Desired layout: the same 24 rows, with the columns ordered from the fewest
# distinct values to the most (Continents: 2, Countries: 3, States: 4).
data <- tibble::tibble(
  # First 12 rows belong to "continent 1", the last 12 to "continent 2".
  Continents = rep(paste("continent", 1:2), each = 12L),
  # Each country label is repeated for its four states, once per continent.
  Countries  = rep(rep(paste("Country", 1:3), each = 4L), times = 2L),
  # States cycle 1..4 within every (continent, country) pair; integer column.
  States     = rep(1:4, times = 6L)
)

1 个答案:

答案 0 :(得分:1)


# Count the distinct values in every column, then select the columns in
# ascending order of that count (fewest distinct values first).
# vapply() pins the per-column result to a single integer, so the vector
# handed to order() has a predictable type regardless of the input.
data[order(vapply(data, function(x) length(unique(x)), integer(1)))] # returns the data in the desired order

# Simple function for plotting the 'tree': one horizontally centred bar per
# variable, stacked from the smallest value of `lengths` (top) to the largest
# (bottom), each labelled with the matching entry of `names`.
#
# Arguments:
#   lengths  numeric vector; half-width of each bar (e.g. the number of
#            distinct values of each variable).
#   names    labels, indexed in parallel with `lengths`.
#   space    vertical gap left between consecutive bars (default 0.3).
#
# Called for its side effect of drawing on the active graphics device;
# returns NULL invisibly.
plotTree <- function(lengths, names, space = 0.3){
  # Nothing to draw; this also avoids max() on an empty vector, which would
  # give -Inf and make plot() fail on non-finite axis limits.
  if (length(lengths) == 0L) {
    return(invisible(NULL))
  }

  ord    <- order(lengths)
  widths <- lengths[ord]
  labels <- names[ord]
  x_max  <- max(widths)
  y_max  <- length(widths)

  # Empty canvas, symmetric around zero on both axes, without axes or titles.
  plot(NULL, xlim = c(-x_max, x_max), ylim = c(-y_max, y_max),
       axes = FALSE, xlab = "", ylab = "")

  for (i in seq_along(widths)) {
    # Each bar is one unit tall; bar i is shifted down by i * (space + 1).
    offset <- i * (space + 1)
    rect(-widths[i], y_max - 1 - offset, widths[i], y_max - offset, col = i)
    # Colour 1 is black in R's default palette, so the first label is drawn
    # in white to stay readable; the remaining labels are black.
    text(0, y_max - 1/2 - offset, labels[i],
         col = if (i == 1) "white" else "black")
  }

  invisible(NULL)
}

# usage: pass the number of distinct values per column together with the
# column names. vapply() guarantees one integer per column.
plotTree(vapply(data, function(x) length(unique(x)), integer(1)), names(data), space = 0.3)