Question

R中的

'order'在Stata中看起来像'排序'。这是一个数据集，例如（仅列出变量名称）：

v1 v2 v3 v4 v5 v6 v7 v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18

这是我期望的输出：

v1 v2 v3 v4 v5 v7 v8 v9 v10 v11 v12 v17 v18 v13 v14 v15 v6 v16

在R中，我有两种方式：

data <- data[,c(1:5,7:12,17:18,13:15,6,16)]

OR

names <- c("v1", "v2", "v3", "v4", "v5", "v7", "v8", "v9", "v10", "v11", "v12",  "v17", "v18", "v13", "v14", "v15", "v6", "v16")
data <- data[names]

要在Stata中获得相同的输出，我可以运行2行：

order v17 v18, before(v13)
order v6 v16, last

在上面的理想数据中，我们可以知道我们想要处理的变量的位置。但在大多数实际情况中，我们有像'年龄''性别'这样的变量，没有位置指标，我们可能在一个数据集中有超过50个变量。那么Stata中'order'的优势就更明显了。我们不需要知道变量的确切位置，只需输入其名称：

order age, after(gender)

R中是否有基本功能来处理这个问题，还是我可以获得一个包？提前谢谢。

tweetinfo <- data.frame(uid=1:50, mid=2:51, annotations=3:52, bmiddle_pic=4:53, created_at=5:54, favorited=6:55, geo=7:56, in_reply_to_screen_name=8:57, in_reply_to_status_id=9:58, in_reply_to_user_id=10:59, original_pic=11:60, reTweetId=12:61, reUserId=13:62, source=14:63, thumbnail_pic=15:64, truncated=16:65)
noretweetinfo <- data.frame(uid=21:50, mid=22:51, annotations=23:52, bmiddle_pic=24:53, created_at=25:54, favorited=26:55, geo=27:56, in_reply_to_screen_name=28:57, in_reply_to_status_id=29:58, in_reply_to_user_id=30:59, original_pic=31:60, reTweetId=32:61, reUserId=33:62, source=34:63, thumbnail_pic=35:64, truncated=36:65)
retweetinfo <- data.frame(uid=41:50, mid=42:51, annotations=43:52, bmiddle_pic=44:53, created_at=45:54, deleted=46:55, favorited=47:56, geo=48:57, in_reply_to_screen_name=49:58, in_reply_to_status_id=50:59, in_reply_to_user_id=51:60, original_pic=52:61, source=53:62, thumbnail_pic=54:63, truncated=55:64)
tweetinfo$type <- "ti"
noretweetinfo$type <- "nr"
retweetinfo$type <- "rt"
gtinfo <- rbind(tweetinfo, noretweetinfo)
gtinfo$deleted=""
gtinfo <- gtinfo[,c(1:16,18,17)]
retweetinfo <- transform(retweetinfo, reTweetId="", reUserId="")
retweetinfo <- retweetinfo[,c(1:5,7:12,17:18,13:15,6,16)]
gtinfo <- rbind(gtinfo, retweetinfo)
write.table(gtinfo, file="C:/gtinfo.txt", row.names=F, col.names=T, sep="\t", quote=F)
# rm(list=ls(all=T))

Answer 1

因为我拖延和试验不同的东西，这是我掀起的一个功能。最终，它取决于append：

moveme <- function(invec, movecommand) {
  movecommand <- lapply(strsplit(strsplit(movecommand, ";")[[1]], ",|\\s+"), 
                        function(x) x[x != ""])
  movelist <- lapply(movecommand, function(x) {
    Where <- x[which(x %in% c("before", "after", "first", "last")):length(x)]
    ToMove <- setdiff(x, Where)
    list(ToMove, Where)
  })
  myVec <- invec
  for (i in seq_along(movelist)) {
    temp <- setdiff(myVec, movelist[[i]][[1]])
    A <- movelist[[i]][[2]][1]
    if (A %in% c("before", "after")) {
      ba <- movelist[[i]][[2]][2]
      if (A == "before") {
        after <- match(ba, temp)-1
      } else if (A == "after") {
        after <- match(ba, temp)
      }    
    } else if (A == "first") {
      after <- 0
    } else if (A == "last") {
      after <- length(myVec)
    }
    myVec <- append(temp, values = movelist[[i]][[1]], after = after)
  }
  myVec
}

以下是一些表示数据集名称的示例数据：

x <- paste0("v", 1:18)

想象一下，我们现在想要“v3”和“v18”之前的“v3”，“v6”和“v16”，以及开头的“v5”：

moveme(x, "v17, v18 before v3; v6, v16 last; v5 first")
#  [1] "v5"  "v1"  "v2"  "v17" "v18" "v3"  "v4"  "v7"  "v8"  "v9"  "v10" "v11" "v12"
# [14] "v13" "v14" "v15" "v6"  "v16"

因此，对于名为“df”的data.frame来说，显而易见的用法是：

df[moveme(names(df), "how you want to move the columns")]

并且，对于名为“DT”的data.table（正如@mnel指出的那样，将更有效地记忆）：

setcolorder(DT, moveme(names(DT), "how you want to move the columns"))

请注意，复合移动由分号指定。

公认的举措是：

before（将指定的列移到另一个命名列之前）
after（将指定的列移到另一个命名列之后）
first（将指定列移至第一个位置）
last（将指定列移至最后位置）

Answer 2

我遇到了你的问题。我现在有代码提供：

move <- function(data,variable,before) {
  m <- data[variable]
  r <- data[names(data)!=variable]
  i <- match(before,names(data))
  pre <- r[1:i-1]
  post <- r[i:length(names(r))]
  cbind(pre,m,post)
}

# Example.
library(MASS)
data(painters)
str(painters)

# Move 'Expression' variable before 'Drawing' variable.
new <- move(painters,"Expression","Drawing")
View(new)

Answer 3

您可以编写自己的功能来执行此操作。

以下内容将使用与stata类似的语法为您提供列名的新订单

where是一个有4种可能性的命名列表
- list(last = T)
- list(first = T)
- list(before = x)其中x是有问题的变量名称
- list(after = x)其中x是有问题的变量名称
sorted = T将按字典顺序排序var_list（来自alphabetic命令

sequential

stata

该函数仅对名称起作用（一旦将data.frame对象作为data传递，并返回一个重新排序的名称列表

例如

stata.order <- function(var_list, where, sorted = F, data) {
    all_names = names(data)
    # are all the variable names in
    check <- var_list %in% all_names
    if (any(!check)) {
        stop("Not all variables in var_list exist within  data")
    }
    if (names(where) == "before") {
        if (!(where %in% all_names)) {
            stop("before variable not in the data set")
        }
    }
    if (names(where) == "after") {
        if (!(where %in% all_names)) {
            stop("after variable not in the data set")
        }
    }

    if (sorted) {
        var_list <- sort(var_list)
    }
    where_in <- which(all_names %in% var_list)
    full_list <- seq_along(data)
    others <- full_list[-c(where_in)]

    .nwhere <- names(where)
    if (!(.nwhere %in% c("last", "first", "before", "after"))) {
        stop("where must be a list of a named element first, last, before or after")
    }

    do_what <- switch(names(where), last = length(others), first = 0, before = which(all_names[others] == 
        where) - 1, after = which(all_names[others] == where))

    new_order <- append(others, where_in, do_what)
    return(all_names[new_order])
}

tmp <- as.data.frame(matrix(1:100, ncol = 10))

stata.order(var_list = c("V2", "V5"), where = list(last = T), data = tmp)

##  [1] "V1"  "V3"  "V4"  "V6"  "V7"  "V8"  "V9"  "V10" "V2"  "V5" 

stata.order(var_list = c("V2", "V5"), where = list(first = T), data = tmp)

##  [1] "V2"  "V5"  "V1"  "V3"  "V4"  "V6"  "V7"  "V8"  "V9"  "V10"

stata.order(var_list = c("V2", "V5"), where = list(before = "V6"), data = tmp)

##  [1] "V1"  "V3"  "V4"  "V2"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10"

stata.order(var_list = c("V2", "V5"), where = list(after = "V4"), data = tmp)

##  [1] "V1"  "V3"  "V4"  "V2"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10"

# throws an error
stata.order(var_list = c("V2", "V5"), where = list(before = "v11"), data = tmp)

## Error: before variable not in the data set

如果您想有效地重新排序内存（通过引用而不复制），请使用data.table

DT <- data.table(tmp)
# sets by reference, no copying
setcolorder(DT, stata.order(var_list = c("V2", "V5"), where = list(after = "V4"), 
    data = DT))

DT

##     V1 V3 V4 V2 V5 V6 V7 V8 V9 V10
##  1:  1 21 31 11 41 51 61 71 81  91
##  2:  2 22 32 12 42 52 62 72 82  92
##  3:  3 23 33 13 43 53 63 73 83  93
##  4:  4 24 34 14 44 54 64 74 84  94
##  5:  5 25 35 15 45 55 65 75 85  95
##  6:  6 26 36 16 46 56 66 76 86  96
##  7:  7 27 37 17 47 57 67 77 87  97
##  8:  8 28 38 18 48 58 68 78 88  98
##  9:  9 29 39 19 49 59 69 79 89  99
## 10: 10 30 40 20 50 60 70 80 90 100

Answer 4

目前还不清楚你想做什么，但你的第一句话让我假设你想要对数据集进行排序。

实际上，有一个内置的order函数，它返回有序序列的索引。你在找这个吗？

> x <- c(3,2,1)

> order(x)
[1] 3 2 1

> x[order(x)]
[1] 1 2 3

Answer 5

这应该给你相同的文件：

#snip
gtinfo <- rbind(tweetinfo, noretweetinfo)
gtinfo$deleted=""
retweetinfo <- transform(retweetinfo, reTweetId="", reUserId="")
gtinfo <- rbind(gtinfo, retweetinfo)
gtinfo <-gtinfo[,c(1:16,18,17)]
#snip

可以在R中实现Strata的顺序函数之类的函数，但我认为没有太多的需求。

Answer 6

包dplyr和函数dplyr::relocate是dplyr 1.0.0中引入的新动词，完全可以满足您的需求。

library(dplyr)

data %>% relocate(v17, v18, .before = v13)

data %>% relocate(v6, v16, .after = last_col())

data %>% relocate(age, .after = gender)

Stata'order'命令是否有等效的R函数？

6 个答案: