Question

我有以下数据框：

df <- data.frame(
  var1 = c("A", "C", "C", "B", "D"),
  val1 = c(.89, .99, .67, .88, .92),
  var2 = c("B", "A", "D", "A", "B"),
  val2 = c(.87, .95, .55, .84, .88),
  var3 = c("C", "B", "B", "C", "A"),
  val3 = c(.66, .55, .45, .81, .77),
  var4 = c("D", "D", "A", "D", "C"),
  val4 = c(.44, .33, .43, .77, .69),
  stringsAsFactors = FALSE
)

df
#  var1 val1 var2 val2 var3 val3 var4 val4
#1    A 0.89    B 0.87    C 0.66    D 0.44
#2    C 0.99    A 0.95    B 0.55    D 0.33
#3    C 0.67    D 0.55    B 0.45    A 0.43
#4    B 0.88    A 0.84    C 0.81    D 0.77
#5    D 0.92    B 0.88    A 0.77    C 0.69

我要完成的工作是，如果var1是C或D，那么我想用var1和{{ 1}}，其对应值为A。另外，对于满足此条件的行，我需要val1，A和var2以及var3，var4和val2保持其原始顺序。以下是我的预期输出：

val3

原始数据集中的行val4将始终按降序排列（即# var1 val1 var2 val2 var3 val3 var4 val4 #1 A 0.89 B 0.87 C 0.66 D 0.44 #2 A 0.95 C 0.99 B 0.55 D 0.33 #3 A 0.43 C 0.67 D 0.55 B 0.45 #4 B 0.88 A 0.84 C 0.81 D 0.77 #5 A 0.77 D 0.92 B 0.88 C 0.69> vals> val1> val2），并且每个字母将只在该行中出现一次。

我可以通过一个相当笨拙的val3循环来获得预期的输出：

val4

但是我希望有一个更优雅的（理想的矢量化）解决方案。

Answer 1

这是向量化的解决方案。对于判断变量名称表示歉意。我将在几秒钟内添加评论。

> a_vars <- apply(df, 1, function(vec) which(vec == 'A')) # these are the columns in each row which contain an 'A'
> a_cols <- a_vars + 1 # these are the corresponding value columns
> 
> bad_rows <- (1:nrow(df))[df$var1 %in% c('C', 'D')] # these are the rows which have a C or D in the first column
> shifts <- sequence(a_vars[bad_rows] - 1) # we'll need to shift certain values in each bad row; shift stores their columns
> bad_vals <- df$val1[a_cols] # these are the values in the column containing an A 
> 
> shift_vals <- df[cbind(rep(bad_rows, (a_vars[bad_rows] - 1)), shifts)] # these are the values which need to be shifted over
> 
> df$val1[bad_rows] <- df[cbind(bad_rows, a_cols[df$var1 %in% c('C', 'D')])] # shift the values from the A columns into the first column in the bad rows
> df$var1[bad_rows] <- 'A' # and make those variables 'A's
> df[cbind(rep(bad_rows, (a_vars[bad_rows] - 1)), shifts + 2)] <- shift_vals # now put the shifting values into their correct columns
> df
  var1 val1 var2 val2 var3 val3 var4 val4
1    A 0.89    B 0.87    C 0.66    D 0.44
2    A 0.95    C 0.99    B 0.55    D 0.33
3    A 0.43    C 0.67    D 0.55    B 0.45
4    B 0.88    A 0.84    C 0.81    D 0.77
5    A 0.77    D 0.92    B 0.88    C 0.69

Answer 2

# using data.table
require(data.table)
dt <- as.data.table(df)

首先，我们需要转换数据：

# get max index (if you have more than 4 vars)
# other approaches could be used here
i <- max(as.integer(substr(grep('var', colnames(dt), value = T), 4, 4)))
# split the data by variables
x <- lapply(1:i, function(x) {
  k <- dt[, grep(x, colnames(dt)), with = F]
  setnames(k, c('var', 'val'))
  k[, group := .I]
  })
x <- rbindlist(x)
x
#     var  val group
#  1:   A 0.89     1
#  2:   C 0.99     2
#  3:   C 0.67     3
# ---
# 18:   A 0.43     3
# 19:   D 0.77     4
# 20:   C 0.69     5

我认为这是更易于操作的结构。也许您可以使用Google“长数据格式与宽数据格式” ...

# we can now calculate order index, representing your column order
setorder(x, group, -val)
x[, orderI := 1:.N, by = group]
x

# now your logic:
# add index for groups that have C D as first:
x[, CDisFirst := any(orderI == 1 & var %in% c('C', 'D')), by = group]
# add index that A need to be first
x[, aFirst := CDisFirst & var == 'A']
# order now by groups, aFirst and val
setorder(x, group, -aFirst, -val)
x[, newOrder := 1:.N, by = group] # adds newOrder
x
#     var  val group orderI CDisFirst aFirst newOrder
#  1:   A 0.89     1      1     FALSE  FALSE        1
#  2:   B 0.87     1      2     FALSE  FALSE        2
#  3:   C 0.66     1      3     FALSE  FALSE        3
# ---
# 18:   D 0.92     5      1      TRUE  FALSE        2
# 19:   B 0.88     5      2      TRUE  FALSE        3
# 20:   C 0.69     5      4      TRUE  FALSE        4

现在，如果您愿意（但我建议您使用此结构），我们可以将数据转换回宽格式：

x <- dcast(x, group ~ newOrder, value.var = c('var', 'val'))
# do some reformatting, if you want:
setnames(x, gsub('_', '', colnames(x)))
x[, group := NULL] # deletes group column
setcolorder(x, colnames(df))
x
#    var1 val1 var2 val2 var3 val3 var4 val4
# 1:    A 0.89    B 0.87    C 0.66    D 0.44
# 2:    A 0.95    C 0.99    B 0.55    D 0.33
# 3:    A 0.43    C 0.67    D 0.55    B 0.45
# 4:    B 0.88    A 0.84    C 0.81    D 0.77
# 5:    A 0.77    D 0.92    B 0.88    C 0.69

# test, if matches your results (after conversion to data.frame)
all.equal(df_new, as.data.frame(x))
# [1] TRUE

5万行原始数据的基准

system.time(original())
# user  system elapsed 
# 28.23   22.25   51.22
system.time(minem())
# user  system elapsed 
# 0.29    0.00    0.30 
system.time(Joseph())
# user  system elapsed 
# 1.75    0.03    1.83

Answer 3

这在data.table中非常简单-类似于minem的方法，具有更严格的熔解和dcast公式，以及经过改进的随机播放方法：

# data table package and setup
library(data.table)
setDT(df)

# set row index to work within rows
df[, rowid := 1:.N]

# melt data.table to allow for easier indexing
df <- melt(df, id.vars = "rowid", measure.vars = list(grep("var",colnames(df)),grep("val",colnames(df))), value.name = c("var","val"), variable.factor = FALSE)
df[, variable := as.integer(variable)]

# set index of A variables with C/D in starting position to 0 
df[rowid %in% df[(var == "C" | var == "D") & variable == 1, rowid] & var == "A", variable := 0]

#shuffle
df <- df[order(rowid, variable)]

# reset index to 1..N instead of 0..N
df[, variable := 1:.N, by = .(rowid)]

# back to table format
df <- dcast(df, rowid ~ variable, value.var = list("var", "val"), sep = "")

# reorder
setcolorder(df,order(as.numeric(gsub("\\D+","",colnames(df)))))

df
   var1 val1 var2 val2 var3 val3 var4 val4 rowid
1:    A 0.89    B 0.87    C 0.66    D 0.44     1
2:    A 0.95    C 0.99    B 0.55    D 0.33     2
3:    A 0.43    C 0.67    D 0.55    B 0.45     3
4:    B 0.88    A 0.84    C 0.81    D 0.77     4
5:    A 0.77    D 0.92    B 0.88    C 0.69     5

Answer 4

不确定这是否算是更优雅，但是：

for(i in 1:nrow(df)){
  if(df$var1[i] == "C"){
    # Holds val1 if var1 is "C"
    oldval <- df$val1[i]
    # Which column has the new value in it?
    col <- which(df[i, ] == "A") + 1
    # Replace the values
    df[i, "var1"] <- "A"
    df[i, "val1"] <- df[i, col]
    df[i, (col - 1)] <- "C"
    df[i, col] <- oldval
  }
  # To maintain original ordering
  if(df$val3[i] > df$val2[i]){
    # Hold the vars and values
    vars <- df[i, paste0("var", 2:3)]
    vals <- df[i, paste0("val", 2:3)]
    # Replace the values
    df[i, paste0("var", 2:3)] <- rev(vars)
    df[i, paste0("val", 2:3)] <- rev(vals)
  }
}

在数据框中重新排列按行值

4 个答案: