我有以下数据框:
df <- data.frame(
var1 = c("A", "C", "C", "B", "D"),
val1 = c(.89, .99, .67, .88, .92),
var2 = c("B", "A", "D", "A", "B"),
val2 = c(.87, .95, .55, .84, .88),
var3 = c("C", "B", "B", "C", "A"),
val3 = c(.66, .55, .45, .81, .77),
var4 = c("D", "D", "A", "D", "C"),
val4 = c(.44, .33, .43, .77, .69),
stringsAsFactors = FALSE
)
df
# var1 val1 var2 val2 var3 val3 var4 val4
#1 A 0.89 B 0.87 C 0.66 D 0.44
#2 C 0.99 A 0.95 B 0.55 D 0.33
#3 C 0.67 D 0.55 B 0.45 A 0.43
#4 B 0.88 A 0.84 C 0.81 D 0.77
#5 D 0.92 B 0.88 A 0.77 C 0.69
我要完成的工作是,如果var1
是C
或D
,那么我想用var1
和{{ 1}},其对应值为A
。另外,对于满足此条件的行,我需要val1
,A
和var2
以及var3
,var4
和val2
保持其原始顺序。以下是我的预期输出:
val3
原始数据集中的行val4
将始终按降序排列(即# var1 val1 var2 val2 var3 val3 var4 val4
#1 A 0.89 B 0.87 C 0.66 D 0.44
#2 A 0.95 C 0.99 B 0.55 D 0.33
#3 A 0.43 C 0.67 D 0.55 B 0.45
#4 B 0.88 A 0.84 C 0.81 D 0.77
#5 A 0.77 D 0.92 B 0.88 C 0.69
> vals
> val1
> val2
),并且每个字母将只在该行中出现一次。
我可以通过一个相当笨拙的val3
循环来获得预期的输出:
val4
但是我希望有一个更优雅的(理想的矢量化)解决方案。
答案 0 :(得分:1)
这是向量化的解决方案。对于判断变量名称表示歉意。我将在几秒钟内添加评论。
> a_vars <- apply(df, 1, function(vec) which(vec == 'A')) # these are the columns in each row which contain an 'A'
> a_cols <- a_vars + 1 # these are the corresponding value columns
>
> bad_rows <- (1:nrow(df))[df$var1 %in% c('C', 'D')] # these are the rows which have a C or D in the first column
> shifts <- sequence(a_vars[bad_rows] - 1) # we'll need to shift certain values in each bad row; shift stores their columns
> bad_vals <- df$val1[a_cols] # these are the values in the column containing an A
>
> shift_vals <- df[cbind(rep(bad_rows, (a_vars[bad_rows] - 1)), shifts)] # these are the values which need to be shifted over
>
> df$val1[bad_rows] <- df[cbind(bad_rows, a_cols[df$var1 %in% c('C', 'D')])] # shift the values from the A columns into the first column in the bad rows
> df$var1[bad_rows] <- 'A' # and make those variables 'A's
> df[cbind(rep(bad_rows, (a_vars[bad_rows] - 1)), shifts + 2)] <- shift_vals # now put the shifting values into their correct columns
> df
var1 val1 var2 val2 var3 val3 var4 val4
1 A 0.89 B 0.87 C 0.66 D 0.44
2 A 0.95 C 0.99 B 0.55 D 0.33
3 A 0.43 C 0.67 D 0.55 B 0.45
4 B 0.88 A 0.84 C 0.81 D 0.77
5 A 0.77 D 0.92 B 0.88 C 0.69
答案 1 :(得分:1)
# using data.table
require(data.table)
dt <- as.data.table(df)
首先,我们需要转换数据:
# get max index (if you have more than 4 vars)
# other approaches could be used here
i <- max(as.integer(substr(grep('var', colnames(dt), value = T), 4, 4)))
# split the data by variables
x <- lapply(1:i, function(x) {
k <- dt[, grep(x, colnames(dt)), with = F]
setnames(k, c('var', 'val'))
k[, group := .I]
})
x <- rbindlist(x)
x
# var val group
# 1: A 0.89 1
# 2: C 0.99 2
# 3: C 0.67 3
# ---
# 18: A 0.43 3
# 19: D 0.77 4
# 20: C 0.69 5
我认为这是更易于操作的结构。也许您可以使用Google“长数据格式与宽数据格式” ...
# we can now calculate order index, representing your column order
setorder(x, group, -val)
x[, orderI := 1:.N, by = group]
x
# now your logic:
# add index for groups that have C D as first:
x[, CDisFirst := any(orderI == 1 & var %in% c('C', 'D')), by = group]
# add index that A need to be first
x[, aFirst := CDisFirst & var == 'A']
# order now by groups, aFirst and val
setorder(x, group, -aFirst, -val)
x[, newOrder := 1:.N, by = group] # adds newOrder
x
# var val group orderI CDisFirst aFirst newOrder
# 1: A 0.89 1 1 FALSE FALSE 1
# 2: B 0.87 1 2 FALSE FALSE 2
# 3: C 0.66 1 3 FALSE FALSE 3
# ---
# 18: D 0.92 5 1 TRUE FALSE 2
# 19: B 0.88 5 2 TRUE FALSE 3
# 20: C 0.69 5 4 TRUE FALSE 4
现在,如果您愿意(但我建议您使用此结构),我们可以将数据转换回宽格式:
x <- dcast(x, group ~ newOrder, value.var = c('var', 'val'))
# do some reformatting, if you want:
setnames(x, gsub('_', '', colnames(x)))
x[, group := NULL] # deletes group column
setcolorder(x, colnames(df))
x
# var1 val1 var2 val2 var3 val3 var4 val4
# 1: A 0.89 B 0.87 C 0.66 D 0.44
# 2: A 0.95 C 0.99 B 0.55 D 0.33
# 3: A 0.43 C 0.67 D 0.55 B 0.45
# 4: B 0.88 A 0.84 C 0.81 D 0.77
# 5: A 0.77 D 0.92 B 0.88 C 0.69
# test, if matches your results (after conversion to data.frame)
all.equal(df_new, as.data.frame(x))
# [1] TRUE
5万行原始数据的基准
system.time(original())
# user system elapsed
# 28.23 22.25 51.22
system.time(minem())
# user system elapsed
# 0.29 0.00 0.30
system.time(Joseph())
# user system elapsed
# 1.75 0.03 1.83
答案 2 :(得分:1)
这在data.table中非常简单-类似于minem的方法,具有更严格的熔解和dcast公式,以及经过改进的随机播放方法:
# data table package and setup
library(data.table)
setDT(df)
# set row index to work within rows
df[, rowid := 1:.N]
# melt data.table to allow for easier indexing
df <- melt(df, id.vars = "rowid", measure.vars = list(grep("var",colnames(df)),grep("val",colnames(df))), value.name = c("var","val"), variable.factor = FALSE)
df[, variable := as.integer(variable)]
# set index of A variables with C/D in starting position to 0
df[rowid %in% df[(var == "C" | var == "D") & variable == 1, rowid] & var == "A", variable := 0]
#shuffle
df <- df[order(rowid, variable)]
# reset index to 1..N instead of 0..N
df[, variable := 1:.N, by = .(rowid)]
# back to table format
df <- dcast(df, rowid ~ variable, value.var = list("var", "val"), sep = "")
# reorder
setcolorder(df,order(as.numeric(gsub("\\D+","",colnames(df)))))
df
var1 val1 var2 val2 var3 val3 var4 val4 rowid
1: A 0.89 B 0.87 C 0.66 D 0.44 1
2: A 0.95 C 0.99 B 0.55 D 0.33 2
3: A 0.43 C 0.67 D 0.55 B 0.45 3
4: B 0.88 A 0.84 C 0.81 D 0.77 4
5: A 0.77 D 0.92 B 0.88 C 0.69 5
答案 3 :(得分:0)
不确定这是否算是更优雅,但是:
for(i in 1:nrow(df)){
if(df$var1[i] == "C"){
# Holds val1 if var1 is "C"
oldval <- df$val1[i]
# Which column has the new value in it?
col <- which(df[i, ] == "A") + 1
# Replace the values
df[i, "var1"] <- "A"
df[i, "val1"] <- df[i, col]
df[i, (col - 1)] <- "C"
df[i, col] <- oldval
}
# To maintain original ordering
if(df$val3[i] > df$val2[i]){
# Hold the vars and values
vars <- df[i, paste0("var", 2:3)]
vals <- df[i, paste0("val", 2:3)]
# Replace the values
df[i, paste0("var", 2:3)] <- rev(vars)
df[i, paste0("val", 2:3)] <- rev(vals)
}
}