假设我有一个包含8个字母的字符向量,每个字符出现两次:
x <- rep(LETTERS[1:8],2)
set.seed(1)
y <- sample(x)
y
# [1] "E" "F" "A" "D" "C" "B" "C" "G" "F" "A" "B" "G" "E" "H" "D" "H"
我想找到每对字母之间的间隔。这里,interval是指两个相同字母之间的字母数。我可以这样手动执行此操作:
abs(diff(which(y=="A")))-1 #6
abs(diff(which(y=="D")))-1 #10
abs(diff(which(y=="H")))-1 #1
我写了一个for
循环来做这个......
res<-NULL
for(i in 1:8){ res[[i]] <- abs(diff(which(y==LETTERS[i])))-1 }
names(res)<-LETTERS[1:8]
res
# A B C D E F G H
# 6 4 1 10 11 6 3 1
但是,我想在具有很长向量的随机化过程中使用这种方法。速度对此至关重要 - 我想知道是否有人有尽可能快速解决这个问题的好主意。
答案 0 :(得分:16)
您需要设置索引向量,然后执行diff(索引向量)逐组操作。
以下是if (operation.trim().toUpperCase().equals("MULTIPLICATION") || operation.trim().equals("*"))
包中的内容:
data.table
此处的索引向量是特殊(内置)变量require(data.table)
yDT <- data.table(y)
yDT[,diff(.I)-1,keyby=y]
# y V1
# 1: A 6
# 2: B 4
# 3: C 1
# 4: D 10
# 5: E 11
# 6: F 6
# 7: G 3
# 8: H 1
,用于存储行号。
.I
keyby=y
个小组,按字母顺序对结果进行排序;交替使用y
,我们会看到结果按照组的首次出现排序。 (谢谢@Arun,指出这一点。)
基础R中的类似解决方案看起来像
by=y
答案 1 :(得分:13)
使用data.table::chmatch
要快得多。
library(data.table)
f <- function(x){
ux <- unique(x)
out <- length(x) - chmatch(ux, rev(x)) - chmatch(ux, x)
setNames(out, ux)
}
f(y)
# E F A D C B G H
#11 6 6 10 1 4 3 1
它比cmpalex
快2倍。
set.seed(007); xx = sample(rep(make.unique(rep_len(LETTERS, 1e3)), each = 2))
microbenchmark::microbenchmark(cmpalex(xx), f(xx), unit="relative")
#Unit: relative
# expr min lq mean median uq max neval
# cmpalex(xx) 2.402806 2.366553 2.33802 2.359145 2.324677 2.232852 100
# f(xx) 1.000000 1.000000 1.00000 1.000000 1.000000 1.000000 100
R version 3.2.0 (2015-04-16)
Running under: Windows 8 x64 (build 9200)
other attached packages:
[1] data.table_1.9.5
答案 2 :(得分:12)
另一种选择:
alex = function(x)
{
ux = unique(x)
mux = match(x, ux)
ans = integer(length(ux))
for(i in seq_along(x)) ans[mux[i]] = i - ans[mux[i]]
return(setNames(ans - 1L, ux))
}
alex(y)
# E F A D C B G H
#11 6 6 10 1 4 3 1
与其他替代方案相比:
frank1 = function(x) tapply(1:length(x), x, diff) - 1
library(data.table)
frank2 = function(x) data.table(x)[, diff(.I) - 1, by = x]
jaehyeon = function(x) sapply(unique(x), function(X) abs(diff(which(x == X))) - 1)
library(data.table)
khashaa = function(x)
{
ux = unique(x)
setNames(length(x) - chmatch(ux, rev(x)) - chmatch(ux, x), ux)
}
khashaa_base = function(x)
{
ux = unique(x)
setNames(length(x) - match(ux, rev(x)) - match(ux, x), ux)
}
frank1(y)
# A B C D E F G H
# 6 4 1 10 11 6 3 1
frank2(y)
# x V1
#1: E 11
#2: F 6
#3: A 6
#4: D 10
#5: C 1
#6: B 4
#7: G 3
#8: H 1
jaehyeon(y)
# E F A D C B G H
#11 6 6 10 1 4 3 1
khashaa(y)
# E F A D C B G H
#11 6 6 10 1 4 3 1
khashaa_base(y)
# E F A D C B G H
#11 6 6 10 1 4 3 1
在基准测试中:
#compiled versions for all for consistency:
cmpalex = compiler::cmpfun(alex)
cmpfrank1 = compiler::cmpfun(frank1)
cmpfrank2 = compiler::cmpfun(frank2)
cmpjaehyeon = compiler::cmpfun(jaehyeon)
cmpkhashaa = compiler::cmpfun(khashaa)
cmpkhashaa_base = compiler::cmpfun(khashaa_base)
set.seed(007); xx = sample(rep(make.unique(rep_len(LETTERS, 1e3)), each = 2))
sort_by_names = function(x) x[order(names(x))]
sum(sort_by_names(alex(xx)) != frank1(xx))
#[1] 0
sum(alex(xx) != setNames(frank2(xx)[[2]], frank2(xx)[[1]]))
#[1] 0
sum(alex(xx) != jaehyeon(xx))
#[1] 0
sum(alex(xx) != khashaa(xx))
#[1] 0
sum(alex(xx) != khashaa_base(xx))
#[1] 0
microbenchmark::microbenchmark(alex(xx), cmpalex(xx),
frank1(xx), cmpfrank1(xx),
frank2(xx), cmpfrank2(xx),
jaehyeon(xx), cmpjaehyeon(xx),
khashaa(xx), cmpkhashaa(xx),
khashaa_base(xx), cmpkhashaa_base(xx), times = 20)
#Unit: microseconds
# expr min lq median uq max neval
# alex(xx) 3472.726 3620.1055 3764.005 4157.9445 5382.221 20
# cmpalex(xx) 1056.538 1074.6345 1115.177 1251.0720 2131.172 20
# frank1(xx) 19441.559 19858.8145 20356.808 21159.3035 27471.738 20
# cmpfrank1(xx) 19166.288 19566.4925 20572.222 21108.8430 22243.335 20
# frank2(xx) 12592.156 12931.6325 13337.057 14092.5725 24015.020 20
# cmpfrank2(xx) 12396.578 12861.3365 13376.904 14012.3575 14542.715 20
# jaehyeon(xx) 45313.525 46875.1900 47514.821 48728.3085 49513.578 20
# cmpjaehyeon(xx) 44899.401 46496.7365 47748.330 49561.9505 82592.347 20
# khashaa(xx) 189.314 204.1045 220.982 235.0760 259.959 20
# cmpkhashaa(xx) 190.010 201.3200 234.032 240.1225 389.415 20
# khashaa_base(xx) 295.802 315.1170 328.167 360.5320 1353.038 20
# cmpkhashaa_base(xx) 295.803 301.8930 317.901 332.8650 379.323 20
修改强> 包括/修复其他替代方案。字节代码编译仅改进了具有显式循环的函数;编译其他替代方案只是为了完整性。到目前为止,Khashaa的智能解决方案也是最快的。
答案 3 :(得分:3)
我会这样做。
sapply(unique(x), function(x) abs(diff(which(y==x)))-1)
A B C D E F G H
6 4 1 10 11 6 3 1