我想为R中的数字生成sequential citation numbers。如果数字是连续的,则数字应用连字符分隔。否则数字用逗号分隔。例如,数字1, 2, 3, 5, 6, 8, 9, 10, 11 and 13
应为1-3,5,6,8-11,13
。
这个问题已经previously answered for c#了,我编写了一个适用于R的函数,但是这个函数可以改进。我发布此问题作为其他可能有类似需求的参考。如果您发现R的类似问题(我没有),请投票结束,我将删除该问题。
下面的功能不是很优雅,但似乎可以胜任。 如何让功能更短更优雅?
x <- c(1,2,3,5,6,8,9,10,11,13)
library(zoo) ## the function requires zoo::na.approx function
##' @title Generate hyphenated sequential citation from an integer vector
##' @param x integer vector giving citation or page numbers
##' @importFrom zoo na.approx
seq.citation <- function(x) {
## Result if lenght of the integer vector is 1.
if(length(x) == 1) return(x) else {
## Sort
x <- sort(x)
## Difference
df <- diff(x)
## Index to determine start and end points
ind <- c("start", rep("no", length(df)-1), "end")
ind[which(df > 1)] <- "end"
## Temporary start point vector
sts <- which(ind == "end") + 1
ind[sts[sts < length(ind)]] <- "start"
## Replace the first index element
ind[1] <- "start"
## Replace the last index element, if preceding one is "end"
if(ind[length(ind)-1] == "end") ind[length(ind)] <- "start"
## Groups for comma separation using "start" as the determining value.
grp <- rep(NA, length(x))
grp[which(ind == "start")] <- 1:length(grp[which(ind == "start")])
grp <- zoo::na.approx(grp, method = "constant", rule = 2)
## Split sequences by group
seqs <- split(x, grp)
seqs <- lapply(seqs, function(k) {
if(length(k) == 1) k else {
if(length(k) == 2) paste(k[1], k[2], sep = ",") else {
paste(k[1], k[length(k)], sep = "-")
}}
})
## Result
return(do.call("paste", c(seqs, sep = ",")))
}
}
seq.citation(x)
# [1] "1-3,5,6,8-11,13"
答案 0 :(得分:6)
您可以使用tapply
,
paste(tapply(x, cumsum(c(1, diff(x) != 1)), function(i)
ifelse(length(i) > 2, paste0(head(i, 1), '-', tail(i, 1)),
paste(i, collapse = ','))), collapse = ',')
[1] "1-3,5,6,8-11,13"
答案 1 :(得分:4)
这适用于您的示例,应该相当通用。
# get run lengths of differences, with max value of 2
r <- rle(c(1, pmin(diff(x), 2)))
# paste selected x values with appropriate separator
res <- paste0(x[c(1, cumsum(r$lengths))], c("-", ",")[r$values], collapse="")
# drop final character, which is a separator
res <- substr(res, 1, nchar(res)-1)
返回
res
[1] "1-3,5-6,8-11,13"
答案 2 :(得分:1)
当然,还有来自&#34; R.utils&#34;的seqToHumanReadable
函数。封装
library(R.utils)
seqToHumanReadable(x)
# [1] "1-3, 5, 6, 8-11, 13"
seqToHumanReadable(x, tau = 1) ## If you want 5-6 and not 5, 6
# [1] "1-3, 5-6, 8-11, 13"
结果的外观也可以控制:
seqToHumanReadable(x, delimiter = "...", collapse = " | ")
# [1] "1...3 | 5 | 6 | 8...11 | 13"
答案 3 :(得分:1)
通过避免ifelse
调用并替换一些paste0
/ paste
调用,与Imo's answer相比,您可以节省一些计算时间:
paste0(
tapply(x, cumsum(c(1, diff(x) != 1)), function(i){
len <- length(i)
if(len == 1)
i else sprintf(if(len == 2) "%d,%d" else "%d-%d", i[1], i[len])
}), collapse = ",")
#R> [1] "1-3,5,6,8-11,13"
这更快,如下所示:
# check computation time
bench::mark(
new = paste0(
tapply(x, cumsum(c(1, diff(x) != 1)), function(i){
len <- length(i)
if(len == 1)
i else sprintf(if(len == 2) "%d,%d" else "%d-%d", i[1], i[len])
}), collapse = ","),
Imo = paste(tapply(x, cumsum(c(1, diff(x) != 1)), function(i)
ifelse(length(i) > 2, paste0(head(i, 1), '-', tail(i, 1)),
paste(i, collapse = ','))), collapse = ','),
min_time = 1)
#R> # A tibble: 2 x 13
#R> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
#R> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#R> 1 new 81.6µs 85.9µs 11228. 24.1KB 22.8 8848 18 788ms
#R> 2 Imo 116.7µs 127µs 7613. 15.8KB 22.4 6123 18 804ms
# same with longer vector
set.seed(1)
x <- sort(sample.int(1e6, 1e5))
bench::mark(
new = paste0(
tapply(x, cumsum(c(1, diff(x) != 1)), function(i){
len <- length(i)
if(len == 1)
i else sprintf(if(len == 2) "%d,%d" else "%d-%d", i[1], i[len])
}), collapse = ","),
Imo = paste(tapply(x, cumsum(c(1, diff(x) != 1)), function(i)
ifelse(length(i) > 2, paste0(head(i, 1), '-', tail(i, 1)),
paste(i, collapse = ','))), collapse = ','),
min_time = 1)
#R> # A tibble: 2 x 13
#R> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
#R> <bch:expr> <bch:t> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#R> 1 new 341ms 355ms 2.58 29.5MB 6.88 3 8 1.16s
#R> 2 Imo 625ms 658ms 1.52 29.2MB 15.2 2 20 1.31s