假设我有一个像这样的数据框
ITEM
1 X
2 A
3 B
4 C
5 A
6 F
7 U
8 A
9 B
10 C
11 F
12 U
如何获取最常见的连续行序列?在这种情况下,最常见的序列为 A, B, C,因为它出现在第 2 至 4 行和第 8 至 10 行中。
我已经尝试过函数 rle 以及在此处找到的一些解决方案,但都没有成功。能否给我一些建议、提示或 R 包(package)推荐?
答案 0 :(得分:1)
我猜您想要的是最长的不重叠重复子串(longest non-overlapping repeated substring)。关于它的动态规划解法,here 有一些很好的解释。
# Find the longest run of consecutive items that occurs at least twice in `x`
# without the two occurrences overlapping.

# Example input: the ITEM column as a character vector.
x <- c("X", "A", "B", "C", "A", "F", "U", "A", "B", "C", "F", "U")
n <- length(x)

# m1[i, j] is 1 when x[i] == x[j] and i < j, otherwise 0. Only the strict upper
# triangle is kept, so every pair of positions appears once and no position is
# matched with itself.
m1 <- outer(x, x, FUN = "==") + 0
dimnames(m1) <- list(x, x)
m1[!upper.tri(m1)] <- 0
m1
# X A B C A F U A B C F U
# X 0 0 0 0 0 0 0 0 0 0 0 0
# A 0 0 0 0 1 0 0 1 0 0 0 0
# B 0 0 0 0 0 0 0 0 1 0 0 0
# C 0 0 0 0 0 0 0 0 0 1 0 0
# A 0 0 0 0 0 0 0 1 0 0 0 0
# F 0 0 0 0 0 0 0 0 0 0 1 0
# U 0 0 0 0 0 0 0 0 0 0 0 1
# A 0 0 0 0 0 0 0 0 0 0 0 0
# B 0 0 0 0 0 0 0 0 0 0 0 0
# C 0 0 0 0 0 0 0 0 0 0 0 0
# F 0 0 0 0 0 0 0 0 0 0 0 0
# U 0 0 0 0 0 0 0 0 0 0 0 0

# Accumulate run lengths along the diagonals of the match matrix. After the
# loop, a non-zero m2[i, j] is the length of a matching run whose two
# occurrences end at positions i and j.
m2 <- m1
idx <- seq_len(n)[-1] # 2..n; empty (no iterations) when n < 2
for (i in idx) {
  for (j in idx) {
    # A run can only be extended when the previous diagonal cell and the
    # current cell are both matches.
    if (m1[i - 1, j - 1] == 1 && m1[i, j] == 1) {
      if (j - i > m2[i - 1, j - 1]) {
        # The gap between the two occurrences exceeds the current run length,
        # so extending by one item keeps them non-overlapping: move the
        # accumulated length forward and clear the previous cell.
        m2[i, j] <- m2[i - 1, j - 1] + m2[i, j]
        m2[i - 1, j - 1] <- 0
      } else {
        # Extending would make the two occurrences overlap; drop this cell.
        m2[i, j] <- 0
      }
    }
  }
}
m2
# X A B C A F U A B C F U
# X 0 0 0 0 0 0 0 0 0 0 0 0
# A 0 0 0 0 1 0 0 0 0 0 0 0
# B 0 0 0 0 0 0 0 0 0 0 0 0
# C 0 0 0 0 0 0 0 0 0 3 0 0
# A 0 0 0 0 0 0 0 1 0 0 0 0
# F 0 0 0 0 0 0 0 0 0 0 0 0
# U 0 0 0 0 0 0 0 0 0 0 0 2
# A 0 0 0 0 0 0 0 0 0 0 0 0
# B 0 0 0 0 0 0 0 0 0 0 0 0
# C 0 0 0 0 0 0 0 0 0 0 0 0
# F 0 0 0 0 0 0 0 0 0 0 0 0
# U 0 0 0 0 0 0 0 0 0 0 0 0

# Length of the longest run, and the end position (column index) of its later
# occurrence for every cell that reaches that length.
ans_len <- max(m2)
inds <- if (ans_len > 0) {
  which(m2 == ans_len, arr.ind = TRUE)[, "col"]
} else {
  # Nothing repeats: without this guard every cell would match 0 and the
  # slice below, (ind + 1):ind, would count down and return bogus pairs.
  integer(0)
}
# Slice each winning run back out of x.
lapply(inds, function(ind) x[(ind - ans_len + 1):ind])
# [[1]]
# [1] "A" "B" "C"
答案 1 :(得分:1)
一个混合使用 tidyverse 与嵌套 apply 系列函数的解决方案。该方案是通用的:它会报告出现频率最高、且至少出现两次的非平凡连续序列;若出现频率相同,则取较长的序列。
library(tidyverse)

# Data
x <- data.frame(
  ITEM = c("X", "A", "B", "C", "A", "F", "U", "A", "B", "C", "F", "U"),
  stringsAsFactors = FALSE
)

# Convert x to a vector
y <- x$ITEM

# For every window width a in 2..floor(n / 2), build a data frame whose rows
# are all consecutive windows of y with that width (column k holds y shifted by
# k - 1 positions). List elements are named "Consecutive2", "Consecutive3", ...
# NOTE: 2:floor(length(y) / 2) counts down when length(y) < 4, so this assumes
# at least four items.
l <- lapply(2:floor(length(y) / 2), function(a) {
  sapply(seq_len(a), function(k) y[k:(length(y) - a + k)])
}) %>%
  lapply(as.data.frame) %>%
  setNames(paste0("Consecutive", seq_along(.) + 1))

# Show the most frequent sequence(s) occurring more than once; ties in
# frequency go to the longest sequence.
lapply(seq_along(l), function(i) {
  # Count each distinct window of this width.
  as.data.frame(table(do.call(paste, l[[i]])), stringsAsFactors = FALSE) %>%
    # Sequence length in items (number of window columns). Counting characters
    # of the pasted string would mis-rank items of different widths.
    dplyr::mutate(length = ncol(l[[i]])) %>%
    dplyr::filter(Freq == max(Freq) & Freq > 1)
}) %>%
  # Drop widths for which no sequence repeats.
  .[which(sapply(., nrow) > 0)] %>%
  dplyr::bind_rows() %>%
  dplyr::filter(Freq == max(Freq)) %>%
  dplyr::filter(length == max(length)) %>%
  dplyr::rename(Sequence = Var1) %>%
  dplyr::select(-length)
# Sequence Freq
#1 A B C 2
答案 2 :(得分:0)
这是一种基于循环的替代方法。随着要搜索的向量变长,它的可扩展性仍然不是特别好,但比其他解决方案要快。
# Count every substring of the collapsed item string and report the most
# frequent one(s).
# NOTE(review): `df` is not defined in this snippet; presumably it is the
# question's data frame with an ITEM column — confirm before running.
x <- paste0(df$ITEM, collapse = "")
nc <- nchar(x)

# m[i, j] will hold the substring x[i..j] for every admissible (i, j) pair and
# stays "" everywhere else.
m <- matrix("", nc, nc)
min.p.length <- 2 # Minimum character length of patterns returned
for (i in seq_len(nc)) {
  for (j in seq_len(nc)) {
    # Keep spans that run forward, are at least min.p.length characters long
    # and whose end is at most nc / 2 positions past their start.
    if ((j >= i) && (j - i <= nc / 2) && (j - i >= min.p.length - 1)) {
      m[i, j] <- substring(x, i, j)
    }
  }
}

# Tabulate the non-empty cells. nzchar() tests for emptiness directly; comparing
# the strings with the number 0 would coerce it to "0" and depend on collation.
tab <- table(m[nzchar(m)])
tab[which(tab == max(tab))]
AB ABC BC FU
2 2 2 2