数组(3, 5, 1, 3, 5, 48, 4, 7, 13, 55, 65, 4, 7, 13, 32)
频繁的数字序列为(3, 5) f=2 + (4, 7, 13) f=2
任何算法或伪代码来找到它?
更新(1):
如果(7, 13)
也会出现,则会通过更新其频率将其包含在最长的
(4, 7, 13) f=3
等等......
更新(2):
如果(1,2,3,4,1,2,3,4,1,2,7,8,7,8,3,4,3,4,1,2)
,则输出应为(1,2,3,4)
& (3,4,1,2)
&安培; (7,8)
,要清楚地将每个数字视为一个单词,并且您希望找到最常用的短语
因此在很多短语中看到相同的单词是常见的,但是如果任何短语是任何其他短语的字符串
短语不应被视为短语,而是会更新每个短语的频率,包括它
答案 0 :(得分:7)
** 编辑 **:稍微好一点的实现,现在也会返回频率并有更好的序列过滤器。
function getFrequences($input, $minimalSequenceSize = 2) {
$sequences = array();
$frequences = array();
$len = count($input);
for ($i=0; $i<$len; $i++) {
$offset = $i;
for ($j=$i+$minimalSequenceSize; $j<$len; $j++) {
if ($input[$offset] == $input[$j]) {
$sequenceSize = 1;
$sequence = array($input[$offset]);
while (($offset + $sequenceSize < $j)
&& ($input[$offset+$sequenceSize] == $input[$j+$sequenceSize])) {
if (false !== ($seqIndex = array_search($sequence, $frequences))) {
// we already have this sequence, since we found a bigger one, remove the old one
array_splice($sequences, $seqIndex, 1);
array_splice($frequences, $seqIndex, 1);
}
$sequence[] = $input[$offset+$sequenceSize];
$sequenceSize++;
}
if ($sequenceSize >= $minimalSequenceSize) {
if (false !== ($seqIndex = array_search($sequence, $sequences))) {
$frequences[$seqIndex]++;
} else {
$sequences[] = $sequence;
$frequences[] = 2; // we have two occurances already
}
// $i += $sequenceSize; // move $i so we don't reuse the same sub-sequence
break;
}
}
}
}
// remove sequences that are sub-sequence of another frequence
// ** comment this to keep all sequences regardless **
$len = count($sequences);
for ($i=0; $i<$len; $i++) {
$freq_i = $sequences[$i];
for ($j=$i+1; $j<$len; $j++) {
$freq_j = $sequences[$j];
$freq_inter = array_intersect($freq_i, $freq_j);
if (count($freq_inter) != 0) {
$len--;
if (count($freq_i) > count($freq_j)) {
array_splice($sequences, $j, 1);
array_splice($frequences, $j, 1);
$j--;
} else {
array_splice($sequences, $i, 1);
array_splice($frequences, $i, 1);
$i--;
break;
}
}
}
}
return array($sequences, $frequences);
};
测试用例
header('Content-type: text/plain');
$input = array(3, 5, 1, 3, 5, 48, 4, 7, 13, 55, 3, 5, 65, 4, 7, 13, 32, 5, 48, 4, 7, 13);
list($sequences, $frequences) = getFrequences($input);
foreach ($sequences as $i => $s) {
echo "(" . implode(',', $s) . ') f=' . $frequences[$i] . "\n";
}
** 编辑 **:这是对该功能的更新。它几乎完全重写了......告诉我这是不是你要找的东西。我还添加了冗余检查,以防止计算相同的序列或子序列两次。
function getFrequences2($input, $minSequenceSize = 2) {
$sequences = array();
$last_offset = 0;
$last_offset_len = 0;
$len = count($input);
for ($i=0; $i<$len; $i++) {
for ($j=$i+$minSequenceSize; $j<$len; $j++) {
if ($input[$i] == $input[$j]) {
$offset = 1;
$sub = array($input[$i]);
while ($i + $offset < $j && $j + $offset < $len) {
if ($input[$i + $offset] == $input[$j + $offset]) {
array_push($sub, $input[$i + $offset]);
} else {
break;
}
$offset++;
}
$sub_len = count($sub);
if ($sub_len >= $minSequenceSize) {
// $sub must contain more elements than the last sequence found
// otherwise we will count the same sequence twice
if ($last_offset + $last_offset_len >= $i + $sub_len) {
// we already saw this sequence... ignore
continue;
} else {
// save offset and sub_len for future check
$last_offset = $i;
$last_offset_len = $sub_len;
}
foreach ($sequences as & $sequence) {
$sequence_len = count($sequence['values']);
if ($sequence_len == $sub_len && $sequence['values'] == $sub) {
//echo "Found add-full ".var_export($sub, true)." at $i and $j...\n";
$sequence['frequence']++;
break 2;
} else {
if ($sequence_len > $sub_len) {
$end = $sequence_len - $sub_len;
$values = $sequence['values'];
$slice_len = $sub_len;
$test = $sub;
} else {
$end = $sub_len - $sequence_len;
$values = $sub;
$slice_len = $sequence_len;
$test = $sequence['values'];
}
for ($k=0; $k<=$end; $k++) {
if (array_slice($values, $k, $slice_len) == $test) {
//echo "Found add-part ".implode(',',$sub)." which is part of ".implode(',',$values)." at $i and $j...\n";
$sequence['values'] = $values;
$sequence['frequence']++;
break 3;
}
}
}
}
//echo "Found new ".implode(',',$sub)." at $i and $j...\n";
array_push($sequences, array('values' => $sub, 'frequence' => 2));
break;
}
}
}
}
return $sequences;
};
答案 1 :(得分:1)
在Python3中
>>> from collections import Counter
>>> count_hash=Counter()
>>> T=(3, 5, 1, 3, 5, 48, 4, 7, 13, 55, 65, 4, 7, 13, 32)
>>> for i in range(2,len(T)+1):
... for j in range(len(T)+1-i):
... count_hash[T[j:j+i]]+=1
...
>>> for k,v in count_hash.items():
... if v >= 2:
... print(k,v)
...
(3, 5) 2
(4, 7, 13) 2
(7, 13) 2
(4, 7) 2
你需要过滤掉(7,13)和(4,7)吗?如果序列中还有(99,7,14)会发生什么?
Counter
就像一个哈希,用于跟踪我们看到每个子字符串的次数
两个嵌套的for循环生成T
的所有子字符串,使用count_hash
累加每个子字符串的计数。
最后的for循环过滤所有那些只发生一次的子串
这是一个带过滤器的版本
from collections import Counter
def substrings(t, minlen=2):
tlen = len(t)
return (t[j:j+i] for i in range(minlen, tlen+1) for j in range(tlen+1-i))
def get_freq(*t):
counter = Counter(substrings(t))
for k in sorted(counter, key=len):
v=counter[k]
if v < 2:
del counter[k]
continue
for t in substrings(k):
if t in counter:
if t==k:
continue
counter[k]+=counter[t]-v
del counter[t]
return counter
print(get_freq(3, 5, 1, 3, 5, 48, 4, 7, 13, 55, 65, 4, 7, 13, 32, 4, 7))
print(get_freq(1,2,3,4,1,2,3,4,1,2,7,8,7,8,3,4,3,4,1,2))
输出
Counter({(4, 7, 13): 3, (3, 5): 2})
Counter({(1, 2, 3, 4, 1, 2): 8, (7, 8): 2}) # Is this the right answer?
这就是为什么我问过滤如何对我在评论中给出的序列起作用的原因
答案 2 :(得分:0)
好的,只是开始讨论。