<?php
/* Word-frequency table for a single text file: word => number of occurrences. */
$filename = "largefile.txt";

/* Read the whole file, then lowercase it so that counting is case-insensitive. */
$text = file_get_contents($filename);
$text = strtolower($text);

/* Every character outside a-z acts as a separator; empty pieces are dropped. */
$words = preg_split('/[^a-z]/', $text, -1, PREG_SPLIT_NO_EMPTY);

/*
 * Decide whether a word is kept. A word is rejected when it is one of the
 * listed stop words; the leading "." alternative also rejects every
 * one-character word.
 */
$keepWord = function ($word) {
    $isStopWord = preg_match("/^(.|a|an|and|the|this|at|in|or|of|is|for|to)$/", $word);
    return !$isStopWord;
};

/* Count how often each remaining word occurs. */
$wordFrequencyArray = array_count_values(array_filter($words, $keepWord));

/* Order from most to least frequent; arsort() keeps the words as keys. */
arsort($wordFrequencyArray);
这是我实现的代码,用于统计一个文件中各个不同单词的出现频率。它运行正常。
现在我想做的是:假设有 10 个文本文件,我想统计单词在全部 10 个文件中的总频率。例如,要查找单词 "stack" 的频率,就是统计 "stack" 在所有 10 个文件中出现的总次数。然后对所有不同的单词都执行同样的统计。
我已经为单个文件实现了这一功能,但不知道如何将它扩展到多个文件。谢谢你的帮助,抱歉我的英文不好。
答案 0 :(得分:2)
把你现有的代码放进一个函数里,然后使用 foreach
循环为数组中的每个文件名调用它:
<?php
/**
 * Add the word counts of one text file to a running tally.
 *
 * The file content is lowercased, split on every character outside a-z,
 * and stop words (as well as all one-character words) are discarded before
 * the remaining words are counted.
 *
 * A named function cannot carry a `use (...)` clause (that syntax exists
 * only for anonymous functions), so the running totals are passed in and
 * handed back explicitly instead of being captured from the outer scope.
 *
 * @param string $file               Path of the text file to read.
 * @param array  $wordFrequencyArray Totals accumulated so far (word => count).
 *
 * @return array The totals with this file's counts added. When the file
 *               cannot be read the totals are returned unchanged.
 */
function countWords($file, array $wordFrequencyArray = array()) {
    /* get content of $file in $content */
    $content = file_get_contents($file);
    if ($content === false) {
        /* file_get_contents() has already raised a warning; skip this file
           so that one unreadable file does not abort the whole tally. */
        return $wordFrequencyArray;
    }
    $content = strtolower($content);

    /* split $content into array of substrings of $content i.e wordwise */
    $wordArray = preg_split('/[^a-z]/', $content, -1, PREG_SPLIT_NO_EMPTY);

    /* "stop words", filter them (the "." alternative drops one-letter words) */
    $filteredArray = array_filter($wordArray, function ($x) {
        return !preg_match("/^(.|a|an|and|the|this|at|in|or|of|is|for|to)$/", $x);
    });

    /* merge this file's counts into the running totals */
    foreach (array_count_values($filteredArray) as $word => $count) {
        if (!isset($wordFrequencyArray[$word])) {
            $wordFrequencyArray[$word] = 0;
        }
        $wordFrequencyArray[$word] += $count;
    }

    return $wordFrequencyArray;
}

/* Files to analyse; append further file names to this list as needed. */
$filenames = array('file1.txt', 'file2.txt', 'file3.txt', 'file4.txt');

$wordFrequencyArray = array();
foreach ($filenames as $file) {
    $wordFrequencyArray = countWords($file, $wordFrequencyArray);
}

/* Sort from most to least frequent, keeping the words as keys. */
arsort($wordFrequencyArray);
print_r($wordFrequencyArray);