PHP中的段落比较

时间:2011-01-18 21:34:22

标签: php text

我想知道……假设我有一个从网上抓取文章的网页,抓取到的是文章的标题和纯文本。是否有 PHP 脚本或 Web 服务可以找出这些文章之间的关联?或者……是否有可以从段落中生成关键字的 PHP 脚本?

我已经测试过一个可以运行的 Java 脚本,但也许有现成的 PHP 类可以帮上忙……

谢谢!

1 个答案:

答案 0 :(得分:1)

“this answer”(原文链接指向的另一个答案)中的函数可用于从文本中提取单词并将它们相互比较(下面的代码调用了 extractWords 和 countKeywords,这两个函数并未在本页给出)。粗略的例子:

// Rough example: compare the vocabulary of two news articles.
// extractWords() and countKeywords() are NOT defined in this snippet; they must
// be loaded from elsewhere before this code runs.

// For better results grab the texts manually and paste them here.
$nyt = file_get_contents('http://www.nytimes.com/2011/01/19/technology/19apple.html?pagewanted=print');
$sfc = file_get_contents('http://www.sfgate.com/cgi-bin/article.cgi?f=/c/a/2011/01/19/BUAK1HARUL.DTL&type=business');

// file_get_contents() returns false on failure; stop here instead of comparing
// an empty document and dividing by zero further down.
if ($nyt === false || $sfc === false) {
    exit('Could not fetch one of the articles.' . PHP_EOL);
}

$nyt = strip_tags($nyt);
$sfc = strip_tags($sfc);

// The length ratios computed below divide by these values, so neither may be 0.
$nytLength = strlen($nyt);
$sfcLength = strlen($sfc);
if ($nytLength === 0 || $sfcLength === 0) {
    exit('One of the articles is empty after stripping tags.' . PHP_EOL);
}

// stopwords from english snowball porter stemmer
// The list is optional: without the file an empty stopword list is used.
$stopwordsFile = dirname(__FILE__).'/includes/stopwords_en.txt';
if (file_exists($stopwordsFile)) {
    $stopwords = file($stopwordsFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
} else {
    $stopwords = array();
}

// NOTE(review): the meaning of the numeric arguments (3 and 4) is defined by
// the external helpers -- presumably a minimum word length and a minimum
// occurrence count; confirm against their definitions.
$nytWords = extractWords($nyt, 3, $stopwords);
$sfcWords = extractWords($sfc, 3, $stopwords);

$nyt2sfcCount = countKeywords($nytWords, $sfcWords, 4);
$sfc2nytCount = countKeywords($sfcWords, $nytWords, 4);

// absolute
print_r($nyt2sfcCount);
print_r($sfc2nytCount);

// Length ratios used to scale one article's counts to the other's size.
$nyt2sfcFactor = $sfcLength / $nytLength;
$sfc2nytFactor = $nytLength / $sfcLength;

print($nyt2sfcFactor . PHP_EOL);
print($sfc2nytFactor . PHP_EOL);

// Initialise the result arrays so they exist even when a count list is empty.
$nyt2sfcCountRel = array();
foreach ($nyt2sfcCount as $word => $count) {
    $nyt2sfcCountRel[$word] = $count * $nyt2sfcFactor;
}

$sfc2nytCountRel = array();
foreach ($sfc2nytCount as $word => $count) {
    $sfc2nytCountRel[$word] = $count * $sfc2nytFactor;
}

// relative
// Printed in pairs: scaled NYT next to absolute SFC, then absolute NYT next to
// scaled SFC.
print_r($nyt2sfcCountRel);
print_r($sfc2nytCount);
print_r($nyt2sfcCount);
print_r($sfc2nytCountRel);

// reduce
// Keep only the words that appear as keys in both count lists.
$nyt2sfcCountRed = array_intersect_key($nyt2sfcCount, $sfc2nytCount);
$sfc2nytCountRed = array_intersect_key($sfc2nytCount, $nyt2sfcCount);

// reduced absolute
print_r($nyt2sfcCountRed);
print_r($sfc2nytCountRed);

$nyt2sfcCountRedRel = array();
foreach ($nyt2sfcCountRed as $word => $count) {
    $nyt2sfcCountRedRel[$word] = $count * $nyt2sfcFactor;
}

$sfc2nytCountRedRel = array();
foreach ($sfc2nytCountRed as $word => $count) {
    $sfc2nytCountRedRel[$word] = $count * $sfc2nytFactor;
}

// reduced relative
// Same pairing as above, restricted to the shared words.
print_r($nyt2sfcCountRedRel);
print_r($sfc2nytCountRed);
print_r($nyt2sfcCountRed);
print_r($sfc2nytCountRedRel);