答案 0 :(得分:15)
答案 1 :(得分:0)
答案 2 :(得分:0)
答案 3 :(得分:0)
如果您的语言不是英语,MySQL 全文搜索在这种用途(模糊匹配)上的效果很差。
三元组(trigram)搜索在这类任务上能给出非常好的结果。
PostgreSQL 提供了三元组索引(pg_trgm 扩展),使用起来很方便 :)
但如果您必须在 MySQL 中实现,可以试试下面这个在 Bill Karwin 的答案基础上改进的版本:
- 每个三元组只存储一次
- 用一个简单的 PHP 类来写入和查询这些数据
<?php
/*
# mysql table structure
CREATE TABLE `trigram2content` (
`trigram_id` int NOT NULL REFERENCES trigrams(id),
`content_type_id` int(11) NOT NULL,
`record_id` int(11) NOT NULL,
PRIMARY KEY (`content_type_id`,`trigram_id`,`record_id`)
);
#each trigram is stored only once
CREATE TABLE `trigrams` (
`id` int not null auto_increment,
`token` varchar(3) NOT NULL,
PRIMARY KEY (id),
UNIQUE token(token)
) DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
SELECT count(*), record_id FROM trigrams t
inner join trigram2content c ON t.id=c.trigram_id
WHERE (
t.token IN ('loc','ock','ck ','blo',' bl', ' bu', 'bur', 'urn')
AND c.content_type_id = 0
)
GROUP by record_id
ORDER BY count(*) DESC
limit 20;
*/
/**
 * Trigram-based fuzzy search index stored in MySQL.
 *
 * Every distinct trigram is stored once in the `trigrams` table; the
 * `trigram2content` table links trigram ids to (content type, record id)
 * pairs. Searching splits the query into trigrams and ranks records by the
 * share of query trigrams they contain.
 *
 * All database access goes through mysqli (the legacy mysql_* extension was
 * removed in PHP 7.0). Failed queries raise RuntimeException instead of being
 * silently ignored.
 */
class trigram
{
    /** @var mysqli Open connection used by every query of this instance. */
    private $dbLink;

    /**
     * Known content types as array(type id, type name) pairs.
     *
     * @var array
     */
    public $types = array(
        array(0, 'name'),
        array(1, 'city'));

    /**
     * Opens the database connection and switches it to UTF-8.
     *
     * The defaults reproduce the values that used to be hard-coded, so
     * `new trigram()` keeps working unchanged.
     *
     * @param string $host     MySQL host
     * @param string $user     MySQL user name
     * @param string $password MySQL password
     * @param string $database schema holding the trigram tables
     *
     * @throws RuntimeException when the connection or charset setup fails
     */
    public function __construct($host = "localhost", $user = "username", $password = "password", $database = "dbname")
    {
        $link = mysqli_connect($host, $user, $password, $database);
        if (!$link) {
            throw new RuntimeException('Cannot connect to MySQL: ' . mysqli_connect_error());
        }
        // mysqli_set_charset (unlike a plain "SET NAMES") also tells the client
        // library the charset, which mysqli_real_escape_string relies on.
        if (!mysqli_set_charset($link, 'utf8')) {
            throw new RuntimeException('Cannot switch connection to utf8: ' . mysqli_error($link));
        }
        $this->dbLink = $link;
    }

    /**
     * Looks up the numeric id of a content type.
     *
     * @param string $type_name type name as listed in $types
     *
     * @return int|string the type id, or "" when the name is unknown
     */
    public function get_type_value($type_name)
    {
        $wanted = (string)$type_name;
        foreach ($this->types as $entry) {
            if ($entry[1] === $wanted) {
                return $entry[0];
            }
        }
        return "";
    }

    /**
     * Splits a string into overlapping n-grams, SQL-escaped.
     *
     * Kept for backward compatibility: the returned values are escaped for
     * direct use inside single-quoted SQL literals. The class itself works
     * with the raw values from splitNgrams() and escapes only when building
     * SQL, so tokens containing quotes or backslashes still compare equal to
     * what the database returns.
     *
     * @param string $word text to split (UTF-8)
     * @param int    $n    n-gram length in characters
     *
     * @return string[] escaped n-grams in order of appearance (duplicates kept)
     */
    public function getNgrams($word, $n = 3)
    {
        $escaped = array();
        foreach ($this->splitNgrams($word, $n) as $ngram) {
            $escaped[] = mysqli_real_escape_string($this->dbLink, $ngram);
        }
        return $escaped;
    }

    /**
     * Splits a UTF-8 string into overlapping n-grams without any escaping.
     *
     * @param string $word text to split
     * @param int    $n    n-gram length in characters
     *
     * @return string[] n-grams in order of appearance; empty when the text is
     *                  shorter than $n characters or $n is less than 1
     */
    private function splitNgrams($word, $n = 3)
    {
        $word = (string)$word;
        $n = (int)$n;
        $ngrams = array();
        if ($n < 1) {
            return $ngrams;
        }
        $lastStart = mb_strlen($word, 'UTF-8') - $n;
        for ($i = 0; $i <= $lastStart; $i++) {
            $ngrams[] = mb_substr($word, $i, $n, 'UTF-8');
        }
        return $ngrams;
    }

    /**
     * Runs a query and turns a failure into an exception.
     *
     * @param string $sql complete SQL statement
     *
     * @return mysqli_result|bool result set for SELECT, true for write statements
     *
     * @throws RuntimeException when MySQL rejects the statement
     */
    private function runQuery($sql)
    {
        $result = mysqli_query($this->dbLink, $sql);
        if ($result === false) {
            throw new RuntimeException('MySQL query failed: ' . mysqli_error($this->dbLink));
        }
        return $result;
    }

    /**
     * Builds a comma-separated list of quoted, escaped SQL string literals.
     *
     * @param string[] $values raw values
     *
     * @return string e.g. 'abc', 'de\'', 'fgh'
     */
    private function quoteList(array $values)
    {
        $quoted = array();
        foreach ($values as $value) {
            $quoted[] = "'" . mysqli_real_escape_string($this->dbLink, (string)$value) . "'";
        }
        return implode(', ', $quoted);
    }

    /**
     * Resolves a content type name to its id, rejecting unknown names.
     *
     * @param string $type content type name
     *
     * @return int
     *
     * @throws InvalidArgumentException when the name is not listed in $types
     */
    private function requireTypeId($type)
    {
        $typeId = $this->get_type_value($type);
        if ($typeId === "") {
            // An empty id would otherwise end up in the SQL text and break the statement.
            throw new InvalidArgumentException('Unknown content type: ' . $type);
        }
        return (int)$typeId;
    }

    /**
     * Maps raw trigrams to their ids, creating rows for unseen trigrams.
     *
     * Example: array('hel', 'ell', 'llo', 'ell') => array(1, 2, 3, 2)
     *
     * @param string[] $t raw (unescaped) trigrams, duplicates allowed
     *
     * @return int[] one id per input item, in input order
     */
    private function getTrigramIds($t)
    {
        $distinct = array_values(array_unique($t));
        if (count($distinct) === 0) {
            return array();
        }

        // Trigrams that already exist in the table.
        $ids = array();
        $found = $this->runQuery(
            "SELECT id, token FROM trigrams WHERE token IN (" . $this->quoteList($distinct) . ")"
        );
        while ($row = mysqli_fetch_assoc($found)) {
            $ids[$row['token']] = (int)$row['id'];
        }
        mysqli_free_result($found);

        // Trigrams seen for the first time get a new row.
        foreach ($distinct as $token) {
            if (isset($ids[$token])) {
                continue;
            }
            $this->runQuery(
                "INSERT INTO trigrams (token) VALUES ('"
                . mysqli_real_escape_string($this->dbLink, $token) . "')"
            );
            $ids[$token] = (int)mysqli_insert_id($this->dbLink);
        }

        // As many ids as input items: a repeated trigram repeats its id.
        $result = array();
        foreach ($t as $token) {
            $result[] = $ids[$token];
        }
        return $result;
    }

    /**
     * Builds the VALUES tuples linking a record to its trigrams.
     *
     * Ids are de-duplicated because (content_type_id, trigram_id, record_id)
     * is the primary key of trigram2content: a text containing the same
     * trigram twice would otherwise make the whole INSERT fail.
     *
     * @param int[] $trigramIds trigram ids, duplicates allowed
     * @param int   $typeId     content type id
     * @param int   $recordId   record id
     *
     * @return string[] tuples such as "(5, 0, 42)"
     */
    private function buildLinkRows(array $trigramIds, $typeId, $recordId)
    {
        $rows = array();
        foreach (array_unique($trigramIds) as $trigramId) {
            $rows[] = sprintf('(%d, %d, %d)', $trigramId, $typeId, $recordId);
        }
        return $rows;
    }

    /**
     * Indexes the text of a record under the given content type.
     *
     * Text shorter than three characters yields no trigrams and is skipped.
     * To re-index an already indexed record use updateData().
     *
     * @param int    $id   record id
     * @param string $data text to index (UTF-8)
     * @param string $type content type name
     *
     * @throws InvalidArgumentException for an unknown content type
     * @throws RuntimeException         when a query fails
     */
    public function insertData($id, $data, $type)
    {
        $recordId = intval($id);
        $typeId = $this->requireTypeId($type);
        $trigramIds = $this->getTrigramIds($this->splitNgrams($data, 3));
        $rows = $this->buildLinkRows($trigramIds, $typeId, $recordId);
        if (count($rows) === 0) {
            return; // nothing to index; "INSERT ... VALUES" with no tuples is invalid SQL
        }
        $this->runQuery(
            "INSERT INTO trigram2content (trigram_id, content_type_id, record_id) VALUES "
            . implode(", ", $rows)
        );
    }

    /**
     * Replaces the indexed text of a record: removes its links, then re-inserts.
     *
     * @param int    $id   record id
     * @param string $data new text (UTF-8)
     * @param string $type content type name
     *
     * @throws InvalidArgumentException for an unknown content type
     * @throws RuntimeException         when a query fails
     */
    public function updateData($id, $data, $type)
    {
        // Validate the type before deleting anything.
        $typeId = $this->requireTypeId($type);
        $this->runQuery(
            "DELETE FROM trigram2content WHERE record_id=" . intval($id)
            . " AND content_type_id=" . $typeId
        );
        $this->insertData($id, $data, $type);
    }

    /**
     * Finds the records whose indexed text best matches a query string.
     *
     * score = (distinct query trigrams found in the record) / (distinct query
     * trigrams). Query trigrams are de-duplicated because a record holds at
     * most one link per trigram, so counting repeats in the denominator would
     * make a perfect match score below 1.
     *
     * @param string $str      query text (UTF-8)
     * @param string $type     content type name
     * @param float  $minScore minimum score a record needs to be returned
     * @param int    $limit    maximum number of rows returned
     *
     * @return array[] rows with keys "count(*)", "score" and "record_id",
     *                 best match first; empty when the query has no trigrams
     *
     * @throws InvalidArgumentException for an unknown content type
     * @throws RuntimeException         when the query fails
     */
    public function search($str, $type, $minScore = 0.6, $limit = 20)
    {
        $typeId = $this->requireTypeId($type);
        $tokens = array_values(array_unique($this->splitNgrams($str, 3)));
        $max = count($tokens);
        if ($max === 0) {
            return array(); // query shorter than 3 characters; also avoids dividing by zero
        }

        $sql = "SELECT count(*), count(*)/" . $max . " as score, record_id"
            . " FROM trigrams t inner join trigram2content c ON t.id=c.trigram_id"
            . " WHERE t.token IN (" . $this->quoteList($tokens) . ")"
            . " AND c.content_type_id = " . $typeId
            . " GROUP by record_id"
            // %F is locale-independent, so the decimal separator is always "."
            . " HAVING score >= " . sprintf('%F', (float)$minScore)
            . " ORDER BY count(*) DESC"
            . " limit " . max(1, (int)$limit);

        $found = $this->runQuery($sql);
        $result = array();
        while ($row = mysqli_fetch_assoc($found)) {
            $result[] = $row;
        }
        mysqli_free_result($found);
        return $result;
    }
}
用法如下:
// Index two sample records under the "name" content type, then run a fuzzy lookup.
$index = new trigram();
$index->insertData(1, "hello bello", "name");
$index->insertData(2, "hellllo Mammmma mia", "name");
$matches = $index->search("helo", "name");
print_r($matches);