时间:2010-07-23 17:19:00

标签: sql mysql full-text-search query-optimization

4 个答案:

答案 0 :(得分:15)

答案 1 :(得分:0)

答案 2 :(得分:0)

答案 3 :(得分:0)

  1. 如果您的语言不是英语,MySQL 全文搜索(用于此目的)的质量会很差

  2. trigram search为此任务提供了非常好的结果

  3. postgreSQL有trigram index,它很容易使用:)

  4. 但如果您需要在mysql中执行此操作,请尝试使用此更新版本的Bill Karwin的答案:

    - 每个三元组只存储一次

    - 一个用于读写这些数据的简单 PHP 类

    <?php
    
      /*
    
        # mysql table structure
        CREATE TABLE `trigram2content` (
    `trigram_id` int NOT NULL REFERENCES trigrams(id),
    `content_type_id` int(11) NOT NULL,
    `record_id` int(11) NOT NULL,
    PRIMARY KEY (`content_type_id`,`trigram_id`,`record_id`)
    );
    
    #each trigram is stored only once
    CREATE TABLE `trigrams` (
    `id` int not null auto_increment,
    `token` varchar(3) NOT NULL,
    PRIMARY KEY (id),
    UNIQUE token(token)
    ) DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
    
    
    SELECT count(*), record_id FROM trigrams t
    inner join trigram2content c ON t.id=c.trigram_id
    WHERE (
    t.token IN ('loc','ock','ck ','blo',' bl', ' bu', 'bur', 'urn')
    AND c.content_type_id = 0
    )
    GROUP by record_id
    ORDER BY count(*) DESC
    limit 20;
    
    
    */
    /**
     * Trigram-based fuzzy search index stored in MySQL.
     *
     * Every distinct trigram is stored once in `trigrams`; `trigram2content`
     * links trigram ids to (content type, record id) pairs. A search splits the
     * query into trigrams and ranks records by the share of query trigrams
     * they contain.
     */
    class trigram
    {
    
        /** @var mysqli Open database connection used by every query. */
        private $dbLink;
    
        /** @var array List of array(content_type_id, type name) pairs. */
        var $types = array(
            array(0, 'name'),
            array(1, 'city'));
    
    
        /**
         * Opens the database connection and switches it to UTF-8.
         *
         * Declared as __construct() because a method named after the class is
         * no longer treated as a constructor since PHP 8, and built on mysqli
         * because the mysql_* extension was removed in PHP 7.
         *
         * @throws RuntimeException when the connection cannot be established
         */
        function __construct()
        {
            $link = mysqli_connect("localhost", "username", "password", "dbname");
            if (!$link) {
                throw new RuntimeException("trigram: cannot connect to database: " . mysqli_connect_error());
            }
    
            // Set the charset through the API rather than "SET NAMES" so that
            // mysqli_real_escape_string() escapes for the same charset.
            mysqli_set_charset($link, "utf8");
            $this->dbLink = $link;
        }
    
        /**
         * Maps a content type name to its numeric id.
         *
         * @param string $type_name one of the names listed in $types
         * @return int|string the numeric id, or "" when the name is unknown
         */
        function get_type_value($type_name){
            foreach ($this->types as $type) {
                if ($type[1] == $type_name) {
                    return $type[0];
                }
            }
            return "";
        }
    
        /**
         * Splits a string into overlapping n-grams, SQL-escaped for direct
         * interpolation into a quoted literal.
         *
         * @param string $word UTF-8 text to split
         * @param int    $n    n-gram length in characters
         * @return array escaped n-grams in order of occurrence (duplicates kept)
         */
        function getNgrams($word, $n = 3) {
            $escaped = array();
            foreach ($this->splitNgrams($word, $n) as $ngram) {
                $escaped[] = mysqli_real_escape_string($this->dbLink, $ngram);
            }
            return $escaped;
        }
    
        /**
         * Splits a string into overlapping n-grams without any escaping.
         *
         * @param string $word UTF-8 text to split
         * @param int    $n    n-gram length in characters
         * @return array raw n-grams; empty when $word has fewer than $n characters
         */
        private function splitNgrams($word, $n = 3) {
            $word = (string) $word;
            $ngrams = array();
            $lastStart = mb_strlen($word, 'utf-8') - $n;
            for ($i = 0; $i <= $lastStart; $i++) {
                $ngrams[] = mb_substr($word, $i, $n, 'utf-8');
            }
            return $ngrams;
        }
    
        /**
         * Resolves a type name to an integer id that is safe to interpolate.
         *
         * @throws InvalidArgumentException when the type name is not in $types
         */
        private function requireTypeId($type_name){
            $value = $this->get_type_value($type_name);
            // Strict check: id 0 is valid and must not be confused with "".
            if ($value === "") {
                throw new InvalidArgumentException("trigram: unknown content type '$type_name'");
            }
            return intval($value);
        }
    
        /** Builds a comma separated list of quoted, escaped SQL string literals. */
        private function quoteList($values){
            $quoted = array();
            foreach ($values as $value) {
                $quoted[] = "'" . mysqli_real_escape_string($this->dbLink, $value) . "'";
            }
            return implode(", ", $quoted);
        }
    
        /**
         * Runs one statement and fails loudly instead of silently continuing.
         *
         * @throws RuntimeException when the server rejects the statement
         */
        private function run($sql){
            $result = mysqli_query($this->dbLink, $sql);
            if ($result === false) {
                throw new RuntimeException("trigram: query failed: " . mysqli_error($this->dbLink));
            }
            return $result;
        }
    
        /**
         * Translates raw trigrams to their ids, creating rows for unseen ones.
         *
         * input:  array('hel', 'ell', 'llo', 'lo ', 'o B', ' Be', 'Bel', 'ell', 'llo', 'lo ', 'o  ')
         * output: array(1,     2,     3,     4,     5,     6,     7,     2,     3,     4,     8)
         *
         * @param array $t raw (unescaped) trigrams, duplicates allowed
         * @return array one id per input item, in input order
         */
        private function getTrigramIds($t){
            if (count($t) === 0) {
                return array();
            }
    
            $unique = array_unique($t);
            $found = $this->run("SELECT * FROM trigrams WHERE token IN (" . $this->quoteList($unique) . ")");
    
            // Keyed by the raw token as stored, so lookups below compare raw with
            // raw (comparing escaped input against stored values misses every
            // token containing a quote or backslash).
            $ids = array();
            while ($row = mysqli_fetch_assoc($found)) {
                $ids[$row['token']] = $row['id'];
            }
            mysqli_free_result($found);
    
            foreach ($unique as $token) {
                if (isset($ids[$token])) {
                    continue;
                }
                $this->run("INSERT INTO trigrams (token) VALUES(" . $this->quoteList(array($token)) . ")");
                $ids[$token] = mysqli_insert_id($this->dbLink);
            }
    
            $result = array();
            foreach ($t as $token) {
                $result[] = $ids[$token];
            }
            return $result;
        }
    
        /**
         * Indexes the trigrams of $data for a record that is not indexed yet.
         * Use updateData() to re-index an existing record.
         *
         * @param int    $id   record id
         * @param string $data text to index; shorter than 3 characters is a no-op
         * @param string $type content type name from $types
         * @throws InvalidArgumentException for an unknown type
         * @throws RuntimeException when a statement fails
         */
        function insertData($id, $data, $type){
            $typeId = $this->requireTypeId($type);
            $recordId = intval($id);
    
            // The primary key is (content_type_id, trigram_id, record_id), so a
            // trigram repeated inside one text must be written only once.
            $tokens = array_unique($this->splitNgrams($data));
            if (count($tokens) === 0) {
                return;
            }
    
            $rows = array();
            foreach ($this->getTrigramIds($tokens) as $tid) {
                $rows[] = "(" . intval($tid) . ", $typeId, $recordId)";
            }
            $this->run("INSERT INTO trigram2content (trigram_id, content_type_id, record_id) VALUES " . implode(", ", $rows));
        }
    
        /**
         * Replaces the indexed trigrams of a record with those of $data.
         *
         * @param int    $id   record id
         * @param string $data new text to index
         * @param string $type content type name from $types
         * @throws InvalidArgumentException for an unknown type
         * @throws RuntimeException when a statement fails
         */
        function updateData($id, $data, $type){
            $typeId = $this->requireTypeId($type);
            $this->run("DELETE FROM trigram2content WHERE record_id=" . intval($id) . " AND content_type_id=" . $typeId);
            $this->insertData($id, $data, $type);
        }
    
        /**
         * Finds the records whose indexed text best matches $str.
         *
         * @param string $str  search text; shorter than 3 characters matches nothing
         * @param string $type content type name from $types
         * @return array up to 20 rows with keys 'count(*)', 'score' and 'record_id',
         *               best match first, only rows with score >= 0.6
         * @throws InvalidArgumentException for an unknown type
         * @throws RuntimeException when the query fails
         */
        function search($str, $type){
            $typeId = $this->requireTypeId($type);
    
            // count(*) can reach at most the number of distinct query trigrams,
            // so that number (not the raw count) is the score denominator.
            $tri = array_unique($this->splitNgrams($str));
            $max = count($tri);
            if ($max === 0) {
                return array();
            }
    
            $q = "SELECT count(*), count(*)/$max as score, record_id FROM trigrams t inner join trigram2content c ON t.id=c.trigram_id
    WHERE (
    t.token IN (" . $this->quoteList($tri) . ")
    AND c.content_type_id = $typeId
    )
    GROUP by record_id
    HAVING score >= 0.6
    ORDER BY count(*) DESC
    limit 20;";
            $query = $this->run($q);
    
            $result = array();
            while ($row = mysqli_fetch_assoc($query)) {
                $result[] = $row;
            }
            mysqli_free_result($query);
            return $result;
        }
    
    
    }
    
  5. 用法示例:

     // Create the index helper; its trigram() constructor opens the database connection.
     $t = new trigram();
    
     // Index two texts under the "name" content type as records 1 and 2.
     $t->insertData(1, "hello bello", "name");
     $t->insertData(2, "hellllo Mammmma mia", "name");
    
      // Fuzzy lookup for a misspelled query. search() returns at most 20 rows
      // (count(*), score, record_id) with score >= 0.6, highest match count first.
      print_r($t->search("helo", "name"));