例如,我要在某个特定字段中搜索一段文字,比如 "abcdefgio",而数据库中该字段的值是 "abcdefghijk ...";因此我希望只要与搜索字符串的匹配度达到 70%–80% 就算命中。
请为我推荐一些算法或库。
答案 0 :(得分:2)
你可以使用levenshtein距离:
DELIMITER $$

-- levenshtein(s1, s2)
--
-- Returns the Levenshtein edit distance between s1 and s2: the minimum number
-- of single-character insertions, deletions and substitutions needed to turn
-- one string into the other. Lengths are measured in characters (CHAR_LENGTH).
--
-- Returns NULL when either argument is NULL.
--
-- Implementation: only two rows of the dynamic-programming matrix are kept.
-- Each row is packed into a binary string with a fixed width of TWO bytes per
-- cell (big-endian), so cell k (1-based) lives at bytes 2k-1 .. 2k. A single
-- byte per cell cannot represent indexes or distances above 255, which the
-- TEXT parameters allow; two bytes cover every length a TEXT value can have.
--
-- The DELIMITER lines are needed when this is run from the mysql client,
-- because the function body itself contains semicolons.
CREATE FUNCTION levenshtein( s1 text, s2 text )
RETURNS INT
DETERMINISTIC
BEGIN
    DECLARE s1_len, s2_len, i, j, c, c_temp, cost INT;
    DECLARE s1_char CHAR;
    -- cv1 = previous matrix row, cv0 = row currently being built
    DECLARE cv0, cv1 MEDIUMBLOB;

    -- A distance to an unknown value is unknown, not 0.
    IF s1 IS NULL OR s2 IS NULL THEN
        RETURN NULL;
    END IF;

    SET s1_len = CHAR_LENGTH(s1), s2_len = CHAR_LENGTH(s2), j = 1, i = 1, c = 0;

    -- Trivial cases that need no matrix.
    IF s1 = s2 THEN
        RETURN 0;
    ELSEIF s1_len = 0 THEN
        RETURN s2_len;
    ELSEIF s2_len = 0 THEN
        RETURN s1_len;
    END IF;

    -- Row 0 of the matrix: 0, 1, 2, ..., s2_len.
    SET cv1 = UNHEX('0000');
    WHILE j <= s2_len DO
        SET cv1 = CONCAT(cv1, UNHEX(LPAD(HEX(j), 4, '0'))), j = j + 1;
    END WHILE;

    WHILE i <= s1_len DO
        -- Column 0 of row i is i; c tracks the cell to the left.
        SET s1_char = SUBSTRING(s1, i, 1), c = i, cv0 = UNHEX(LPAD(HEX(i), 4, '0')), j = 1;

        WHILE j <= s2_len DO
            -- Candidate 1: insertion (cell to the left + 1).
            SET c = c + 1;

            IF s1_char = SUBSTRING(s2, j, 1) THEN
                SET cost = 0;
            ELSE
                SET cost = 1;
            END IF;

            -- Candidate 2: substitution (diagonal cell = previous row, column j-1).
            SET c_temp = CONV(HEX(SUBSTRING(cv1, 2 * j - 1, 2)), 16, 10) + cost;
            IF c > c_temp THEN
                SET c = c_temp;
            END IF;

            -- Candidate 3: deletion (cell above = previous row, column j).
            SET c_temp = CONV(HEX(SUBSTRING(cv1, 2 * j + 1, 2)), 16, 10) + 1;
            IF c > c_temp THEN
                SET c = c_temp;
            END IF;

            SET cv0 = CONCAT(cv0, UNHEX(LPAD(HEX(c), 4, '0'))), j = j + 1;
        END WHILE;

        SET cv1 = cv0, i = i + 1;
    END WHILE;

    -- c holds the last cell of the last row, which is the distance.
    RETURN c;
END $$

DELIMITER ;
DELIMITER $$

-- levenshtein_ratio(s1, s2)
--
-- Returns the similarity of s1 and s2 as a whole-number percentage:
--   ROUND((1 - distance / longer_length) * 100)
-- where distance comes from LEVENSHTEIN(s1, s2).
--
-- Lengths are taken with CHAR_LENGTH (characters), the same unit the distance
-- is measured in; LENGTH counts bytes and would skew the ratio for multi-byte
-- text.
--
-- Returns NULL when either argument is NULL, and 100 when both strings are
-- empty (identical, and it avoids dividing by zero).
--
-- The DELIMITER lines are needed when this is run from the mysql client,
-- because the function body itself contains semicolons.
CREATE FUNCTION levenshtein_ratio( s1 text, s2 text )
RETURNS INT
DETERMINISTIC
BEGIN
    DECLARE max_len INT;

    IF s1 IS NULL OR s2 IS NULL THEN
        RETURN NULL;
    END IF;

    SET max_len = GREATEST(CHAR_LENGTH(s1), CHAR_LENGTH(s2));

    -- Two empty strings are a perfect match.
    IF max_len = 0 THEN
        RETURN 100;
    END IF;

    RETURN ROUND((1 - LEVENSHTEIN(s1, s2) / max_len) * 100);
END $$

DELIMITER ;
使用:
-- Rows of table1 whose txt1 and txt2 columns are more than 80% similar.
SELECT t1.*
FROM table1 AS t1
WHERE levenshtein_ratio(t1.txt1, t1.txt2) > 80;
另一种方法是:先在 MySQL 中用 LIKE 进行搜索,再对搜索结果使用 levenshtein 函数。
答案 1 :(得分:1)
这个回答可能有些晚了,但或许仍然有用。处理方法有两种:1. 使用 @MisterX 提到的 MySQL Levenshtein 函数;2. 使用 PHP 的 similar_text 函数。
方法如下:
1。为MySQL使用Levenshtein函数
第1步。
在终端的 mysql 提示符下,或在 phpMyAdmin 的 SQL 界面中,使用以下代码创建 LEVENSHTEIN 函数(只需复制、粘贴,然后点击界面上的 GO 按钮)
DELIMITER $$

DROP FUNCTION IF EXISTS LEVENSHTEIN $$

-- LEVENSHTEIN(s1, s2)
--
-- Returns the Levenshtein edit distance between s1 and s2: the minimum number
-- of single-character insertions, deletions and substitutions needed to turn
-- one string into the other. Lengths are measured in characters.
--
-- Returns NULL when either argument is NULL.
--
-- Both parameters are VARCHAR(255), so every index and distance fits in one
-- byte. The two matrix rows kept in memory are therefore packed one byte per
-- cell: at most 256 cells (columns 0..255), which is why VARBINARY(256) is
-- enough.
CREATE FUNCTION LEVENSHTEIN(s1 VARCHAR(255) CHARACTER SET utf8, s2 VARCHAR(255) CHARACTER SET utf8)
RETURNS INT
DETERMINISTIC
BEGIN
    DECLARE s1_len, s2_len, i, j, c, c_temp, cost INT;
    DECLARE s1_char CHAR CHARACTER SET utf8;
    -- cv1 = previous matrix row, cv0 = row currently being built
    DECLARE cv0, cv1 VARBINARY(256);

    -- A distance to an unknown value is unknown, not 0.
    IF s1 IS NULL OR s2 IS NULL THEN
        RETURN NULL;
    END IF;

    SET s1_len = CHAR_LENGTH(s1),
        s2_len = CHAR_LENGTH(s2),
        cv1 = 0x00,
        j = 1,
        i = 1,
        c = 0;

    -- Trivial cases that need no matrix.
    IF (s1 = s2) THEN
        RETURN (0);
    ELSEIF (s1_len = 0) THEN
        RETURN (s2_len);
    ELSEIF (s2_len = 0) THEN
        RETURN (s1_len);
    END IF;

    -- Row 0 of the matrix: 0, 1, 2, ..., s2_len (cv1 already holds the 0).
    WHILE (j <= s2_len) DO
        SET cv1 = CONCAT(cv1, CHAR(j)),
            j = j + 1;
    END WHILE;

    WHILE (i <= s1_len) DO
        -- Column 0 of row i is i; c tracks the cell to the left.
        SET s1_char = SUBSTRING(s1, i, 1),
            c = i,
            cv0 = CHAR(i),
            j = 1;

        WHILE (j <= s2_len) DO
            -- Candidate 1: insertion (cell to the left + 1).
            SET c = c + 1,
                cost = CASE WHEN s1_char = SUBSTRING(s2, j, 1) THEN 0 ELSE 1 END;

            -- Candidate 2: substitution (diagonal cell = previous row, column j-1).
            SET c_temp = ORD(SUBSTRING(cv1, j, 1)) + cost;
            IF (c > c_temp) THEN
                SET c = c_temp;
            END IF;

            -- Candidate 3: deletion (cell above = previous row, column j).
            SET c_temp = ORD(SUBSTRING(cv1, j + 1, 1)) + 1;
            IF (c > c_temp) THEN
                SET c = c_temp;
            END IF;

            SET cv0 = CONCAT(cv0, CHAR(c)),
                j = j + 1;
        END WHILE;

        SET cv1 = cv0,
            i = i + 1;
    END WHILE;

    -- c holds the last cell of the last row, which is the distance.
    RETURN (c);
END $$

DELIMITER ;
第2步 使用以下代码创建levenshtein_ratio函数
DELIMITER $$

-- Dropped first so the script can be re-run, matching the LEVENSHTEIN step.
DROP FUNCTION IF EXISTS levenshtein_ratio $$

-- levenshtein_ratio(s1, s2)
--
-- Returns the similarity of s1 and s2 as a whole-number percentage:
--   ROUND((1 - distance / longer_length) * 100)
-- where distance comes from LEVENSHTEIN(s1, s2).
--
-- Lengths are taken with CHAR_LENGTH (characters), the same unit the distance
-- is measured in; LENGTH counts bytes and would skew the ratio for multi-byte
-- text.
--
-- Returns NULL when either argument is NULL, and 100 when both strings are
-- empty (identical, and it avoids dividing by zero).
--
-- NOTE(review): the parameters are TEXT, but the result is only meaningful
-- for inputs within whatever length the installed LEVENSHTEIN accepts.
CREATE FUNCTION levenshtein_ratio( s1 text, s2 text )
RETURNS INT
DETERMINISTIC
BEGIN
    DECLARE max_len INT;

    IF s1 IS NULL OR s2 IS NULL THEN
        RETURN NULL;
    END IF;

    SET max_len = GREATEST(CHAR_LENGTH(s1), CHAR_LENGTH(s2));

    -- Two empty strings are a perfect match.
    IF max_len = 0 THEN
        RETURN 100;
    END IF;

    RETURN ROUND((1 - LEVENSHTEIN(s1, s2) / max_len) * 100);
END $$

DELIMITER ;
第3步:像这样在您的 PHP 代码中使用该函数:
// Fetch rows whose table_column is 70-80% similar to the search term.
// The term comes straight from the request, so it is bound as a statement
// parameter instead of being pasted into the SQL text (which was both an
// injection hole and, being unquoted, a syntax error for ordinary words).
$searchString = (isset($_GET["search"]) && is_string($_GET["search"])) ? $_GET["search"] : "";

// BETWEEN is inclusive on both ends and evaluates levenshtein_ratio once per
// row rather than twice.
$sql = "select * from table_name
where levenshtein_ratio(?, table_column) between 70 and 80";

$stmt = mysqli_prepare($conn, $sql);
mysqli_stmt_bind_param($stmt, "s", $searchString);
mysqli_stmt_execute($stmt);

// mysqli_stmt_get_result() needs the mysqlnd driver.
$result = mysqli_stmt_get_result($stmt);
其中 $conn 是您的数据库连接(由 mysqli_connect 返回的连接对象)。
完成
2。使用 PHP 的 similar_text 函数:先像下面这样查询数据库,再在 PHP 中对结果计算相似度:
// Collect the values of table_column that are 70-80% similar to the search
// term according to PHP's similar_text().
$searchString = (isset($_GET["search"]) && is_string($_GET["search"])) ? $_GET["search"] : "";

// Escape the LIKE wildcards (and the escape character itself) so the user's
// text is matched literally, then wrap it in % for a "contains" search.
$likePattern = "%" . addcslashes($searchString, "\\%_") . "%";

// NOTE(review): this prefilter only returns rows that contain the whole
// search term, so near-matches that are not substrings never reach
// similar_text() - confirm that is intended.
$sql = "select table_column from table_name where table_column like ?";

// The pattern is bound as a parameter, never concatenated into the SQL.
$stmt = mysqli_prepare($conn, $sql);
mysqli_stmt_bind_param($stmt, "s", $likePattern);
mysqli_stmt_execute($stmt);

// mysqli_stmt_get_result() needs the mysqlnd driver.
$result = mysqli_stmt_get_result($stmt);

$between70and80 = array();
while ($row = mysqli_fetch_assoc($result))
{
    $string1 = $row["table_column"];

    // Compare case-insensitively; similar_text() writes the percentage
    // into $perc.
    similar_text(strtoupper($searchString), strtoupper($string1), $perc);

    // Keep matches inside the inclusive 70-80% band.
    if ($perc >= 70 && $perc <= 80)
    {
        $between70and80[] = $string1;
    }
}
// Now do what you want with $between70and80.
请注意我在调用 similar_text 时使用了 strtoupper:两个字符串必须大小写一致,因为大小写差异会影响匹配百分比。
希望这对某人有帮助。