MySQL将拉丁字符扁平化为非拉丁字符

时间:2015-04-19 10:10:28

标签: mysql string-conversion latin

我正在寻找一个MySQL函数,把每个带变音符号的特殊拉丁字符(不要与latin1字符集混淆——我使用的是utf8)转换为标准A-Z字母范围内对应的不带变音符号的字母。

这就是我真正希望实现的目标:

'Zażółć gęślą jaźń' --> 'Zazolc gesla jazn'

我尝试了许多使用CONVERT和CAST函数的不同技巧,但到目前为止都没有成功。

我也做了非常深入的搜索,不幸的是,几乎每一个与字符转换相关的MySQL问题讨论的都是典型的latin1默认排序规则(default collation)问题,这不是我要找的。


我的目标是通过TRIGGER ... AFTER INSERT即时将产品名称转换为 uri friendly 字符串,例如

Crème Brûlée --> creme_brulee

这些转换后的名称不仅可以作为SEO URI,还可以作为缩略图文件名的引用。

除了LOWER(@str)和REPLACE(@str, ' ', '_')之外,其中一个步骤是展平这些拉丁字符。

我更倾向于将这些转换后的名称保存在数据库中,不过从架构的角度来看,您是否知道更合适的做法?

1 个答案:

答案 0 :(得分:2)

嗯,Sami Kuhmonen对我的问题的评论几乎耗尽了所有可能性。

但我必须找到一个可行的解决方案,因此我决定移植GitHub上PHP Doctrine的Doctrine_Inflector::unaccent方法。这个MySQL实现看起来并不优雅,但我想嵌套REPLACE()函数是在MySQL中执行多字符串替换的唯一方法。

这是整个TRIGGER

 # Trigger: before each row is inserted into `product`, derive NEW.name_uri
 # from NEW.name by
 #   1. replacing accented Latin letters (and the Euro / Pound signs) with
 #      plain ASCII text,
 #   2. replacing every space with an underscore,
 #   3. lowercasing the result.
 # A NULL name yields a NULL name_uri, because REPLACE() and LOWER() return
 # NULL for a NULL argument.
 #
 # MySQL has no multi-string replace, so the mapping is a chain of nested
 # REPLACE() calls. The innermost call is applied first. Every source
 # character appears exactly ONCE in the table below, so the order of the
 # pairs does not affect the result; keep it that way when adding entries
 # (a second pair for the same character can never match).
 CREATE DEFINER = CURRENT_USER TRIGGER `product_BEFORE_INSERT` BEFORE INSERT ON `product` FOR EACH ROW

 # new value
 SET NEW.name_uri = LOWER(

 # outermost REPLACE: applied last, turns spaces into underscores
 REPLACE(

 # 188 nested REPLACE( openers (11 rows of 16 + one row of 12):
 # exactly one per "CHAR(...), '...')" pair listed after NEW.name
 REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE(
 REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE(
 REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE(
 REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE(
 REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE(
 REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE(
 REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE(
 REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE(
 REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE(
 REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE(
 REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE(
 REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE(

 # the original value
 NEW.name,

 # every accented latin character, written as its UTF-8 byte sequence,
 # is replaced with its normalized ASCII spelling
 #
 # Latin-1 Supplement (58 pairs). German and Scandinavian letters get their
 # multi-letter transliteration directly in this table:
 #   195,132 (A-umlaut) -> Ae   195,164 (a-umlaut) -> ae
 #   195,150 (O-umlaut) -> Oe   195,182 (o-umlaut) -> oe
 #   195,156 (U-umlaut) -> Ue   195,188 (u-umlaut) -> ue
 #   195,159 (sharp s)  -> ss
 #   195,133 (A-ring)   -> Aa   195,165 (a-ring)   -> aa
 #   195,134 (AE)       -> Ae   195,166 (ae)       -> ae
 #   195,152 (O-stroke) -> O    195,184 (o-stroke) -> o
 # They are spelled as bytes like the rest of the table (not as string
 # literals) so each character has a single entry.
 CHAR(195,128), 'A'),  CHAR(195,129), 'A'), CHAR(195,130), 'A'),  CHAR(195,131), 'A'), CHAR(195,132), 'Ae'), CHAR(195,133), 'Aa'),
 CHAR(195,134), 'Ae'), CHAR(195,135), 'C'), CHAR(195,136), 'E'),  CHAR(195,137), 'E'), CHAR(195,138), 'E'),  CHAR(195,139), 'E'),
 CHAR(195,140), 'I'),  CHAR(195,141), 'I'), CHAR(195,142), 'I'),  CHAR(195,143), 'I'), CHAR(195,145), 'N'),  CHAR(195,146), 'O'),
 CHAR(195,147), 'O'),  CHAR(195,148), 'O'), CHAR(195,149), 'O'),  CHAR(195,150), 'Oe'), CHAR(195,152), 'O'), CHAR(195,153), 'U'),
 CHAR(195,154), 'U'),  CHAR(195,155), 'U'), CHAR(195,156), 'Ue'), CHAR(195,157), 'Y'), CHAR(195,159), 'ss'), CHAR(195,160), 'a'),
 CHAR(195,161), 'a'),  CHAR(195,162), 'a'), CHAR(195,163), 'a'),  CHAR(195,164), 'ae'), CHAR(195,165), 'aa'), CHAR(195,166), 'ae'),
 CHAR(195,167), 'c'),  CHAR(195,168), 'e'), CHAR(195,169), 'e'),  CHAR(195,170), 'e'), CHAR(195,171), 'e'),  CHAR(195,172), 'i'),
 CHAR(195,173), 'i'),  CHAR(195,174), 'i'), CHAR(195,175), 'i'),  CHAR(195,177), 'n'), CHAR(195,178), 'o'),  CHAR(195,179), 'o'),
 CHAR(195,180), 'o'),  CHAR(195,181), 'o'), CHAR(195,182), 'oe'), CHAR(195,184), 'o'), CHAR(195,185), 'u'),  CHAR(195,186), 'u'),
 CHAR(195,187), 'u'),  CHAR(195,188), 'ue'), CHAR(195,189), 'y'), CHAR(195,191), 'y'),
 # Decompositions for Latin Extended-A (128 pairs)
 CHAR(196,128), 'A'),  CHAR(196,129), 'a'), CHAR(196,130), 'A'),  CHAR(196,131), 'a'), CHAR(196,132), 'A'),  CHAR(196,133), 'a'),
 CHAR(196,134), 'C'),  CHAR(196,135), 'c'), CHAR(196,136), 'C'),  CHAR(196,137), 'c'), CHAR(196,138), 'C'),  CHAR(196,139), 'c'),
 CHAR(196,140), 'C'),  CHAR(196,141), 'c'), CHAR(196,142), 'D'),  CHAR(196,143), 'd'), CHAR(196,144), 'D'),  CHAR(196,145), 'd'),
 CHAR(196,146), 'E'),  CHAR(196,147), 'e'), CHAR(196,148), 'E'),  CHAR(196,149), 'e'), CHAR(196,150), 'E'),  CHAR(196,151), 'e'),
 CHAR(196,152), 'E'),  CHAR(196,153), 'e'), CHAR(196,154), 'E'),  CHAR(196,155), 'e'), CHAR(196,156), 'G'),  CHAR(196,157), 'g'),
 CHAR(196,158), 'G'),  CHAR(196,159), 'g'), CHAR(196,160), 'G'),  CHAR(196,161), 'g'), CHAR(196,162), 'G'),  CHAR(196,163), 'g'),
 CHAR(196,164), 'H'),  CHAR(196,165), 'h'), CHAR(196,166), 'H'),  CHAR(196,167), 'h'), CHAR(196,168), 'I'),  CHAR(196,169), 'i'),
 CHAR(196,170), 'I'),  CHAR(196,171), 'i'), CHAR(196,172), 'I'),  CHAR(196,173), 'i'), CHAR(196,174), 'I'),  CHAR(196,175), 'i'),
 CHAR(196,176), 'I'),  CHAR(196,177), 'i'), CHAR(196,178), 'IJ'), CHAR(196,179), 'ij'), CHAR(196,180), 'J'),  CHAR(196,181), 'j'),
 CHAR(196,182), 'K'),  CHAR(196,183), 'k'), CHAR(196,184), 'k'),  CHAR(196,185), 'L'), CHAR(196,186), 'l'),  CHAR(196,187), 'L'),
 CHAR(196,188), 'l'),  CHAR(196,189), 'L'), CHAR(196,190), 'l'),  CHAR(196,191), 'L'), CHAR(197,128), 'l'),  CHAR(197,129), 'L'),
 CHAR(197,130), 'l'),  CHAR(197,131), 'N'), CHAR(197,132), 'n'),  CHAR(197,133), 'N'), CHAR(197,134), 'n'),  CHAR(197,135), 'N'),
 CHAR(197,136), 'n'),  CHAR(197,137), 'N'), CHAR(197,138), 'n'),  CHAR(197,139), 'N'), CHAR(197,140), 'O'),  CHAR(197,141), 'o'),
 CHAR(197,142), 'O'),  CHAR(197,143), 'o'), CHAR(197,144), 'O'),  CHAR(197,145), 'o'), CHAR(197,146), 'OE'), CHAR(197,147), 'oe'),
 CHAR(197,148), 'R'),  CHAR(197,149), 'r'), CHAR(197,150), 'R'),  CHAR(197,151), 'r'), CHAR(197,152), 'R'),  CHAR(197,153), 'r'),
 CHAR(197,154), 'S'),  CHAR(197,155), 's'), CHAR(197,156), 'S'),  CHAR(197,157), 's'), CHAR(197,158), 'S'),  CHAR(197,159), 's'),
 CHAR(197,160), 'S'),  CHAR(197,161), 's'), CHAR(197,162), 'T'),  CHAR(197,163), 't'), CHAR(197,164), 'T'),  CHAR(197,165), 't'),
 CHAR(197,166), 'T'),  CHAR(197,167), 't'), CHAR(197,168), 'U'),  CHAR(197,169), 'u'), CHAR(197,170), 'U'),  CHAR(197,171), 'u'),
 CHAR(197,172), 'U'),  CHAR(197,173), 'u'), CHAR(197,174), 'U'),  CHAR(197,175), 'u'), CHAR(197,176), 'U'),  CHAR(197,177), 'u'),
 CHAR(197,178), 'U'),  CHAR(197,179), 'u'), CHAR(197,180), 'W'),  CHAR(197,181), 'w'), CHAR(197,182), 'Y'),  CHAR(197,183), 'y'),
 CHAR(197,184), 'Y'),  CHAR(197,185), 'Z'), CHAR(197,186), 'z'),  CHAR(197,187), 'Z'), CHAR(197,188), 'z'),  CHAR(197,189), 'Z'),
 CHAR(197,190), 'z'),  CHAR(197,191), 's'),

 # Euro Sign
 CHAR(226,130,172), 'E'),
 # GBP (Pound) Sign is removed; this is the last inner pair, so no trailing comma
 CHAR(194,163), '')

 # arguments of the outermost REPLACE, then the closing paren of LOWER
 , ' ', '_'));