我想比较两个包含不同字母符号的字符串(例如俄语和英语)。我希望看起来相似的符号被认为是彼此相等的。
例如,在单词“Mom”中,字母“o”来自英文字母表(Unicode 编码为 006F),而在单词“Mоm”中,字母“о”来自俄语字母表(Unicode 编码为 043E)。因此比较 ("Mom" = "Mоm")
的结果为假,但我希望它为真。SAS 中是否有标准函数可以做到这一点,还是我需要自己编写一个宏来实现?
谢谢!
答案 0 :(得分:1)
我会这样做:
首先我会建立一张映射表,即俄语中的每个字母对应英语中的哪个字母。例如:
б= b
в= v
...
我会将此映射表存储在单独的数据表中,或存储为宏变量。然后我会编写一个宏循环,遍历该映射表并逐项调用 tranwrd 函数。
示例代码大致如下:
/* Illustrative sketch: convert Cyrillic letters to Latin ones by chaining  */
/* TRANWRD calls, one call per entry of the letter mapping.                  */
/* The "..." line below is a placeholder for the remaining mappings and is   */
/* not valid SAS as written.                                                 */
data _null_;
stringBefore = "без";
/* The first call reads the original string. */
stringAfter = tranwrd(stringBefore,"а","a");
/* Later calls read and overwrite stringAfter, so replacements accumulate. */
stringAfter = tranwrd(stringAfter,"б","b");
stringAfter = tranwrd(stringAfter,"в","v");
...
run;
在转换之后,我想你可以比较你的字符串。
答案 1 :(得分:0)
我也编写了一些函数,用来处理因键盘布局切换错误而造成的误输入。代码如下:
/***************************************************************************/
/* FUNCTION count_rus_letters                                              */
/*   Returns how many characters of STRING are Russian (Cyrillic) letters. */
/*   Argument: string - character value to inspect.                        */
/*   Returns : numeric count of characters found in the Russian alphabet   */
/*             list below (both cases, including the letter Yo).           */
/***************************************************************************/
proc fcmp outlib=sasuser.userfuncs.mystring;
  function count_rus_letters(string $);
    /* Holds one character at a time; presumably sized at 2 bytes for a   */
    /* Cyrillic letter in UTF-8 - confirm against the session encoding.   */
    length ch $2;
    total = 0;
    n_chars = klength(string);
    pos = 1;
    /* Walk the string character by character (K-functions are character- */
    /* based rather than byte-based).                                     */
    do while (pos <= n_chars);
      ch = ksubstr(string, pos, 1);
      if ch in ("А","а","Б","б","В","в","Г","г","Д","д","Е","е","Ё","ё","Ж","ж",
                "З","з","И","и","Й","й","К","к","Л","л","М","м","Н","н","О","о","П","п","Р","р",
                "С","с","Т","т","У","у","Ф","ф","Х","х","Ц","ц","Ч","ч","Ш","ш","Щ","щ","Ъ","ъ",
                "Ы","ы","Ь","ь","Э","э","Ю","ю","Я","я")
        then total = total + 1;
      pos = pos + 1;
    end;
    return(total);
  endsub;
run;
/**************************************************************************/
/* FUNCTION count_eng_letters                                             */
/*   Returns how many characters of STRING are English letters (A-Z, a-z).*/
/*   Argument: string - character value to inspect.                       */
/*   Returns : numeric count of English letters.                          */
/*   Note    : the upper- and lower-case ranges are tested separately. A  */
/*             single 'A'..'z' range would also count the punctuation     */
/*             characters [ \ ] ^ _ ` that sit between 'Z' and 'a' in     */
/*             ASCII.                                                     */
/**************************************************************************/
proc fcmp outlib=sasuser.userfuncs.mystring;
FUNCTION count_eng_letters(string $);
length letter $2;
eng_count=0;
len=klength(string);
do i=1 to len;
  letter=ksubstr(string,i,1);
  /* RANK looks at the first byte only; assumes an ASCII-compatible       */
  /* session encoding, where letters are contiguous within each case.     */
  code=rank(letter);
  if (rank('A') <= code <= rank('Z')) or (rank('a') <= code <= rank('z'))
  then eng_count=eng_count+1;
end;
return(eng_count);
endsub;
run;
/**************************************************************************/
/* FUNCTION is_string_russian                                             */
/*   Returns 1 if the number of Russian letters in STRING is greater than */
/*   or equal to the number of English letters, otherwise 0.              */
/*   Argument: string - character value to inspect.                       */
/*   Returns : numeric 1 (Russian, ties included) or 0 (English).         */
/*   Note    : English letters are A-Z and a-z only. The two ranges are   */
/*             tested separately so that [ \ ] ^ _ ` (between 'Z' and 'a' */
/*             in ASCII) do not inflate the English count.                */
/**************************************************************************/
proc fcmp outlib=sasuser.userfuncs.mystring;
FUNCTION is_string_russian(string $);
length letter $2 result 8;
eng_count=0;
rus_count=0;
len=klength(string);
do i=1 to len;
  letter=ksubstr(string,i,1);
  if letter in ("А","а","Б","б","В","в","Г","г","Д","д","Е","е","Ё","ё","Ж","ж",
  "З","з","И","и","Й","й","К","к","Л","л","М","м","Н","н","О","о","П","п","Р","р",
  "С","с","Т","т","У","у","Ф","ф","Х","х","Ц","ц","Ч","ч","Ш","ш","Щ","щ","Ъ","ъ",
  "Ы","ы","Ь","ь","Э","э","Ю","ю","Я","я")
  then rus_count=rus_count+1;
  /* RANK looks at the first byte only; assumes an ASCII-compatible       */
  /* session encoding.                                                    */
  code=rank(letter);
  if (rank('A') <= code <= rank('Z')) or (rank('a') <= code <= rank('z'))
  then eng_count=eng_count+1;
end;
/* A tie is treated as Russian. */
if rus_count>=eng_count
then result=1;
else result=0;
return(result);
endsub;
run;
/**************************************************************************/
/* FUNCTION fix_layout_misprints                                          */
/*   Replaces look-alike characters typed in the wrong keyboard layout,   */
/*   based on the dominant language of STRING:                            */
/*     - Russian string (Russian letters >= English letters): Latin       */
/*       look-alikes are replaced by their Cyrillic copies;               */
/*     - English string: Cyrillic look-alikes are replaced by their Latin */
/*       copies.                                                          */
/*   Argument: string - character value to fix.                           */
/*   Returns : character value (up to 1000 bytes) with look-alikes        */
/*             replaced.                                                  */
/*   Note    : English letters are A-Z and a-z only. The two ranges are   */
/*             tested separately so that [ \ ] ^ _ ` (between 'Z' and 'a' */
/*             in ASCII) cannot tip the decision towards English.         */
/**************************************************************************/
proc fcmp outlib=sasuser.userfuncs.mystring;
FUNCTION fix_layout_misprints(string $) $ 1000;
length letter $2 result $1000;
eng_count=0;
rus_count=0;
len=klength(string);
do i=1 to len;
  letter=ksubstr(string,i,1);
  if letter in ("А","а","Б","б","В","в","Г","г","Д","д","Е","е","Ё","ё","Ж","ж",
  "З","з","И","и","Й","й","К","к","Л","л","М","м","Н","н","О","о","П","п","Р","р",
  "С","с","Т","т","У","у","Ф","ф","Х","х","Ц","ц","Ч","ч","Ш","ш","Щ","щ","Ъ","ъ",
  "Ы","ы","Ь","ь","Э","э","Ю","ю","Я","я")
  then rus_count=rus_count+1;
  /* RANK looks at the first byte only; assumes an ASCII-compatible       */
  /* session encoding.                                                    */
  code=rank(letter);
  if (rank('A') <= code <= rank('Z')) or (rank('a') <= code <= rank('z'))
  then eng_count=eng_count+1;
end;
/* KTRANSLATE(source, to, from): the second argument holds the            */
/* replacement characters and the third the characters to replace.        */
if rus_count>=eng_count
/* Russian string: Latin look-alikes -> Cyrillic. */
then result=ktranslate(string,"АаВЕеКкМОоРрСсТХх","AaBEeKkMOoPpCcTXx");
/* English string: Cyrillic look-alikes -> Latin. */
else result=ktranslate(string,"AaBEeKkMOoPpCcTXx","АаВЕеКкМОоРрСсТХх");
return(result);
endsub;
run;
/***********/
/* EXAMPLE */
/***********/
/* Demonstrates the functions stored in sasuser.userfuncs on a surname    */
/* typed with one character from the wrong keyboard layout. All results   */
/* are written to the SAS log.                                            */
options cmplib=sasuser.userfuncs;
data _null_;
good_str="Иванов";
/* Same surname; the third character is intended to be a Latin "a". */
err_str="Ивaнов";
fixed_str=fix_layout_misprints(err_str);
put "Good string=" good_str;
put "Error string=" err_str;
put "Fixed string=" fixed_str;
rus_count_in_err=count_rus_letters(err_str);
put "Count of Cyrillic symbols in error string=" rus_count_in_err;
eng_count_in_err=count_eng_letters(err_str);
put "Count of English symbols in error string=" eng_count_in_err;
is_error_str_russian=is_string_russian(err_str);
put "Is error string language Russian=" is_error_str_russian;
/* The raw strings differ because of the look-alike character. */
if (good_str ne err_str)
then put "Before clearing - strings are not equal to each other";
/* After fixing, the strings should compare equal. */
if (good_str = fixed_str)
then put "After clearing - strings are equal to each other";
run;