比较包含不同字母表中符号的字符串

时间:2016-02-10 12:54:55

标签: string sas string-comparison

我想比较两个包含不同字母符号的字符串(例如俄语和英语)。我希望看起来相似的符号被认为是彼此相等的。

例如,在单词“Mom”中,字母“o”来自英文字母表(Unicode 码位 U+006F),而在单词“Mоm”中,字母“о”来自俄语字母表(Unicode 码位 U+043E)。因此 ("Mom" = "Mоm") 的结果为假,但我希望它为真。SAS 中是否有现成的标准函数可以做到这一点,还是我需要自己编写一个宏来实现?

谢谢!

2 个答案:

答案 0 :(得分:1)

我会这样做:

首先我会建立一张映射表,也就是说明俄语中的哪个字母对应英语中的哪个字母。例如:
б= b
в= v
...

我会把这个映射存储在一张单独的表中,或者存储为宏变量(macro variables)。然后我会编写一个使用 tranwrd 函数的宏循环,遍历所建立的映射。

示例大致如下。

/* Sketch: rewrite a Cyrillic string letter by letter into Latin counterparts, */
/* one TRANWRD call per entry of the letter mapping.                           */
data _null_;
    stringBefore = "без";
    /* The first call reads the untouched input ... */
    stringAfter = tranwrd(stringBefore,"а","a");
    /* ... every later call reads stringAfter, so the replacements accumulate. */
    stringAfter = tranwrd(stringAfter,"б","b");
    stringAfter = tranwrd(stringAfter,"в","v");
/* The "..." line below is the answer's placeholder for the remaining mappings; it is not SAS syntax. */
...
run;

在转换之后,我想你可以比较你的字符串。

答案 1 :(得分:0)

我还编写了一些函数,用来处理因键盘布局切换错误而造成的误输入。代码如下:

/***************************************************************************/
/* count_rus_letters(string)                                               */
/*   Walks the argument one character at a time (KLENGTH / KSUBSTR) and    */
/*   returns how many of those characters are letters of the Russian       */
/*   alphabet (upper or lower case, including the letter Yo).              */
/***************************************************************************/
proc fcmp outlib=sasuser.userfuncs.mystring;
function count_rus_letters(string $);
  /* holds the single character currently being examined */
  length ch $2;

  total = 0;
  n_chars = klength(string);

  do pos = 1 to n_chars;
    ch = ksubstr(string, pos, 1);
    /* membership test against the full Russian alphabet */
    if ch in ("А","а","Б","б","В","в","Г","г","Д","д","Е","е",
              "Ё","ё","Ж","ж","З","з","И","и","Й","й","К","к",
              "Л","л","М","м","Н","н","О","о","П","п","Р","р",
              "С","с","Т","т","У","у","Ф","ф","Х","х","Ц","ц",
              "Ч","ч","Ш","ш","Щ","щ","Ъ","ъ","Ы","ы","Ь","ь",
              "Э","э","Ю","ю","Я","я")
    then total = total + 1;
  end;

  return(total);
endsub;
run;

/**************************************************************************/
/* FUNCTION count_eng_letters RETURNS NUMBER OF ENGLISH LETTERS IN STRING */
/*                                                                        */
/*   string : character value to scan, one character at a time.          */
/*   returns: count of characters in A-Z or a-z.                          */
/*                                                                        */
/* A single span from 'A' to 'z' would also count the six characters      */
/* that sit between 'Z' and 'a' in ASCII (brackets, backslash, caret,     */
/* underscore, backtick), so upper and lower case are tested as two       */
/* separate ranges.                                                       */
/* NOTE(review): assumes an ASCII-compatible session encoding - confirm.  */
/**************************************************************************/ 
proc fcmp outlib=sasuser.userfuncs.mystring;
FUNCTION count_eng_letters(string $);
length letter $2;

eng_count=0;

len=klength(string);

do i=1 to len;
  letter=ksubstr(string,i,1);
  /* RANK looks only at the first character of its argument */
  letter_rank=rank(letter);
  if (rank('A') <= letter_rank <= rank('Z'))
     or (rank('a') <= letter_rank <= rank('z'))
  then eng_count+1;
end;

return(eng_count);
endsub;
run;

/**************************************************************************/
/* FUNCTION is_string_russian RETURNS 1 IF NUMBER OF RUSSIAN SYMBOLS IN   */
/* STRING >= NUMBER OF ENGLISH SYMBOLS, OTHERWISE 0                       */
/*                                                                        */
/*   string : character value to classify.                                */
/*   returns: 1 or 0; a tie (including no letters at all) yields 1.       */
/*                                                                        */
/* English letters are tested as two ranges, A-Z and a-z. A single span   */
/* from 'A' to 'z' would also count brackets, backslash, caret,           */
/* underscore and backtick as English and could flip the verdict.         */
/* NOTE(review): assumes an ASCII-compatible session encoding - confirm.  */
/**************************************************************************/ 
proc fcmp outlib=sasuser.userfuncs.mystring;
FUNCTION is_string_russian(string $);
length letter $2 result 8;

eng_count=0;
rus_count=0;

len=klength(string);

do i=1 to len;
  letter=ksubstr(string,i,1);
  if letter in ("А","а","Б","б","В","в","Г","г","Д","д","Е","е","Ё","ё","Ж","ж",
      "З","з","И","и","Й","й","К","к","Л","л","М","м","Н","н","О","о","П","п","Р","р",
      "С","с","Т","т","У","у","Ф","ф","Х","х","Ц","ц","Ч","ч","Ш","ш","Щ","щ","Ъ","ъ",
      "Ы","ы","Ь","ь","Э","э","Ю","ю","Я","я") 
  then rus_count+1;
  /* RANK looks only at the first character of its argument */
  letter_rank=rank(letter);
  if (rank('A') <= letter_rank <= rank('Z'))
     or (rank('a') <= letter_rank <= rank('z'))
  then eng_count+1;
end;

if rus_count>=eng_count
then result=1;
else result=0;

return(result);
endsub;
run;

/**************************************************************************/
/* FUNCTION fix_layout_misprints REPLACES MISPRINTED SYMBOLS BY ANALYSING */
/* LANGUAGE OF THE STRING (FOR ENGLISH STRING RUSSIAN SYMBOLS ARE         */
/* REPLACED BY ENGLISH COPIES AND FOR RUSSIAN STRING SYMBOLS ARE          */
/* REPLACED BY RUSSIAN COPIES)                                            */
/*                                                                        */
/*   string : character value to repair.                                  */
/*   returns: the repaired value, up to 1000 bytes.                       */
/*                                                                        */
/* The string counts as Russian when it has at least as many Russian      */
/* letters as English ones. English letters are tested as two ranges,     */
/* A-Z and a-z. A single span from 'A' to 'z' would also count brackets,  */
/* backslash, caret, underscore and backtick as English and could pick    */
/* the wrong translation direction.                                       */
/* NOTE(review): assumes an ASCII-compatible session encoding - confirm.  */
/**************************************************************************/ 
proc fcmp outlib=sasuser.userfuncs.mystring;
FUNCTION fix_layout_misprints(string $) $ 1000;
length letter $2 result $1000;

eng_count=0;
rus_count=0;

len=klength(string);

do i=1 to len;
  letter=ksubstr(string,i,1);
  if letter in ("А","а","Б","б","В","в","Г","г","Д","д","Е","е","Ё","ё","Ж","ж",
      "З","з","И","и","Й","й","К","к","Л","л","М","м","Н","н","О","о","П","п","Р","р",
      "С","с","Т","т","У","у","Ф","ф","Х","х","Ц","ц","Ч","ч","Ш","ш","Щ","щ","Ъ","ъ",
      "Ы","ы","Ь","ь","Э","э","Ю","ю","Я","я") 
  then rus_count+1;
  /* RANK looks only at the first character of its argument */
  letter_rank=rank(letter);
  if (rank('A') <= letter_rank <= rank('Z'))
     or (rank('a') <= letter_rank <= rank('z'))
  then eng_count+1;
end;

/* KTRANSLATE(source, to, from): characters found in "from" become the      */
/* character at the same position in "to". Only look-alike pairs are listed. */
if rus_count>=eng_count
then result=ktranslate(string,"АаВЕеКкМОоРрСсТХх","AaBEeKkMOoPpCcTXx");
else result=ktranslate(string,"AaBEeKkMOoPpCcTXx","АаВЕеКкМОоРрСсТХх");

return(result);
endsub;
run;

/*************************************************************/
/* EXAMPLE                                                   */
/* Runs the functions compiled into sasuser.userfuncs on a   */
/* surname containing one character from the wrong alphabet  */
/* and writes the results to the log.                        */
/*************************************************************/
options cmplib=sasuser.userfuncs;
data _null_;
  /* reference spelling and the misprinted variant */
  good_str = "Иванов";
  err_str = "Ивaнов";

  /* do all the calculations first ... */
  fixed_str = fix_layout_misprints(err_str);
  rus_count_in_err = count_rus_letters(err_str);
  eng_count_in_err = count_eng_letters(err_str);
  is_error_str_russian = is_string_russian(err_str);

  /* ... then report to the log in the original order */
  put "Good string=" good_str;
  put "Error string=" err_str;
  put "Fixed string=" fixed_str;
  put "Count or Cyrillic symbols in error string=" rus_count_in_err;
  put "Count or English symbols in error string=" eng_count_in_err;
  put "Is error string language Russian=" is_error_str_russian;

  if (good_str ne err_str) then do;
    put "Before clearing - strings are not equal to each other";
  end;

  if (good_str = fixed_str) then do;
    put "After clearing - strings are equal to each other";
  end;
run;