SAS中的Jaro-Winkler字符串比较功能

时间:2011-07-28 20:07:44

标签: sas jaro-winkler

SAS中是否存在Jaro-Winkler字符串比较的实现?

看起来Link King有Jaro-Winkler,但我更喜欢自己调用该函数的灵活性。

谢谢!

3 个答案:

答案 0 :(得分:5)

据我所知,SAS 中没有计算 Jaro-Winkler 距离的内置函数。@Itzy 已经提到了我所知道的仅有的几个相关函数。不过,如果您愿意,可以用 proc fcmp 自己编写函数。下面的代码可以作为您的起点。我只是尽量按照维基百科上的相关文章来实现的。无论如何,它肯定不能完全等同于 Bill Winkler 最初的 strcmp.c 文件,而且可能有不少错误。

proc fcmp outlib=work.jaro.chars;

  subroutine jaromatch ( string1 $ , string2 $ , matchChars $);
    outargs matchChars;
    /* Builds in matchChars the characters of string1 that have a match in
       string2, in string1 order, excluding blanks.

       A character at position i of string1 matches the first not-yet-matched
       equal character of string2 found at or after position
       max(1, i - allowedDist), provided that character lies no farther than
       allowedDist to the right of i, where
       allowedDist = floor(max(|s1|, |s2|)/2) - 1.

       string1, string2 : the strings to compare
       matchChars       : OUTARGS result; its capacity is whatever length the
                          caller declared for the variable it passes in

       Each matched character of string2 is overwritten with '~' in the local
       copy so it cannot be matched a second time; this assumes '~' does not
       occur in the inputs.                                                   */

    /* lengthn returns 0 for an all-blank string, so the loop is skipped */
    str1_len = lengthn(strip(string1));
    str2_len = lengthn(strip(string2));

    allowedDist = floor(max(str1_len, str2_len)/2) -1;

    matchChars="";

    /* walk through string 1 and match characters to string2 */
    do i= 1 to str1_len;
      x=substr(string1,i,1);
      /* blanks are never part of the match list, so do not let them
         consume a position of string2 */
      if x ne ' ' then do;
        position = findc(string2,x ,max(1,i-allowedDist));
        if position > 0 and position - i <= allowedDist then do;
          /* build list of matched characters */
          matchChars=cats(matchChars,x);
          /* consume the matched character of string2 */
          substr(string2,position,1) = '~';
        end;
      end;
    end;
    matchChars = strip(matchChars);
  endsub;


  function jarotrans (string1 $ , string2 $ );
    /* Compares the two strings position by position, up to the shorter of
       their stripped lengths, and returns half the number of positions at
       which they differ.                                                  */
    len1 = length(strip(string1));
    len2 = length(strip(string2));
    mismatches = 0;
    do pos = 1 to min(len1, len2);
      if substr(string1,pos,1) ne substr(string2,pos,1)
      then mismatches = mismatches + 1;
    end;
    return(mismatches/2);
  endsub;

  function getPrefixlen( string1 $ , string2 $, maxprelen);
     /* Returns the number of leading characters that string1 and string2
        have in common, examining at most maxprelen characters.

        string1, string2 : the strings to compare
        maxprelen        : upper bound on the prefix length to report

        Returns 0 when the first characters differ or either string is
        empty, and the number of characters examined when all of them
        match.                                                          */
     n = min(maxprelen, lengthn(string1), lengthn(string2));
     do i = 1 to n;
       /* the first mismatch at position i means i-1 characters matched */
       if substr(string1,i,1) ne substr(string2,i,1)
       then return(i-1);
     end;
     /* no mismatch found: every examined character matched */
     return(max(0, n));
  endsub;

  function jarodist(string1 $, string2 $);
    /* Returns the Jaro similarity of string1 and string2:
         ( m1/|s1| + m2/|s2| + (m1 - t)/m1 ) / 3
       where m1 and m2 are the numbers of matched characters reported by
       jaromatch in each direction and t is the value returned by jarotrans.
       Returns 0 when either string is empty or nothing matches.           */

    /* declared explicitly so the OUTARGS targets are character variables;
       matched lists longer than 200 characters would be truncated */
    length m1 m2 $ 200;

    s1_len = lengthn(string1);
    s2_len = lengthn(string2);
    if s1_len = 0 or s2_len = 0 then return(0);

    /* get number of matched characters; lengthn (unlike length) returns 0
       for a blank result, so the no-match guards can actually fire */
    call jaromatch(string1, string2, m1);
    m1_len = lengthn(m1);
    if m1_len = 0 then return(0);
    call jaromatch(string2, string1, m2);
    m2_len = lengthn(m2);
    if m2_len = 0 then return(0);

    /* get number of transposed characters */
    ntrans = jarotrans(m1, m2);
    j_dist =  (m1_len/s1_len
             + m2_len/s2_len
             + (m1_len-ntrans)/m1_len )  /  3;
    return(j_dist);
  endsub;

  function jarowink( string1 $, string2 $, prefixscale);
    /* Jaro-Winkler score: the jarodist value, raised in proportion to the
       common prefix length (at most 4 characters) scaled by prefixscale.  */
    base   = jarodist(string1, string2);
    common = getPrefixlen(string1, string2, 4);

    /* with no common prefix the score is the plain jarodist value */
    score = base;
    if common ne 0 then score = base + common * prefixscale * (1-base);
    return(score);
  endsub;

run;quit;

/* make the functions compiled into work.jaro visible to the DATA step */
options cmplib=work.jaro;

/* smoke test: write the jarodist and jarowink values for one pair to the log */
data _null_;
  string1 = 'DIXON';
  string2 = 'DICKSONX';
  x = jarodist(string1, string2);
  y = jarowink(string1, string2, 0.1);
  put x= y=;
run;

答案 1 :(得分:3)

我修改并修正了 cmjohns 的代码,感谢他/她为我提供了起点。Winkler 在他的论文中给出了一些示例:Winkler, W. E. (2006). "Overview of Record Linkage and Current Research Directions"(记录链接概述与当前研究方向). Research Report Series, RRS(见表 6)。我用这些示例来测试我的代码。

proc fcmp outlib=work.jaro.chars;

  /* Returns the characters of string1 that have a match in string2, in
     string1 order. A character at position i of string1 matches the first
     not-yet-matched equal character of string2 whose position lies within
     allowedDist of i, where
       allowedDist = max(0, floor(max(|s1|, |s2|)/2) - 1).
     The result is limited to 40 characters by the declared return length.
     Matched characters of string2 are overwritten with '~' in the local
     copy so they cannot be matched again; this assumes '~' does not occur
     in the inputs.                                                        */
  function jaromatch(string1 $, string2 $) $ 40;
    length matchChars $ 40 ch $ 1;

    str1_len = lengthn(string1);
    str2_len = lengthn(string2);

    /* clamp at 0: for two 1-character strings the raw value is -1, which
       would produce an empty or negative search window */
    allowedDist = max(0, floor(max(str1_len, str2_len) / 2) - 1);

    /* walk through string1 and match characters to string2 */
    matchChars = "";
    do i = 1 to str1_len;
      /* search window in string2, bounded by the end of string2 */
      allowed_start = max(1, i - allowedDist);
      allowed_end   = min(str2_len, i + allowedDist);

      /* first occurrence of the i-th char of string1 at or after the
         window start; FINDC returns 0 when nothing is found */
      ch = substr(string1, i, 1);
      position = findc(string2, ch, allowed_start);

      if position > 0 and position <= allowed_end
      then do;
        matchChars = cats(matchChars, ch);
        /* once a char is assigned it cannot be assigned again */
        substr(string2, position, 1) = '~';
      end;
    end;
    return(strip(matchChars));
  endsub;

  /* Half the number of positions at which the two strings differ, compared
     position by position up to the shorter stripped length.               */
  function jarotrans(string1 $, string2 $);
    len1 = lengthn(strip(string1));
    len2 = lengthn(strip(string2));

    mismatches = 0;
    do pos = 1 to min(len1, len2);
      if substr(string1, pos, 1) ne substr(string2, pos, 1)
      then mismatches = mismatches + 1;
    end;

    return(mismatches / 2);
  endsub;

  /* Number of leading characters shared by string1 and string2, examining
     at most maxprelen characters.                                        */
  function getPrefixlen(string1 $, string2 $, maxprelen);
    limit = min(maxprelen, lengthn(string1), lengthn(string2));

    /* the loop body never runs when limit is 0, so 0 is returned below */
    do pos = 1 to limit;
      /* first mismatch at pos means pos - 1 characters matched */
      if substr(string1, pos, 1) ne substr(string2, pos, 1)
      then return(pos - 1);
    end;

    /* every examined character matched */
    return(limit);
  endsub;

  /* Jaro similarity: the mean of (matches found for string1 / |string1|),
     (matches found for string2 / |string2|) and
     ((matches - jarotrans value) / matches). Returns 0 when jaromatch
     finds no matches in either direction.                                */
  function jarodist(string1 $, string2 $);
    /* characters of string1 that are matched in string2 */
    matched1 = jaromatch(string1, string2);
    n1 = lengthn(matched1);
    if n1 = 0 then return(0);

    /* characters of string2 that are matched in string1 */
    matched2 = jaromatch(string2, string1);
    n2 = lengthn(matched2);
    if n2 = 0 then return(0);

    /* half the number of out-of-order matched characters */
    half_trans = jarotrans(matched1, matched2);

    /* the three components of the score, averaged */
    share1   = n1 / lengthn(string1);
    share2   = n2 / lengthn(string2);
    in_order = (n1 - half_trans) / n1;

    return((share1 + share2 + in_order) / 3);
  endsub;

  /* Jaro-Winkler score of two strings, compared after stripping blanks and
     upper-casing. Identical strings score 1.0; otherwise the jarodist value
     is raised in proportion to the common prefix length (at most 4
     characters) scaled by prefixscale.                                    */
  function jarowink(string1 $, string2 $, prefixscale);
    /* normalise both inputs so the comparison ignores case and padding */
    string1 = upcase(strip(string1));
    string2 = upcase(strip(string2));

    /* trivial case: nothing to compute */
    if string1 = string2 then return(1.0);

    jaro   = jarodist(string1, string2);
    common = getPrefixlen(string1, string2, 4);
    return(jaro + common * prefixscale * (1 - jaro));
  endsub;

run;

/* make the functions compiled into work.jaro visible to the DATA step */
options cmplib=work.jaro;

/* Generates the statements for one test case: write the banner (the two
   names and the expected value), compute jarowink with a prefix scale of
   0.1, write the result as jw=, then write a blank separator line.      */
%macro jwcase(banner, name1, name2);
  put &banner;
  jw = jarowink(&name1, &name2, 0.1);
  put jw=;
  put;
%mend jwcase;

/* test code: each banner carries the expected value for comparison by eye */
data _null_;
  %jwcase('SHACKLEFORD SHACKELFORD 0.982', 'SHACKLEFORD', 'SHACKELFORD')
  %jwcase('DUNNINGHAM  CUNNIGHAM   0.896', 'DUNNINGHAM', 'CUNNIGHAM')
  %jwcase('NICHLESON   NICHULSON   0.956', 'NICHLESON', 'NICHULSON')
  %jwcase('JONES       JOHNSON     0.832', 'JONES', 'JOHNSON')
  %jwcase('MASSEY      MASSIE      0.933', 'MASSEY', 'MASSIE')
  %jwcase('ABROMS      ABRAMS      0.922', 'ABROMS', 'ABRAMS')
  %jwcase('JERALDINE   GERALDINE   0.926', 'JERALDINE', 'GERALDINE')
  %jwcase('MARHTA      MARTHA      0.961', 'MARHTA', 'MARTHA')
  %jwcase('MICHELLE    MICHAEL     0.921', 'MICHELLE', 'MICHAEL')
  %jwcase('JULIES      JULIUS      0.933', 'JULIES', 'JULIUS')
  %jwcase('TANYA       TONYA       0.880', 'TANYA', 'TONYA')
  %jwcase('DWAYNE      DUANE       0.840', 'DWAYNE', 'DUANE')
  %jwcase('SEAN        SUSAN       0.805', 'SEAN', 'SUSAN')
  %jwcase('JON         JOHN        0.933', 'JON', 'JOHN')
run;

答案 2 :(得分:2)

我不这么认为。它可以执行Levenshtein距离(complev函数)或广义编辑距离(compged),但我还没有看到任何其他编辑距离函数。

如果您已经决定在SAS中执行此操作,则可以在PROC IML中编写程序。