在 Google BigQuery 或 Amazon Redshift 中实现自定义相似度评分函数

时间:2016-10-03 22:55:27

标签: google-bigquery amazon-redshift

我们在 Oracle PL/SQL 中有以下代码,它使用 Jaro-Winkler 算法计算两个字符串之间的相似度。我们想做的是根据两个字符串的相似度找出重复记录。我们的使用场景是:用户录入的用户信息(例如 first_name、last_name)可能存在拼写错误,而我们唯一可用的键就是 first_name 和 last_name,没有 SSN 或电子邮件之类的其他唯一标识符可用于识别用户。因此我们的想法是在 first_name、last_name 上做自连接,计算每一对记录的相似度分数,并据此识别重复项。

然而,即使只有10,000个用户,自连接也需要进行100,000,000次比较;我们在 Oracle 数据库中尝试过,速度太慢了。

我们是 Google BigQuery 和 Amazon Redshift 的新手。是否有关于如何在我们的数据集上实现自定义函数的教程?

或者,Google BigQuery 或 Amazon Redshift 是否已经提供了类似于 Oracle 中的现成解决方案?

我们目前的环境无法完成这个概念验证,所以我们希望在云端进行这项尝试。

感谢您的帮助。

--http://www.orafaq.com/forum/t/164224/
CREATE OR REPLACE FUNCTION GKN_COMMON.jws -- Jaro-Winkler similarity
  (p_string1     IN VARCHAR2,
   p_string2     IN VARCHAR2)
  RETURN            NUMBER
  DETERMINISTIC
AS
  -- Purpose:
  --   Scores how similar two strings are using a Jaro-Winkler style measure.
  --
  -- Parameters:
  --   p_string1, p_string2 - the strings to compare.
  --
  -- Returns:
  --   The Winkler-adjusted score multiplied by 100 and rounded to a whole
  --   number. Returns 0 when either input is NULL or when no characters match.
  --
  -- Notes:
  --   * Accented Latin letters are folded to plain letters before comparing.
  --   * Character positions are taken from the FIRST occurrence of a character
  --     in each string (INSTR), not from the loop index, so strings with
  --     repeated characters may score differently from a textbook Jaro
  --     implementation.
  --   * This source contains non-ASCII literals; compile it with a client
  --     character set (NLS_LANG) that can represent them, otherwise they are
  --     silently replaced and the accent folding stops working.

  -- Accent-folding map. The two constants are written as aligned segments so
  -- that each accented character sits directly above its replacement. They
  -- MUST stay the same length: TRANSLATE deletes any source character that has
  -- no counterpart in the target string.
  c_accented CONSTANT VARCHAR2 (100 CHAR) :=
       'ƒŠŒŽšœžŸ' || 'ÀÁÂÃÄÅÆÇ' || 'ÈÉÊËÌÍÎÏ' || 'ÐÑÒÓÔÕÖØ' || 'ÙÚÛÜÝß'
    || 'àáâãäåæç' || 'èéêëìíîï' || 'ðøñòóôõö' || 'ùúûüýÿ';
  c_plain    CONSTANT VARCHAR2 (100 CHAR) :=
       'fSEZsezY' || 'AAAAAAEC' || 'EEEEIIII' || 'DNOOOOOO' || 'UUUUYB'
    || 'aaaaaaec' || 'eeeeiiii' || 'oonooooo' || 'uuuuyy';

  v_string1         VARCHAR2 (32767);
  v_string2         VARCHAR2 (32767);
  v_closeness       NUMBER := 0;
  v_temp            VARCHAR2 (32767);
  v_comp1           VARCHAR2 (32767);
  v_comp2           VARCHAR2 (32767);
  v_matches         NUMBER := 0;
  -- 1 CHAR (not the default 1 BYTE) so a multi-byte character fits.
  v_char            VARCHAR2 (1 CHAR);
  v_pos             PLS_INTEGER;
  v_transpositions  NUMBER := 0;
  v_d_jaro          NUMBER := 0;
  v_leading         NUMBER := 0;
  v_d_winkler       NUMBER := 0;
  v_jws             NUMBER := 0;
BEGIN
  -- check for null strings:
  IF p_string1 IS NULL OR p_string2 IS NULL THEN
    RETURN 0;
  END IF;
  -- remove accents:
  v_string1 := TRANSLATE (p_string1, c_accented, c_plain);
  v_string2 := TRANSLATE (p_string2, c_accented, c_plain);
  -- closeness: how far apart two equal characters may be and still match.
  -- The value is not floored; it is only compared against whole-number
  -- position differences, so the fractional part has no effect.
  v_closeness := (GREATEST (LENGTH (v_string1), LENGTH (v_string2)) / 2) - 1;
  -- collect the characters of string1 that match string2 within closeness;
  -- each matched character is removed from the working copy so it cannot be
  -- matched twice:
  v_temp := v_string2;
  FOR i IN 1 .. LENGTH (v_string1) LOOP
    v_char := SUBSTR (v_string1, i, 1);
    v_pos  := INSTR (v_temp, v_char);
    IF v_pos > 0
       AND ABS (INSTR (v_string1, v_char) - INSTR (v_string2, v_char)) <= v_closeness
    THEN
      v_comp1 := v_comp1 || v_char;
      v_temp  := SUBSTR (v_temp, 1, v_pos - 1) || SUBSTR (v_temp, v_pos + 1);
    END IF;
  END LOOP;
  -- same in the other direction: characters of string2 that match string1:
  v_temp := v_string1;
  FOR i IN 1 .. LENGTH (v_string2) LOOP
    v_char := SUBSTR (v_string2, i, 1);
    v_pos  := INSTR (v_temp, v_char);
    IF v_pos > 0
       AND ABS (INSTR (v_string2, v_char) - INSTR (v_string1, v_char)) <= v_closeness
    THEN
      v_comp2 := v_comp2 || v_char;
      v_temp  := SUBSTR (v_temp, 1, v_pos - 1) || SUBSTR (v_temp, v_pos + 1);
    END IF;
  END LOOP;
  -- nothing matched in at least one direction:
  IF v_comp1 IS NULL OR v_comp2 IS NULL THEN
    RETURN 0;
  END IF;
  -- count matches and transpositions within closeness:
  FOR i IN 1 .. LEAST (LENGTH (v_comp1), LENGTH (v_comp2)) LOOP
    IF SUBSTR (v_comp1, i, 1) = SUBSTR (v_comp2, i, 1) THEN
      v_matches := v_matches + 1;
    ELSE
      v_char := SUBSTR (v_comp1, i, 1);
      IF ABS (INSTR (v_string1, v_char) - INSTR (v_string2, v_char)) <= v_closeness THEN
        v_transpositions := v_transpositions + 1;
        v_matches := v_matches + 1;
      END IF;
    END IF;
  END LOOP;
  -- each swapped pair was counted once per character:
  v_transpositions := v_transpositions / 2;
  -- check for no matches (also protects the division below):
  IF v_matches = 0 THEN
    RETURN 0;
  END IF;
  -- Jaro:
  v_d_jaro := ((v_matches / LENGTH (v_string1)) +
               (v_matches / LENGTH (v_string2)) +
               ((v_matches - v_transpositions) / v_matches))
               / 3;
  -- count matching leading characters (up to 4):
  FOR i IN 1 .. LEAST (LENGTH (v_string1), LENGTH (v_string2), 4) LOOP
    IF SUBSTR (v_string1, i, 1) = SUBSTR (v_string2, i, 1) THEN
      v_leading := v_leading + 1;
    ELSE
      EXIT;
    END IF;
  END LOOP;
  -- Winkler: boost the Jaro score by 0.1 per shared leading character:
  v_d_winkler := v_d_jaro + ((v_leading * .1) * (1 - v_d_jaro));
  -- Jaro-Winkler similarity rounded:
  v_jws := ROUND (v_d_winkler * 100);
  RETURN v_jws;
END jws;





 -- Smoke test for GKN_COMMON.jws: scores a fixed set of string pairs (NULL
 -- inputs, sample name pairs, length mismatches and accented names) and lists
 -- them from most to least similar.
 -- For a side-by-side comparison, Oracle's built-in
 -- UTL_MATCH.JARO_WINKLER_SIMILARITY (string1, string2) can be selected as an
 -- additional column.
 WITH
   strings AS
      (SELECT NULL         AS string1, NULL         AS string2 FROM DUAL UNION ALL
       SELECT 'test'       AS string1, NULL         AS string2 FROM DUAL UNION ALL
       SELECT NULL         AS string1, 'test'       AS string2 FROM DUAL UNION ALL
       SELECT 'CRATE'      AS string1, 'TRACE'      AS string2 FROM DUAL UNION ALL
       SELECT 'MARTHA'     AS string1, 'MARHTA'     AS string2 FROM DUAL UNION ALL
       SELECT 'DWAYNE'     AS string1, 'DUANE'      AS string2 FROM DUAL UNION ALL
       SELECT 'DIXON'      AS string1, 'DICKSONX'   AS string2 FROM DUAL UNION ALL
       SELECT 'Dunningham' AS string1, 'Cunningham' AS string2 FROM DUAL UNION ALL
       SELECT 'Abroms'     AS string1, 'Abrams'     AS string2 FROM DUAL UNION ALL
       SELECT 'Lampley'    AS string1, 'Campley'    AS string2 FROM DUAL UNION ALL
       SELECT 'Jonathon'   AS string1, 'Jonathan'   AS string2 FROM DUAL UNION ALL
       SELECT 'Jeraldine'  AS string1, 'Gerladine'  AS string2 FROM DUAL UNION ALL
       SELECT 'test'       AS string1, 'blank'      AS string2 FROM DUAL UNION ALL
       SELECT 'everybody'  AS string1, 'every'      AS string2 FROM DUAL UNION ALL
       SELECT 'a'          AS string1, 'aaa'        AS string2 FROM DUAL UNION ALL
       SELECT 'Géraldine'  AS string1, 'Gerladine'  AS string2 FROM DUAL UNION ALL
       SELECT 'Jérôme'     AS string1, 'Jerome'     AS string2 FROM DUAL UNION ALL
       SELECT 'ça'         AS string1, 'ca'         AS string2 FROM DUAL UNION ALL
       SELECT 'Üwe'        AS string1, 'Uwe'        AS string2 FROM DUAL)
 SELECT string1,
        string2,
        -- schema-qualified to match the CREATE FUNCTION GKN_COMMON.jws statement
        GKN_COMMON.jws (string1, string2) AS my_jws
 FROM   strings
 -- the input columns break ties so equal scores come back in a stable order
 ORDER  BY my_jws DESC, string1, string2;

1 个答案:

答案 0 :(得分:4)

请查看以下示例。
它适用于启用了标准 SQL 的 BigQuery(参见 Enabling Standard SQL),并使用了 JavaScript 用户自定义函数(JS User-Defined Functions)。

table#1 examples:
5585
985
445566
null

table#2 examples:
005585
000985
445566