How to transform path/string by collapsing repeated elements?

时间:2016-08-23 15:35:18

标签: google-bigquery

There is a field in my table that represents pathways like the one below:

Item1->Item1->Item2->Item3->Item3->Item3->Item1

In most cases this is quite a long sequence with many consecutive instances of the same item.

How can I shorten the above path to something like the one below, in BigQuery?

Item1(x2)->Item2->Item3(x3)->Item1  

2 个答案:

答案 0 :(得分:1)

我想证明这个问题仅通过数组操作(使用标准SQL)就可以解决,并由此得出了下面的解决方案。解决该问题的另一种方法是使用分析函数,通过它来检测路径中项目的变化。

-- Renders a run-length-encoded pathway (as built by PathwayToParts) as text.
--
-- parts_and_offsets: one STRUCT per run of identical consecutive items, where
--   `part` is the item and `off` is the offset of the run's first element in
--   the split pathway, followed by one trailing sentinel entry whose `off` is
--   the total element count. The sentinel's `part` is never emitted; it only
--   lets the length of the last real run be computed.
-- Returns: the runs joined with "->" in array order, each run longer than one
--   element suffixed with "(xN)". NULL when there is no run to emit.
CREATE TEMPORARY FUNCTION PartsToString(
    parts_and_offsets ARRAY<STRUCT<part STRING, off INT64>>) AS ((
  SELECT
    STRING_AGG(
      CONCAT(
        run.part,
        -- Run length = start offset of the next entry minus this run's start.
        IF(parts_and_offsets[OFFSET(idx + 1)].off - run.off = 1,
           "",
           CONCAT("(x", CAST(parts_and_offsets[OFFSET(idx + 1)].off - run.off AS STRING), ")"))),
      -- Explicit delimiter (the default is ",") and explicit ordering
      -- (aggregation order is otherwise unspecified).
      "->" ORDER BY idx)
  -- `idx` is the array position; it is deliberately not named `off` so it
  -- cannot be confused with the struct field of the same name.
  FROM UNNEST(parts_and_offsets) AS run WITH OFFSET idx
  -- Skip the trailing sentinel entry.
  WHERE idx + 1 < ARRAY_LENGTH(parts_and_offsets)
));

-- Run-length-encodes a "->"-delimited pathway string.
--
-- pathway: items separated by "->".
-- Returns: ARRAY<STRUCT<part STRING, off INT64>> holding, in pathway order,
--   one entry per run of identical consecutive items (`part` = the item,
--   `off` = offset of the run's first element in the split pathway), followed
--   by a sentinel entry ("" , total element count) so that a consumer can
--   derive every run's length from the difference of adjacent `off` values.
CREATE TEMPORARY FUNCTION PathwayToParts(pathway STRING) AS ((
  SELECT
    ARRAY_CONCAT(
      -- ORDER BY pins the element order; consumers index this array
      -- positionally, and ARRAY_AGG order is otherwise unspecified.
      ARRAY_AGG(STRUCT(part, off) ORDER BY off),
      -- Sentinel marking the end of the last run.
      [STRUCT("" AS part, ARRAY_LENGTH(ANY_VALUE(parts)) AS off)]) AS parts_and_offsets
  FROM (SELECT SPLIT(pathway, "->") AS parts)
  CROSS JOIN UNNEST(parts) AS part WITH OFFSET off
  -- Keep only the first element of each run: the very first element, or any
  -- element that differs from its predecessor.
  WHERE off = 0 OR part != parts[OFFSET(off - 1)]
));

WITH YourTable AS (
  SELECT "Item1->Item2->Item2->Item2->Item3->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item2->Item3->Item3->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item4" AS pathway 
  UNION ALL SELECT "Item1->Item2->Item2->Item3->Item1->Item1->Item1->Item2->Item3->Item3->Item2->Item2->Item2->Item1->Item4" AS pathway
  UNION ALL SELECT "Item1->Item1->Item1" AS pathway
  UNION ALL SELECT "Item1->Item2->Item2" AS pathway
  UNION ALL SELECT "Item1->Item1->Item2" AS pathway
  UNION ALL SELECT "Item1->Item2->Item3" AS pathway
)
SELECT PartsToString(PathwayToParts(pathway)) AS parts_string
FROM YourTable;

答案 1 :(得分:0)

Using Scalar JS UDF (Standard SQL) <-- would be my choice

-- Collapses runs of identical consecutive items in a "->"-delimited pathway.
--
-- pathway: items separated by "->"; may be NULL.
-- Returns: the pathway with every run of N > 1 identical consecutive items
--   replaced by "item(xN)", runs joined with "->". NULL for a NULL pathway.
CREATE TEMPORARY FUNCTION collapse_repeated(pathway STRING)
RETURNS STRING LANGUAGE js AS """
  // A SQL NULL arrives as JavaScript null; calling split() on it would throw
  // and fail the whole query, so propagate NULL instead.
  if (pathway === null || pathway === undefined) {
    return null;
  }
  var items = pathway.split('->');
  var runs = [];
  var runStart = 0;
  // Walk one position past the end so the final run is flushed by the same
  // branch that flushes every other run.
  for (var i = 1; i <= items.length; i++) {
    if (i === items.length || items[i] !== items[runStart]) {
      var runLength = i - runStart;
      runs.push(runLength > 1
          ? items[runStart] + '(x' + runLength + ')'
          : items[runStart]);
      runStart = i;
    }
  }
  // Joining an array (rather than appending to a string and testing its
  // length) keeps empty items, including a leading one, in place.
  return runs.join('->');
""";

WITH YourTable AS (
  SELECT "Item1->Item2->Item2->Item2->Item3->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item2->Item3->Item3->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item4" AS pathway 
  UNION ALL SELECT "Item1->Item2->Item2->Item3->Item1->Item1->Item1->Item2->Item3->Item3->Item2->Item2->Item2->Item1->Item4" AS pathway
  UNION ALL SELECT "Item1->Item1->Item1" AS pathway
  UNION ALL SELECT "Item1->Item2->Item2" AS pathway
)
SELECT collapse_repeated(pathway) AS shorten_pathway, pathway 
FROM YourTable  

Note: The same JS can easily be “translated” into a JS UDF for Legacy SQL

Using Window Functions (Legacy SQL)

SELECT GROUP_CONCAT_UNQUOTED(IF(repeats=1, item, CONCAT(item, "(x", STRING(repeats), ")")), "->"), pathway
FROM (
  SELECT MIN(pos) AS ord, MIN(item) AS item, COUNT(1) AS repeats, pathway
  FROM (
    SELECT item, pos, IFNULL(grp, 0)AS grp, pathway FROM (
      SELECT item, pos, SUM(change) OVER(PARTITION BY pathway ORDER BY pos ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS grp, pathway
      FROM (
        SELECT item, pos, IF(item=next_item, 0, 1) AS change, pathway FROM (
          SELECT item, pos, LEAD(item) OVER(PARTITION BY pathway ORDER BY pos) AS next_item, pathway
          FROM (
            SELECT item, POSITION(item) AS pos, pathway FROM (
              SELECT SPLIT(pathway, "->") AS item, pathway FROM 
                (SELECT "Item1->Item2->Item2->Item2->Item3->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item2->Item3->Item3->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item4" AS pathway),
                (SELECT "Item1->Item2->Item2->Item3->Item1->Item1->Item1->Item2->Item3->Item3->Item2->Item2->Item2->Item1->Item4" AS pathway),
                (SELECT "Item1->Item1->Item1" AS pathway),
                (SELECT "Item1->Item2->Item2" AS pathway)            
            )
          )
        )
      )
    )
  )
  GROUP BY grp, pathway
  ORDER BY ord
)
GROUP BY pathway