如何在所有行中搜索文本,而无需单独指定每个列

时间:2017-03-15 09:34:35

标签: sql hadoop hive apache-spark-sql hiveql

例如:

根据以下表格和数据,找到包含单词“on”的行(不区分大小写)

-- Demo table: an integer id, a date, and three free-text string columns.
create table t
(
    i   int
   ,dt  date
   ,s1  string
   ,s2  string
   ,s3  string
)
;

-- Load the three sample rows into t.
-- Each struct(...) is one row (i, dt, s1, s2, s3); inline() turns the array
-- of structs into a row set.
insert into t
select inline(array(
    struct(1,date '2017-03-15','Now we take our time','so nonchalant','And spend our nights so bon vivant')
   ,struct(2,date '2017-03-16','Quick as a wink','She changed her mind','She stood on the tracks')
   ,struct(3,date '2017-03-17','But I’m talking a Greyhound','On the Hudson River Line','I’m in a New York state of mind')
))
;


-- Display the full contents of the demo table.
select  *
from    t
;

+-----+------------+-----------------------------+--------------------------+------------------------------------+
| t.i |    t.dt    |            t.s1             |           t.s2           |                t.s3                |
+-----+------------+-----------------------------+--------------------------+------------------------------------+
|   1 | 2017-03-15 | Now we take our time        | so nonchalant            | And spend our nights so bon vivant |
|   2 | 2017-03-16 | Quick as a wink             | She changed her mind     | She stood on the tracks            |
|   3 | 2017-03-17 | But I’m talking a Greyhound | On the Hudson River Line | I’m in a New York state of mind    |
+-----+------------+-----------------------------+--------------------------+------------------------------------+            

1 个答案:

答案 0 :(得分:1)

简单(但有限)的解决方案

此解决方案仅适用于只包含“基本(primitive)”类型的表(不含 struct、array、map 等复杂类型)。

该解决方案的问题在于,所有列是在没有分隔符的情况下连接起来的(而且不行,concat_ws(*) 会抛出异常),因此位于列边界两侧的单词会粘连成一个单词,例如:
Greyhound 和 On 变成了 GreyhoundOn

-- Case-insensitive substring search for 'on' across all columns of t.
-- concat(*) joins every column value into one string with no separator;
-- rows whose joined string matches are kept, and each match is wrapped
-- in ==>...<== ($0 is the matched text).
select
    i
   ,regexp_replace(concat(*), '(?i)on', '==>$0<==') as rec
from
    t
where
    concat(*) rlike '(?i)on'
;
+---+-----------------------------------------------------------------------------------------------------------+
| i |                                                    rec                                                    |
+---+-----------------------------------------------------------------------------------------------------------+
| 1 | 12017-03-15Now we take our timeso n==>on<==chalantAnd spend our nights so b==>on<== vivant                |
| 2 | 22017-03-16Quick as a winkShe changed her mindShe stood ==>on<== the tracks                               |
| 3 | 32017-03-17But I’m talking a Greyhound==>On<== the Huds==>on<== River LineI’m in a New York state of mind |
+---+-----------------------------------------------------------------------------------------------------------+

复杂(但敏捷)的解决方案

此解决方案仅适用于只包含“基本(primitive)”类型的表(不含 struct、array、map 等复杂类型)。

我在这里用了一些比较非常规的技巧,但成功地生成了一个包含所有列、并以分隔符隔开的字符串。现在可以按整个单词进行查找了。

关于正则修饰符 (?ix)(i:不区分大小写;x:忽略模式中的空白),参见 http://www.regular-expressions.info/modifiers.html

-- Whole-word, case-insensitive search for "on" across all columns of t.
--
-- Inner query: builds one '|||'-delimited string per row (delim_rec).
--   field(unhex(1),*,unhex(1)) is used as a column counter: it returns the
--   position of the first argument equal to unhex(1), which is the trailing
--   sentinel, i.e. <number of columns> + 1 (assumes no column value equals
--   unhex(1) -- TODO confirm for your data). Subtracting 2 gives the number
--   of '|||%s' placeholders needed after the leading '%s'.
--
-- Outer query: highlights each match inside delim_rec.
--   It reads delim_rec directly. Using concat(*) here would concatenate
--   i and delim_rec of the subquery, prepending the id a second time
--   ('22|||2017-03-16|||...' instead of '2|||2017-03-16|||...').
--   (?ix): i = case-insensitive, x = whitespace in the pattern is ignored.
select  i
       ,regexp_replace(delim_rec,'(?ix)\\b on \\b','==>$0<==') as delim_rec

from   (select  i
               ,printf(concat('%s',repeat('|||%s',field(unhex(1),*,unhex(1))-2)),*)   as delim_rec

        from    t
        ) delimited

where  delim_rec rlike '(?ix)\\b on \\b'
;
+---+------------------------------------------------------------------------------------------------------------------+
| i |                                                    delim_rec                                                     |
+---+------------------------------------------------------------------------------------------------------------------+
| 2 | 22|||2017-03-16|||Quick as a wink|||She changed her mind|||She stood ==>on<== the tracks                         |
| 3 | 33|||2017-03-17|||But I’m talking a Greyhound|||==>On<== the Hudson River Line|||I’m in a New York state of mind |
+---+------------------------------------------------------------------------------------------------------------------+

使用其他外部表

-- External table with a single string column over the files at the given
-- location, so each stored line of t can be read as one raw record.
-- NOTE(review): '0' is presumably interpreted by Hive's delimited SerDe as
-- byte value 0 (NUL) rather than the character '0', so a line is never
-- split into fields -- confirm against your Hive version.
-- NOTE(review): the location assumes t is stored in the default warehouse
-- directory -- verify with `describe formatted t`.
create external table t_ext
(
    rec string
)
row format delimited
    fields terminated by '0'
location '/user/hive/warehouse/t'
;
-- Whole-word, case-insensitive search for "on" on the raw records of t_ext.
--   i   : first \x01-separated field of the record, cast to int.
--   rec : the record with each match wrapped in ==>...<==, then every
--         \x01 separator replaced by '|||' for display.
-- (?ix): i = case-insensitive, x = whitespace in the pattern is ignored.
select
    cast(split(rec, '\\x01')[0] as int) as i
   ,regexp_replace(
        regexp_replace(rec, '(?ix)\\b on \\b', '==>$0<==')
       ,'\\x01'
       ,'|||'
    ) as rec
from
    t_ext
where
    rec rlike '(?ix)\\b on \\b'
;
+---+-----------------------------------------------------------------------------------------------------------------+
| i |                                                       rec                                                       |
+---+-----------------------------------------------------------------------------------------------------------------+
| 2 | 2|||2017-03-16|||Quick as a wink|||She changed her mind|||She stood ==>on<== the tracks                         |
| 3 | 3|||2017-03-17|||But I’m talking a Greyhound|||==>On<== the Hudson River Line|||I’m in a New York state of mind |
+---+-----------------------------------------------------------------------------------------------------------------+