Hive Regex可变长度文件

时间:2018-04-19 03:17:27

标签: regex hive

我有一个记录长度可变的文件,需要把它读入并加载到一张临时(staging)表中。每个字段在记录中的起始位置都是固定的,但由于最后一个字段不是必填的,记录的结尾位置可能不同。

记录长度最短为805个字符,最长为822个字符。使用下面的正则表达式时,如果加载长度为805个字符的记录,整条记录的所有字段都会变成 NULL。我已经尝试把最后几个字段改为可变长度,但仍然无法加载所有记录。

 -- Re-create the staging table that reads the fixed-position monthly file via RegexSerDe.
-- Each column is filled from the capture group of input.regex at the same ordinal position.
-- The first 31 groups consume 804 characters and the final (.{9}) requires 9 more, so a row
-- shorter than 813 characters cannot match the pattern as written.
DROP TABLE dz_1318_disc.asc_monthly_stg;
CREATE EXTERNAL TABLE dz_1318_disc.asc_monthly_stg (
    CMAFFID    STRING COMMENT '/*@type=varchar(12)*/',
    ADTMBRCHR  STRING COMMENT '/*@type=varchar(97)*/',
    SITENAME   STRING COMMENT '/*@type=varchar(90)*/',
    SRVADD1    STRING COMMENT '/*@type=varchar(30)*/',
    SRVADD2    STRING COMMENT '/*@type=varchar(30)*/',
    SRVADD3    STRING COMMENT '/*@type=varchar(30)*/',
    SRVCITY    STRING COMMENT '/*@type=varchar(30)*/',
    SRVST      STRING COMMENT '/*@type=varchar(3)*/',
    SRVZIP5    STRING COMMENT '/*@type=varchar(5)*/',
    SRVZIP4    STRING COMMENT '/*@type=varchar(12)*/',
    BILFRST    STRING COMMENT '/*@type=varchar(50)*/',
    BILLAST    STRING COMMENT '/*@type=varchar(90)*/',
    BILADD1    STRING COMMENT '/*@type=varchar(30)*/',
    BILADD2    STRING COMMENT '/*@type=varchar(30)*/',
    BILADD3    STRING COMMENT '/*@type=varchar(30)*/',
    BILCITY    STRING COMMENT '/*@type=varchar(30)*/',
    BILST      STRING COMMENT '/*@type=varchar(3)*/',
    BILZIP5    STRING COMMENT '/*@type=varchar(5)*/',
    BILZIP4    STRING COMMENT '/*@type=varchar(12)*/',
    BILPHON    STRING COMMENT '/*@type=varchar(10)*/',
    SRVPHON    STRING COMMENT '/*@type=varchar(10)*/',
    NEWCUSTNUM STRING COMMENT '/*@type=varchar(38)*/',
    ACTDATE    STRING COMMENT '/*@type=varchar(8)*/',
    DISDATE    STRING COMMENT '/*@type=varchar(8)*/',
    ACTSTAT    STRING COMMENT '/*@type=varchar(70)*/',
    RECURR     STRING COMMENT '/*@type=varchar(10)*/',
    INSTALL    STRING COMMENT '/*@type=varchar(10)*/',
    LEADSRC    STRING COMMENT '/*@type=varchar(3)*/',
    CAMKGRUP   STRING COMMENT '/*@type=varchar(3)*/',
    DISCCODE   STRING COMMENT '/*@type=varchar(6)*/',
    DISTRIC    STRING COMMENT '/*@type=varchar(9)*/ ',
    COMBINE    STRING COMMENT '/*@type=varchar (18)*)/'
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    "input.regex" = "(.{12})(.{97})(.{90})(.{30})(.{30})(.{30})(.{30})(.{3})(.{5})(.{12})(.{50})(.{90})(.{30})(.{30})(.{30})(.{30})(.{3})(.{5})(.{12})(.{10})(.{10})(.{38})(.{8})(.{8})(.{70})(.{10})(.{10})(.{3})(.{3})(.{6})(.{9})(.{9}).*"
)
STORED AS TEXTFILE
LOCATION '/dz/dz_1318/disc.db/kpp_monthly/';

文件布局如下:

----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2--
xxxx        012345678                                                                                        JOHNSON, JOHN                                                                             1234 WEST STREET              APT B                         ATTN JOHN                     SOME CITY                     TX 78216             JOHN                                             JOHNSON                                                                                   1234 WEST STREET              APT B                         ATTN JOHN                     SOME CITY                     TX 78216            1234567890123456789012345678                              20151024201411101                                                                     479.87999926.7812347899   CMV   450      2        123456789
xxxx        012345678                                                                                        JOHNSON, JOHN                                                                             1234 WEST STREET              APT B                         ATTN JOHN                     SOME CITY                     TX 78216             JOHN                                             JOHNSON                                                                                   1234 WEST STREET              APT B                         ATTN JOHN                     SOME CITY                     TX 78216            1234567890123456789012345678                              20151024201411101                                                                     479.87999926.7812347899   CMV   450      25       123456789
xxxx        012345678                                                                                        JOHNSON, JOHN                                                                             1234 WEST STREET              APT B                         ATTN JOHN                     SOME CITY                     TX 78216             JOHN                                             JOHNSON                                                                                   1234 WEST STREET              APT B                         ATTN JOHN                     SOME CITY                     TX 78216            1234567890123456789012345678                              20151024201411101                                                                     479.87999926.7812347899   CMV   450      258      123456789
xxxx        012345678                                                                                        JOHNSON, JOHN                                                                             1234 WEST STREET              APT B                         ATTN JOHN                     SOME CITY                     TX 78216             JOHN                                             JOHNSON                                                                                   1234 WEST STREET              APT B                         ATTN JOHN                     SOME CITY                     TX 78216            1234567890123456789012345678                              20151024201411101                                                                     479.87999926.7812347899   CMV   450      2
xxxx        012345678                                                                                        JOHNSON, JOHN                                                                             1234 WEST STREET              APT B                         ATTN JOHN                     SOME CITY                     TX 78216             JOHN                                             JOHNSON                                                                                   1234 WEST STREET              APT B                         ATTN JOHN                     SOME CITY                     TX 78216            1234567890123456789012345678                              20151024201411101                                                                     479.87999926.7812347899   CMV   450      25
xxxx        012345678                                                                                        JOHNSON, JOHN                                                                             1234 WEST STREET              APT B                         ATTN JOHN                     SOME CITY                     TX 78216             JOHN                                             JOHNSON                                                                                   1234 WEST STREET              APT B                         ATTN JOHN                     SOME CITY                     TX 78216            1234567890123456789012345678                              20151024201411101                                                                     479.87999926.7812347899   CMV   450      258

1 个答案:

答案 0 :(得分:1)

根据您的示例数据,您必须为最后一列指定非固定长度 (.{0,9});此外,如果您想把最后一个元素(123456789)作为独立的列来查看,还需要再增加一列。

-- Staging table over the fixed-position monthly file, parsed by RegexSerDe.
-- Each column is filled from the capture group of input.regex at the same ordinal position.
-- The first 31 groups consume 804 fixed characters; COMBINE and YOUREXTRACOLUMN then capture
-- between 0 and 9 characters each, so rows that end early still match the pattern.
-- No LOCATION clause is specified in this statement.
CREATE EXTERNAL TABLE dz_1318_disc.asc_monthly_stg (
    CMAFFID         STRING COMMENT '/*@type=varchar(12)*/',
    ADTMBRCHR       STRING COMMENT '/*@type=varchar(97)*/',
    SITENAME        STRING COMMENT '/*@type=varchar(90)*/',
    SRVADD1         STRING COMMENT '/*@type=varchar(30)*/',
    SRVADD2         STRING COMMENT '/*@type=varchar(30)*/',
    SRVADD3         STRING COMMENT '/*@type=varchar(30)*/',
    SRVCITY         STRING COMMENT '/*@type=varchar(30)*/',
    SRVST           STRING COMMENT '/*@type=varchar(3)*/',
    SRVZIP5         STRING COMMENT '/*@type=varchar(5)*/',
    SRVZIP4         STRING COMMENT '/*@type=varchar(12)*/',
    BILFRST         STRING COMMENT '/*@type=varchar(50)*/',
    BILLAST         STRING COMMENT '/*@type=varchar(90)*/',
    BILADD1         STRING COMMENT '/*@type=varchar(30)*/',
    BILADD2         STRING COMMENT '/*@type=varchar(30)*/',
    BILADD3         STRING COMMENT '/*@type=varchar(30)*/',
    BILCITY         STRING COMMENT '/*@type=varchar(30)*/',
    BILST           STRING COMMENT '/*@type=varchar(3)*/',
    BILZIP5         STRING COMMENT '/*@type=varchar(5)*/',
    BILZIP4         STRING COMMENT '/*@type=varchar(12)*/',
    BILPHON         STRING COMMENT '/*@type=varchar(10)*/',
    SRVPHON         STRING COMMENT '/*@type=varchar(10)*/',
    NEWCUSTNUM      STRING COMMENT '/*@type=varchar(38)*/',
    ACTDATE         STRING COMMENT '/*@type=varchar(8)*/',
    DISDATE         STRING COMMENT '/*@type=varchar(8)*/',
    ACTSTAT         STRING COMMENT '/*@type=varchar(70)*/',
    RECURR          STRING COMMENT '/*@type=varchar(10)*/',
    INSTALL         STRING COMMENT '/*@type=varchar(10)*/',
    LEADSRC         STRING COMMENT '/*@type=varchar(3)*/',
    CAMKGRUP        STRING COMMENT '/*@type=varchar(3)*/',
    DISCCODE        STRING COMMENT '/*@type=varchar(6)*/',
    DISTRIC         STRING COMMENT '/*@type=varchar(9)*/ ',
    COMBINE         STRING COMMENT '/*@type=varchar (18)*)/',
    YOUREXTRACOLUMN STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    "input.regex" = "(.{12})(.{97})(.{90})(.{30})(.{30})(.{30})(.{30})(.{3})(.{5})(.{12})(.{50})(.{90})(.{30})(.{30})(.{30})(.{30})(.{3})(.{5})(.{12})(.{10})(.{10})(.{38})(.{8})(.{8})(.{70})(.{10})(.{10})(.{3})(.{3})(.{6})(.{9})(.{0,9})(.{0,9}).*"
)
STORED AS TEXTFILE;

测试它

-- Show the two trailing columns and the number of characters captured into COMBINE.
SELECT
    combine,
    yourextracolumn,
    length(combine)
FROM dz_1318_disc.asc_monthly_stg

输出

2        ,123456789,9
25       ,123456789,9
258      ,123456789,9
3,,1
35,,2
358,,3