我有一个记录长度可变的文件,需要把它读入并加载到一张暂存表(staging table)中。每个字段在文件中的起始位置都是固定的,但由于最后一个字段不是必填项,记录的结尾位置会有所不同。
记录长度最短为 805 个字符,最长为 822 个字符。使用下面的正则表达式时,如果加载长度为 805 个字符的记录,整条记录的所有字段都会变成 NULL。我已经尝试把最后几个字段改成可变长度,但仍然无法加载所有记录。
-- Staging table over the fixed-width monthly extract.
-- RegexSerDe assigns the n-th capture group of "input.regex" to the n-th
-- column, so the 32 groups below line up with the 32 columns in order.
-- NOTE: every group here has a fixed width and the widths add up to 813
-- characters, so a line shorter than that cannot match the pattern.
DROP TABLE dz_1318_disc.asc_monthly_stg;
CREATE EXTERNAL TABLE dz_1318_disc.asc_monthly_stg (
    CMAFFID     STRING COMMENT '/*@type=varchar(12)*/',
    ADTMBRCHR   STRING COMMENT '/*@type=varchar(97)*/',
    SITENAME    STRING COMMENT '/*@type=varchar(90)*/',
    SRVADD1     STRING COMMENT '/*@type=varchar(30)*/',
    SRVADD2     STRING COMMENT '/*@type=varchar(30)*/',
    SRVADD3     STRING COMMENT '/*@type=varchar(30)*/',
    SRVCITY     STRING COMMENT '/*@type=varchar(30)*/',
    SRVST       STRING COMMENT '/*@type=varchar(3)*/',
    SRVZIP5     STRING COMMENT '/*@type=varchar(5)*/',
    SRVZIP4     STRING COMMENT '/*@type=varchar(12)*/',
    BILFRST     STRING COMMENT '/*@type=varchar(50)*/',
    BILLAST     STRING COMMENT '/*@type=varchar(90)*/',
    BILADD1     STRING COMMENT '/*@type=varchar(30)*/',
    BILADD2     STRING COMMENT '/*@type=varchar(30)*/',
    BILADD3     STRING COMMENT '/*@type=varchar(30)*/',
    BILCITY     STRING COMMENT '/*@type=varchar(30)*/',
    BILST       STRING COMMENT '/*@type=varchar(3)*/',
    BILZIP5     STRING COMMENT '/*@type=varchar(5)*/',
    BILZIP4     STRING COMMENT '/*@type=varchar(12)*/',
    BILPHON     STRING COMMENT '/*@type=varchar(10)*/',
    SRVPHON     STRING COMMENT '/*@type=varchar(10)*/',
    NEWCUSTNUM  STRING COMMENT '/*@type=varchar(38)*/',
    ACTDATE     STRING COMMENT '/*@type=varchar(8)*/',
    DISDATE     STRING COMMENT '/*@type=varchar(8)*/',
    ACTSTAT     STRING COMMENT '/*@type=varchar(70)*/',
    RECURR      STRING COMMENT '/*@type=varchar(10)*/',
    INSTALL     STRING COMMENT '/*@type=varchar(10)*/',
    LEADSRC     STRING COMMENT '/*@type=varchar(3)*/',
    CAMKGRUP    STRING COMMENT '/*@type=varchar(3)*/',
    DISCCODE    STRING COMMENT '/*@type=varchar(6)*/',
    DISTRIC     STRING COMMENT '/*@type=varchar(9)*/ ',
    COMBINE     STRING COMMENT '/*@type=varchar (18)*)/'
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    "input.regex" = "(.{12})(.{97})(.{90})(.{30})(.{30})(.{30})(.{30})(.{3})(.{5})(.{12})(.{50})(.{90})(.{30})(.{30})(.{30})(.{30})(.{3})(.{5})(.{12})(.{10})(.{10})(.{38})(.{8})(.{8})(.{70})(.{10})(.{10})(.{3})(.{3})(.{6})(.{9})(.{9}).*"
)
STORED AS TEXTFILE
LOCATION '/dz/dz_1318/disc.db/kpp_monthly/';
文件布局如下:
----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2--
xxxx 012345678 JOHNSON, JOHN 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 JOHN JOHNSON 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 1234567890123456789012345678 20151024201411101 479.87999926.7812347899 CMV 450 2 123456789
xxxx 012345678 JOHNSON, JOHN 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 JOHN JOHNSON 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 1234567890123456789012345678 20151024201411101 479.87999926.7812347899 CMV 450 25 123456789
xxxx 012345678 JOHNSON, JOHN 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 JOHN JOHNSON 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 1234567890123456789012345678 20151024201411101 479.87999926.7812347899 CMV 450 258 123456789
xxxx 012345678 JOHNSON, JOHN 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 JOHN JOHNSON 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 1234567890123456789012345678 20151024201411101 479.87999926.7812347899 CMV 450 2
xxxx 012345678 JOHNSON, JOHN 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 JOHN JOHNSON 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 1234567890123456789012345678 20151024201411101 479.87999926.7812347899 CMV 450 25
xxxx 012345678 JOHNSON, JOHN 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 JOHN JOHNSON 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 1234567890123456789012345678 20151024201411101 479.87999926.7812347899 CMV 450 258
答案 0 :(得分:1)
根据您的示例数据,您必须为最后一列指定非固定长度,即 (.{0,9});
此外,如果您还想取到最后一个元素(123456789),就需要再增加一个独立的列来接收它。
-- Staging table over the fixed-width monthly extract, parsed with RegexSerDe.
-- RegexSerDe assigns the n-th capture group of "input.regex" to the n-th
-- column: 33 groups below, 33 columns.
-- The first 31 groups are fixed-width and add up to 804 characters. The
-- optional tail is captured by two variable-width groups, (.{0,9})(.{0,9}),
-- so records from 805 up to 822 characters all match: COMBINE receives at
-- most the first 9 tail characters and YOUREXTRACOLUMN whatever follows
-- (empty when the record ends early).
-- LOCATION points at the same data directory as the original table
-- definition; without it the external table would be created over an empty
-- default path.
CREATE EXTERNAL TABLE dz_1318_disc.asc_monthly_stg
(
    CMAFFID String COMMENT '/*@type=varchar(12)*/',
    ADTMBRCHR String COMMENT '/*@type=varchar(97)*/',
    SITENAME String COMMENT '/*@type=varchar(90)*/',
    SRVADD1 String COMMENT '/*@type=varchar(30)*/',
    SRVADD2 String COMMENT '/*@type=varchar(30)*/',
    SRVADD3 String COMMENT '/*@type=varchar(30)*/',
    SRVCITY String COMMENT '/*@type=varchar(30)*/',
    SRVST String COMMENT '/*@type=varchar(3)*/',
    SRVZIP5 String COMMENT '/*@type=varchar(5)*/',
    SRVZIP4 String COMMENT '/*@type=varchar(12)*/',
    BILFRST String COMMENT '/*@type=varchar(50)*/',
    BILLAST String COMMENT '/*@type=varchar(90)*/',
    BILADD1 String COMMENT '/*@type=varchar(30)*/',
    BILADD2 String COMMENT '/*@type=varchar(30)*/',
    BILADD3 String COMMENT '/*@type=varchar(30)*/',
    BILCITY String COMMENT '/*@type=varchar(30)*/',
    BILST String COMMENT '/*@type=varchar(3)*/',
    BILZIP5 String COMMENT '/*@type=varchar(5)*/',
    BILZIP4 String COMMENT '/*@type=varchar(12)*/',
    BILPHON String COMMENT '/*@type=varchar(10)*/',
    SRVPHON String COMMENT '/*@type=varchar(10)*/',
    NEWCUSTNUM String COMMENT '/*@type=varchar(38)*/',
    ACTDATE String COMMENT '/*@type=varchar(8)*/',
    DISDATE String COMMENT '/*@type=varchar(8)*/',
    ACTSTAT String COMMENT '/*@type=varchar(70)*/',
    RECURR String COMMENT '/*@type=varchar(10)*/',
    INSTALL String COMMENT '/*@type=varchar(10)*/',
    LEADSRC String COMMENT '/*@type=varchar(3)*/',
    CAMKGRUP String COMMENT '/*@type=varchar(3)*/',
    DISCCODE String COMMENT '/*@type=varchar(6)*/',
    DISTRIC String COMMENT '/*@type=varchar(9)*/',
    COMBINE String COMMENT '/*@type=varchar(18)*/',
    YOUREXTRACOLUMN String
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES
(
    "input.regex" = "(.{12})(.{97})(.{90})(.{30})(.{30})(.{30})(.{30})(.{3})(.{5})(.{12})(.{50})(.{90})(.{30})(.{30})(.{30})(.{30})(.{3})(.{5})(.{12})(.{10})(.{10})(.{38})(.{8})(.{8})(.{70})(.{10})(.{10})(.{3})(.{3})(.{6})(.{9})(.{0,9})(.{0,9}).*"
)
STORED AS TEXTFILE
LOCATION '/dz/dz_1318/disc.db/kpp_monthly/'
;
测试它
-- Show both optional tail columns and how many characters COMBINE captured.
SELECT
    combine,
    yourextracolumn,
    LENGTH(combine)
FROM dz_1318_disc.asc_monthly_stg
输出
2 ,123456789,9
25 ,123456789,9
258 ,123456789,9
3,,1
35,,2
358,,3