我有一个如下所示的单一文件,它包含源系统把4个不同文件合并成一个文件后的数据。
以 NEWFILE= 开头的行是数据的分隔标记。例如,NEWFILE=STUDENT 行之后、NEWFILE=SUBJECT 行之前的所有数据都属于 STUDENT 文件。问题是,除了这些标题行之外,记录本身没有任何可用于区分所属文件的模式。此外,源系统也无法把该文件拆分成4个文件再提供给我们。
我需要加载这个单个输入文件,并根据记录的标题分隔记录。
我所做的是将数据加载到Hive表中,并尝试使用ROW_NUMBER函数和随机函数。
我想过使用ROW_NUMBER函数来识别每个标题所在的行,然后筛选出相邻标题行之间的记录,但是ROW_NUMBER函数输出的编号与文件中实际的行顺序不一致。因此,属于STUDENT的行可能会被错误地分配给SUBJECT。
我也无法使用随机函数,因为它同样不能给出实际的行号。
文件内容数据如下所示
NEWFILE=STUDENT
100 XYZ
101 ABC
102 DEF
NEWFILE=SUBJECT
1 ENGLISH
2 MATHS
NEWFILE=TEACHERS
110 AAAAAAAA
111 BBBBBBB
222 CCCCCCC
333 DDDDDD
NEWFILE=CLASSES
1 CLASS-1
2 CLASS-2
请告知我如何实现所需的输出。
答案 0 :(得分:0)
-- Staging table for the merged input file: every input line is exposed
-- through the single string column rec.
-- No LOCATION clause is given here, so the table uses Hive's default location.
create external table myfile
(
    rec string
)
row format delimited
    fields terminated by ','
-- Intended to keep any text after a ',' inside rec instead of dropping it,
-- so that a line is never truncated at the field delimiter.
tblproperties
(
    'serialization.last.column.takes.rest' = 'true'
)
;
-- Tag every line of the merged input with the section it belongs to.
--
-- Output columns:
--   rec                 : the raw input line
--   ifn                 : the file the line was read from (Hive virtual column)
--   ifn_newfile_seq     : running number of the 'NEWFILE=' section within that file
--   ifn_newfile_rec_seq : position of the line inside its section, where the
--                         'NEWFILE=' header line itself is number 1
--
-- Line order is recovered from the Hive virtual column
-- block__offset__inside__file rather than from a bare row_number(), because
-- a bare row_number() does not follow the physical order of the file.
-- NOTE(review): this assumes the offset is distinct per line, as for an
-- uncompressed text file. Confirm before using block-compressed input.
select rec
      ,ifn
      ,ifn_newfile_seq
      ,row_number () over
       (
           -- ifn_newfile_seq restarts for every input file, so the file name
           -- must be part of the partition key. Otherwise same-numbered
           -- sections from different files would be numbered together.
           partition by ifn
                       ,ifn_newfile_seq
           order by     boif
       ) as ifn_newfile_rec_seq
from  (select rec
             ,input__file__name           as ifn
             ,block__offset__inside__file as boif
              -- Running count of header lines seen so far in this file.
              -- The case expression yields NULL for data lines, which
              -- count() ignores, so the value only increases on a header.
             ,count(case when rec like 'NEWFILE=%' then 1 end) over
              (
                  partition by input__file__name
                  order by     block__offset__inside__file
              ) as ifn_newfile_seq
       from   myfile
      ) l
;
+------------------+----------------------------------------------+-----------------+---------------------+
| rec | ifn | ifn_newfile_seq | ifn_newfile_rec_seq |
+------------------+----------------------------------------------+-----------------+---------------------+
| NEWFILE=STUDENT | file:/home/cloudera/local_db/myfile/file.txt | 1 | 1 |
+------------------+----------------------------------------------+-----------------+---------------------+
| 100 XYZ | file:/home/cloudera/local_db/myfile/file.txt | 1 | 2 |
+------------------+----------------------------------------------+-----------------+---------------------+
| 101 ABC | file:/home/cloudera/local_db/myfile/file.txt | 1 | 3 |
+------------------+----------------------------------------------+-----------------+---------------------+
| 102 DEF | file:/home/cloudera/local_db/myfile/file.txt | 1 | 4 |
+------------------+----------------------------------------------+-----------------+---------------------+
| NEWFILE=SUBJECT | file:/home/cloudera/local_db/myfile/file.txt | 2 | 1 |
+------------------+----------------------------------------------+-----------------+---------------------+
| 1 ENGLISH | file:/home/cloudera/local_db/myfile/file.txt | 2 | 2 |
+------------------+----------------------------------------------+-----------------+---------------------+
| 2 MATHS | file:/home/cloudera/local_db/myfile/file.txt | 2 | 3 |
+------------------+----------------------------------------------+-----------------+---------------------+
| NEWFILE=TEACHERS | file:/home/cloudera/local_db/myfile/file.txt | 3 | 1 |
+------------------+----------------------------------------------+-----------------+---------------------+
| 110 AAAAAAAA | file:/home/cloudera/local_db/myfile/file.txt | 3 | 2 |
+------------------+----------------------------------------------+-----------------+---------------------+
| 111 BBBBBBB | file:/home/cloudera/local_db/myfile/file.txt | 3 | 3 |
+------------------+----------------------------------------------+-----------------+---------------------+
| 222 CCCCCCC | file:/home/cloudera/local_db/myfile/file.txt | 3 | 4 |
+------------------+----------------------------------------------+-----------------+---------------------+
| 333 DDDDDD | file:/home/cloudera/local_db/myfile/file.txt | 3 | 5 |
+------------------+----------------------------------------------+-----------------+---------------------+
| NEWFILE=CLASSES | file:/home/cloudera/local_db/myfile/file.txt | 4 | 1 |
+------------------+----------------------------------------------+-----------------+---------------------+
| 1 CLASS-1 | file:/home/cloudera/local_db/myfile/file.txt | 4 | 2 |
+------------------+----------------------------------------------+-----------------+---------------------+
| 2 CLASS-2 | file:/home/cloudera/local_db/myfile/file.txt | 4 | 3 |
+------------------+----------------------------------------------+-----------------+---------------------+