在 Impala 中查询结构体数组(ARRAY<STRUCT>)

时间:2019-06-26 11:47:31

标签: hive impala complextype

我有一个 Hive 表,其中包含复杂类型,即结构体数组(每个结构体内又包含一个字符串数组)。我试图在 Impala 中查询它,并把数组中的元素展开成列,但这样做时返回的是数组之间的笛卡尔积(交叉联接)结果。您可以参考下面的代码,在自己的环境中创建示例。

-- I adapted from this example https://www.cloudera.com/documentation/enterprise/5-5-x/topics/impala_complex_types.html#complex_types_ex_hive_etl

-- impala side

-- Flat source table: one row per (continent, country, city).
CREATE TABLE flat_struct_array (
    continent STRING,
    country   STRING,
    city      STRING
);

-- Sample data. The column list is spelled out so the statement does not
-- silently depend on the column order of the DDL above.
INSERT INTO flat_struct_array (continent, country, city) VALUES
    ('North America', 'Canada', 'Toronto'),
    ('North America', 'Canada', 'Vancouver'),
    ('North America', 'Canada', "St. John\'s"),
    ('North America', 'Canada', 'Saint John'),
    ('North America', 'Canada', 'Montreal'),
    ('North America', 'Canada', 'Halifax'),
    ('North America', 'Canada', 'Winnipeg'),
    ('North America', 'Canada', 'Calgary'),
    ('North America', 'Canada', 'Saskatoon'),
    ('North America', 'Canada', 'Ottawa'),
    ('North America', 'Canada', 'Yellowknife'),
    ('Europe', 'France', 'Paris'),
    ('Europe', 'France', 'Nice'),
    ('Europe', 'France', 'Marseilles'),
    ('Europe', 'France', 'Cannes'),
    ('Europe', 'Greece', 'Athens'),
    ('Europe', 'Greece', 'Piraeus'),
    ('Europe', 'Greece', 'Hania'),
    ('Europe', 'Greece', 'Heraklion'),
    ('Europe', 'Greece', 'Rethymnon'),
    ('Europe', 'Greece', 'Fira');

-- Nested target table: one row per continent. `country` holds an array of
-- structs, each with a country name and the array of that country's cities.
CREATE TABLE complex_struct_array2 (
    continent STRING,
    country   ARRAY<STRUCT<name: STRING, city: ARRAY<STRING>>>
)
STORED AS PARQUET;

-- hive side

-- Build the nested rows from the flat table.
-- Inner query: one struct per (continent, country) carrying that country's cities.
-- Outer query: gather those structs into one array per continent.
INSERT INTO complex_struct_array2
SELECT
    country_rows.continent,
    collect_list(country_rows.country_struct)
FROM (
    SELECT
        continent,
        named_struct('name', country, 'city', collect_list(city)) AS country_struct
    FROM flat_struct_array
    GROUP BY continent, country
) country_rows
GROUP BY country_rows.continent;

-- Each statement is now terminated with ';' so the INSERT and this SELECT
-- can be run together as a script.
SELECT
    continent,
    country
FROM complex_struct_array2;
-- you'll see table with 2 records, and in Europe it has 2 countries
-- France has 4 cities and Greece has 6 cities

--back to impala side

-- Flatten continent -> country -> city.
-- Each nested collection is joined through the alias of its parent:
-- `t.country t2` unnests the countries of the current row, and `t2.city t3`
-- unnests the cities of the current country, so every city stays paired
-- with its own country (France -> 4 rows, Greece -> 6 rows).
-- NOTE(review): if the rows were just written from Hive, Impala may need
-- `REFRESH complex_struct_array2` before this query sees them; confirm.
SELECT
    t.continent,
    t2.name,
    t3.item
FROM complex_struct_array2 t, t.country t2, t2.city t3;

-- If the last join is written as `t.country.city t3` instead of `t2.city t3`,
-- it is not correlated with t2, and you'll see the result that France
-- contains 10 records (with Greece cities) and vice versa:
continent     |name   |item        |
--------------|-------|------------|
Europe        |France |Paris       |
Europe        |France |Nice        |
Europe        |France |Marseilles  |
Europe        |France |Cannes      |
Europe        |France |Athens      |<-- should not be shown
Europe        |France |Piraeus     |<-- should not be shown
Europe        |France |Hania       |<-- should not be shown
Europe        |France |Heraklion   |<-- should not be shown
Europe        |France |Rethymnon   |<-- should not be shown
Europe        |France |Fira        |<-- should not be shown
...

我期望的结果是:法国只对应 4 条记录,并且只显示在 Hive 端看到的那些法国城市。

我在Hive上尝试过此方法,它返回了正确的结果

-- Hive equivalent of the flattening query, written as two chained lateral
-- views: inline() expands the array of country structs into name/city
-- columns, then explode() expands each country's own city array.
SELECT
    a.continent,
    countryinfo.name AS country_name,
    cityinfo.*
FROM complex_struct_array2 a
LATERAL VIEW inline(a.country) countryinfo
LATERAL VIEW explode(countryinfo.city) cityinfo

continent     |country_name |col         |
--------------|-------------|------------|
Europe        |France       |Paris       |
Europe        |France       |Nice        |
Europe        |France       |Marseilles  |
Europe        |France       |Cannes      |

0 个答案:

没有答案