我有一个 Hive 表,其中包含复杂类型,即字符串数组。我尝试在 Impala 中查询它,并把数组中的元素展开成列,但查询返回的却是各数组之间的笛卡尔积(交叉连接)结果。您可以查看下面的代码,并在自己的环境中创建示例来复现该问题。
-- I adapted from this example https://www.cloudera.com/documentation/enterprise/5-5-x/topics/impala_complex_types.html#complex_types_ex_hive_etl
-- impala side
-- Flat source table: one row per (continent, country, city).
CREATE TABLE flat_struct_array (
    continent STRING,
    country STRING,
    city STRING
);

-- Sample data: 11 Canadian, 4 French and 6 Greek cities.
-- The explicit column list keeps the load independent of the table's column order.
INSERT INTO flat_struct_array (continent, country, city) VALUES
    ('North America', 'Canada', 'Toronto'),
    ('North America', 'Canada', 'Vancouver'),
    ('North America', 'Canada', "St. John\'s"),
    ('North America', 'Canada', 'Saint John'),
    ('North America', 'Canada', 'Montreal'),
    ('North America', 'Canada', 'Halifax'),
    ('North America', 'Canada', 'Winnipeg'),
    ('North America', 'Canada', 'Calgary'),
    ('North America', 'Canada', 'Saskatoon'),
    ('North America', 'Canada', 'Ottawa'),
    ('North America', 'Canada', 'Yellowknife'),
    ('Europe', 'France', 'Paris'),
    ('Europe', 'France', 'Nice'),
    ('Europe', 'France', 'Marseilles'),
    ('Europe', 'France', 'Cannes'),
    ('Europe', 'Greece', 'Athens'),
    ('Europe', 'Greece', 'Piraeus'),
    ('Europe', 'Greece', 'Hania'),
    ('Europe', 'Greece', 'Heraklion'),
    ('Europe', 'Greece', 'Rethymnon'),
    ('Europe', 'Greece', 'Fira');
-- Nested table: `country` is an array of structs, and each struct carries a
-- country name together with an array of city names.
CREATE TABLE complex_struct_array2 (
    continent STRING,
    country ARRAY < STRUCT < name: STRING, city: ARRAY < STRING > > >
)
STORED AS PARQUET;
-- hive side
-- Build the nested rows in two aggregation steps:
--   inner query: one struct per (continent, country) holding that country's cities;
--   outer query: collect those structs into one array per continent.
INSERT INTO complex_struct_array2
SELECT
    continent,
    collect_list(struct1)
FROM (
    SELECT
        continent,
        named_struct('name', country, 'city', collect_list(city)) AS struct1
    FROM flat_struct_array
    GROUP BY continent, country
) a
GROUP BY continent;

-- Inspect the nested rows that were just written.
SELECT
    continent,
    country
FROM complex_struct_array2;
-- you'll see table with 2 records, and in Europe it has 2 countries
-- France has 4 cities and Greece has 6 cities
--back to impala side
-- Flatten the nested data: one output row per (continent, country, city).
-- Each nested array has to be reached through the alias of its parent join:
-- t2 is the current country struct, so t2.city yields only that country's cities.
SELECT
    t.continent,
    t2.name,
    t3.item
FROM complex_struct_array2 t, t.country t2, t2.city t3;
-- The original form, `t.country.city t3`, is not tied to the current t2 row; it
-- returned the cross-product shown below (France paired with the Greek cities, and vice versa).
continent |name |item |
--------------|-------|------------|
Europe |France |Paris |
Europe |France |Nice |
Europe |France |Marseilles |
Europe |France |Cannes |
Europe |France |Athens |<-- should not be shown
Europe |France |Piraeus |<-- should not be shown
Europe |France |Hania |<-- should not be shown
Europe |France |Heraklion |<-- should not be shown
Europe |France |Rethymnon |<-- should not be shown
Europe |France |Fira |<-- should not be shown
...
我期望的结果是:法国只对应 4 条记录,并且只显示我们在 Hive 端看到的那些法国城市。
我在 Hive 上尝试了下面的查询,它返回了正确的结果:
-- Step 1: expand the array of country structs into one row per country,
-- keeping each country's city array alongside its name.
WITH country_rows AS (
    SELECT
        src.continent,
        countryinfo.name AS country_name,
        countryinfo.city AS cityarray
    FROM complex_struct_array2 src
    LATERAL VIEW inline(src.country) countryinfo
)
-- Step 2: expand each country's own city array into one row per city.
SELECT
    cr.continent,
    cr.country_name,
    cityinfo.*
FROM country_rows cr
LATERAL VIEW explode(cr.cityarray) cityinfo
continent |country_name |col |
--------------|-------------|------------|
Europe |France |Paris |
Europe |France |Nice |
Europe |France |Marseilles |
Europe |France |Cannes |