我有如下数据-
Year,winning_country
2001,IND
2002,IND
2003,IND
2004,AUS
2005,AUS
2006,SA
2007,SA
2008,SA
2009,IND
2010,IND
2011,IND
2012,IND
2013,AUS
2014,AUS
2015,SA
2016,NZ
2017,SL
2018,IND
这里的问题是找出每个国家最长的连胜纪录,期望的产出将如下所示-
Country,no_of_wins
IND,4
AUS,2
SA,3
SL,1
NZ,1
有人可以在这里帮忙吗
答案 0 :(得分:1)
这是一个空白问题,但是最简单的方法是从年份中减去序列。因此,要获取所有序列:
select country, count(*) as streak,
min(year) as from_year, max(year) as to_year
from (select year, country,
row_number() over (partition by country order by year) as seqnum
from t
) t
group by country, (year - seqnum);
要获得每个国家/地区最长的时间,请重新汇总或使用窗口函数:
select country, streak
from (select country, count(*) as streak,
min(year) as from_year, max(year) as to_year,
row_number() over (partition by country order by count(*) desc) as seqnum_2
from (select year, country,
row_number() over (partition by country order by year) as seqnum
from t
) t
group by country, (year - seqnum)
) cy
where seqnum_2 = 1;
我更喜欢使用row_number()
获得最长的连胜纪录,因为它还可以让您获得出现的年份。
答案 1 :(得分:0)
如果Redshift支持分析功能,则查询如下。
with t1 as
(
select 2001 as year,'IND' as cntry from dual union
select 2002,'IND' from dual union
select 2003,'IND' from dual union
select 2004,'AUS' from dual union
select 2005,'AUS' from dual union
select 2006,'SA' from dual union
select 2007,'SA' from dual union
select 2008,'SA' from dual union
select 2009,'IND' from dual union
select 2010,'IND' from dual union
select 2011,'IND' from dual union
select 2012,'IND' from dual union
select 2013,'AUS' from dual union
select 2014,'AUS' from dual union
select 2015,'SA' from dual union
select 2016,'NZ' from dual union
select 2017,'SL' from dual union
select 2018,'IND' from dual) ,
t2 as (select year, cntry, year - row_number() over (partition by cntry order by year) as grpBy from t1 order by cntry),
t3 as (select cntry, count(grpBy) as consWins from t2 group by cntry, grpBy),
res as (select cntry, consWins, row_number() over (partition by cntry order by consWins desc) as rnk from t3)
select cntry, consWins from res where rnk=1;
希望这会有所帮助。
答案 2 :(得分:0)
这是一种利用Redshift Python UDF的解决方案
可能有更简单的方法来实现相同目的,但这是如何创建简单UDF的一个很好的例子。
create table temp_c (competition_year int ,winning_country varchar(4));
insert into temp_c (competition_year, winning_country)
values
(2001,'IND'),
(2002,'IND'),
(2003,'IND'),
(2004,'AUS'),
(2005,'AUS'),
(2006,'SA'),
(2007,'SA'),
(2008,'SA'),
(2009,'IND'),
(2010,'IND'),
(2011,'IND'),
(2012,'IND'),
(2013,'AUS'),
(2014,'AUS'),
(2015,'SA'),
(2016,'NZ'),
(2017,'SL'),
(2018,'IND')
;
create or replace function find_longest_streak(InputStr varChar)
returns integer
stable
as $$
MaxStreak=0
ThisStreak=0
ThisYearStr=''
LastYear=0
for ThisYearStr in InputStr.split(','):
if int(ThisYearStr) == LastYear + 1:
ThisStreak+=1
else:
if ThisStreak > MaxStreak:
MaxStreak=ThisStreak
ThisStreak=1
LastYear=int(ThisYearStr)
return max(MaxStreak,1)
$$ language plpythonu;
select winning_country,
find_longest_streak(listagg(competition_year,',') within group (order by competition_year))
from temp_c
group by winning_country
order by 2 desc
;
答案 3 :(得分:0)
看起来像是一个空白与岛屿的问题。
下面的SQL根据2 row_number计算一些排名。
那只是分组的问题。
SELECT q2.Country, MAX(q2.no_of_wins) AS no_of_wins
FROM
(
SELECT q1.winning_country as Country,
COUNT(*) AS no_of_wins
FROM
(
SELECT t.Year, t.winning_country,
(ROW_NUMBER() OVER (ORDER BY t.Year ASC) -
ROW_NUMBER() OVER (PARTITION BY t.winning_country ORDER BY t.Year)) AS rnk
FROM yourtable t
) q1
GROUP BY q1.winning_country, q1.rnk
) q2
GROUP BY q2.Country
ORDER BY MAX(q2.no_of_wins) DESC
答案 4 :(得分:0)
怎么样……
SELECT
winning_country,
COUNT(*)
GROUP BY winning_country
HAVING MAX(year) - MIN(year) = COUNT(year) - 1
这假定没有重复的条目。
答案 5 :(得分:0)
创建会话抽象就可以了:
WITH winning_changes AS (
SELECT *,
CASE WHEN LAG(winning_country) OVER (ORDER BY year) <> winning_country THEN 1 ELSE 0 END AS same_winner
FROM winners
),
sequences AS (
SELECT *,
SUM(same_winner) OVER (ORDER BY year) AS winning_session
FROM winning_changes
),
streaks AS (
SELECT winning_country AS country,
winning_session,
COUNT(*) streak
FROM sequences
GROUP BY 1,2
)
SELECT country,
MAX(streak) AS no_of_wins
FROM streaks
GROUP BY 1;