我在postgres表中有以下表格中的数据:
Col1 Col2 Col3 Col4
id1 a b c
id2 id1 timeBegin 1###-##-##
id2 id1 timeEnd 22##-##-##
id3 id4 id5 id6
id6 id3 timeBegin 2##-##-##
id7 id3 timeEnd 200-3-##
id13 id8 id14 id15
id8 id9 timeBegin -2-1-1
id10 id11 id12 id13
此处1 ### - ## - ##表示时间不确定(1000-01-01至1999-12-31)
和22 ## - ## - ##意味着时间的不确定性(2200-01-01至2200-12-31)
和2 ## - ## - ##暗示从(200-01-01到200-12-31)的时间不确定
和200-3 - ##意味着从(200-3-01到200-3-31)的时间不确定
和20-3 - ##意味着从(20-3-01到20-3-31)的时间不确定性
和200-3 - ##意味着从(200-3-01到200-3-31)的时间不确定性
和-200-3 - ##意味着从(-200-3-31到-200-3-01)的时间不确定性
现在我想将col1 == col2的3行合并为以下形式之一:
Col1 Col2 Col3 Col4 timeBegin timeEnd
id1 a b c 1000-01-01 2200-12-31
id3 id4 id5 id6 200-01-02 200-3-31
id10 id11 id12 id13 NULL NULL
id13 id8 id14 id15 2-1-1 BC 9999-12-12
如果未给出col1 == col2的timeEnd,则将9999-12-12假定为timeEnd
如果没有给出col1 == col2的timeBegin,则01-01-01被假定为timeBegin
即。我想在合并时采用最少的timeBegin和最大的timeEnd。
是否可以在postgres中执行此连接操作。即我可以将其写为SQL连接查询吗?
如果我能够使用像python这样的编程语言(以有效的方式)实现所需的目标:那么这将是很棒的。
答案 0 :(得分:1)
我不相信这是最好的方法,但是这里有一对Postgres函数可以很好地将模式转换为最小和最大日期:
<强> Example Fiddle 强>
Create Function preprocessPattern(pat varchar(11), out cpat varchar(10), out neg boolean) as $$
declare
y varchar(4);
m varchar(2);
d varchar(2);
i int;
begin
neg = false;
if left(pat, 1) = '-' then
neg = true;
pat = right(pat, -1);
end if;
i = position('-' in pat);
y = right('000' || left(pat, i - 1), 4);
pat = right(pat, -i);
i = position('-' in pat);
m = right('0' || left(pat, i - 1), 2);
pat = right(pat, -i);
d = right('0' || pat, 2);
cpat = y || '-' || m || '-' || d;
end;
$$ Language plpgsql;
Create Function dateFromFmt(fmt varchar(10), neg boolean) returns date as $$
begin
if neg then
return to_date(fmt || ' BC', 'yyyy-mm-dd BC');
else
return to_date(fmt, 'yyyy-mm-dd');
end if;
end;
$$ Language plpgsql;
Create Function minDateFromPattern(pat varchar(11)) returns date as $$
declare
i int;
neg boolean;
n varchar(10);
begin
select * into pat, neg from preprocessPattern(pat);
i = position('#' in pat);
if i = 0 then
return dateFromFmt(pat, neg);
else
n = left(pat, i - 1) || right('0000-00-00', 0 - position('#' in pat) + 1);
n = replace(n, '-00', '-01');
return dateFromFmt(n, neg);
end if;
end;
$$ Language plpgsql;
Create Function maxDateFromPattern(pat varchar(11)) returns date as $$
declare
i int;
y int;
m int;
d int;
x varchar(10);
neg boolean;
res date;
begin
select * into pat, neg from preprocessPattern(pat);
i = position('#' in pat);
if i = 0 then
return dateFromFmt(pat, neg);
elsif i = 1 then
return date '9999-12-31';
-- from here down, pick the next highest mask, convert to min date then subtract one day
elsif i <= 6 then -- just add 1 to year
if i = 6 then i = 5; end if; -- skip - char
x = cast(cast(left(pat, i - 1) as int) + 1 as varchar) || right(pat, 0 - i + 1);
else
y = cast(left(pat, 4) as int);
if i = 7 then
m = cast(substr(pat, 6, 1) as int) + 1;
if m = 2 then
m = 0;
y = y + 1;
end if;
x = left(to_char(y, 'FM0000'), 4) || '-' || to_char(m, 'FM0') || '#-##';
elsif i = 9 then
m = cast(substr(pat, 6, 2) as int) + 1;
if m > 12 then
m = 1;
y = y + 1;
end if;
x = left(to_char(y, 'FM0000'), 4) || '-' || to_char(m, 'FM00') || '-##';
elseif i = 10 then
m = cast(substr(pat, 6, 2) as int);
d = cast(substr(pat, 9, 1) as int) + 1;
if (m = 2 and d = 3) or d = 4 then
m = m + 1;
d = 0;
if m > 12 then
m = 1;
y = y + 1;
end if;
end if;
x = left(to_char(y, 'FM0000'), 4) || '-' || to_char(m, 'FM00') || '-' || to_char(d, 'FM0') || '#';
end if;
end if;
-- the original logic looks a little silly now as we're preprocessing twice
res = minDateFromPattern(x) - interval '1 day';
if neg then
return dateFromFmt(to_char(res, 'yyyy-mm-dd'), neg);
else
return res;
end if;
end;
$$ Language plpgsql;
这已经经历了足够的迭代,它可以使用一些重构
答案 1 :(得分:0)
问题中实际上有两个部分。一个对应于正确对齐表格的数据。另一个是处理日期格式的无数混乱。
现在假设有两个sql函数begin_time()和end_time()。我在下面讨论它们。
要对齐数据,请将表连接两次:
select t.col1, t.col2, t.col3, t.col4,
parse_begin_time(bt.col4) as timeBegin,
parse_end_time(et.col4) as timeEnd
from yourtable t
left join yourtable as bt on begin_t.col2 = t.col1 and bt.col3 = 'timeBegin'
left join yourtable as et on end_t.col2 = t.col1 and et.col3 = 'timeEnd'
where t.col3 not in ('timeBegin', 'timeEnd');
如果由于存在多个条目而产生多个条目,请使用聚合:
select t.col1, t.col2, t.col3, t.col4,
min(parse_begin_time(bt.col4)) as timeBegin,
max(parse_end_time(et.col4)) as timeEnd
from yourtable t
left join yourtable as bt on begin_t.col2 = t.col1 and bt.col3 = 'timeBegin'
left join yourtable as et on end_t.col2 = t.col1 and et.col3 = 'timeEnd'
where t.col3 not in ('timeBegin', 'timeEnd')
group by t.col1, t.col2, t.col3, t.col4;
注意:如果你有大量数据,预计上述情况不会特别好。在create table中将它们作为...语句运行一次并删除原始模式,或创建物化视图以供将来使用。
然后你需要担心格式化凌乱的timeBegin和timeEnd字段,我假设它们存储在文本字段中。它会是这样的:
create or replace function parse_begin_time(text) returns date as $$
declare
_input text := $1;
_output text;
_bc boolean := false;
_y text;
_m text;
_d text;
_tmp text;
_i int;
begin
_input := trim(both from _input);
-- PG is fine with '200-01-01 BC' as a date, but not with '-200-01-01'
if left(_input, 1) = '-'
then
_bc := true;
_input := right(_input, -1);
end if;
-- Extract year, month and day
_tmp := _input;
_i := position(_tmp for '-');
_y := substring(_tmp from 1 for i - 1);
_tmp := substring(_tmp from i);
_i := position(_tmp for '-');
_m := substring(_tmp from 1 for i - 1);
_tmp := substring(_tmp from i);
_i := position(_tmp for '-');
_d := substring(_tmp from 1 for i - 1);
if _tmp <> '' or left(trim(left '0' from _y), 1) = 'X'
then
raise exception 'invalid date input: %', _input;
end if;
-- Prevent locale-specific text to date conversion issues with one or two digit years
-- e.g. rewrite 1-2-3 as 0001-02-03.
if length(_y) < 4
then
_y := lpad(_y, 4, '0');
end if;
if length(_m) < 2
then
_m := lpad(_m, 2, '0');
end if;
if length(_d) < 2
then
_d := lpad(_m, 2, '0');
end if;
-- Process year, month, day
-- Add suitable logic here per your specs, using string and date functions
-- http://www.postgresql.org/docs/current/static/functions-string.html
-- http://www.postgresql.org/docs/current/static/functions-formatting.html
-- http://www.postgresql.org/docs/current/static/functions-datetime.html
-- for end-of-months, use the built-in arithmetics, e.g.:
-- _date := _date + interval '1 month' - interval '1 day'
-- Build _output
_output := _y || '-' || _m || '-' || _d;
if _bc
_output := _output || ' BC';
end if;
return _output::date;
end;
$$ language plpgsql strict stable;
如果你对它更熟悉,那么语言也可以是plpython或plpythonu。我认为你比我更了解这两个,当然还有足够的python来编写所需的代码。劳伦斯的代码是另一个很好的起点,如果你宁愿在plpgsql中使用。
strict
语句告诉Postgres不要在null输入上调用函数并立即返回null。你可能不希望它用于end_time函数。
答案 2 :(得分:0)
下面的方法使用带有SELECT
语句的单个SQL CASE
,通过后处理子查询来应用规则。可能会进一步调整,但它提供了一般的想法。对复杂性表示道歉 - 这只是基于帖子中的规则而开始时相当简单,但已被修改以处理一般情况,结果证明非常繁琐!
已知限制:此方法目前不能很好地处理闰年,并且假设2月的最后一天始终是第28天,则安全方面的错误。这可能是固定的,但是闰年计算并不是完全无关紧要的,所以我已经将这一点遗漏掉,以免过于复杂。
SELECT Col1, Col2, Col3, Col4,
CASE WHEN timeBegin IS NULL AND timeEnd IS NOT NULL
THEN '01-01-01'
WHEN timeBegin LIKE '-%' -- Handle negative (= BC) dates separately
THEN CASE WHEN SUBSTRING(timeBegin, 2) LIKE '%-[0-1]#'
THEN SUBSTRING(timeBegin, 2, CHAR_LENGTH(timeBegin) - 2) || '9 BC'
WHEN SUBSTRING(timeBegin, 2) LIKE '%-1-##'
THEN SUBSTRING(timeBegin, 2, CHAR_LENGTH(timeBegin) - 4) || '-31 BC'
WHEN SUBSTRING(timeBegin, 2) LIKE '%-2-##'
THEN SUBSTRING(timeBegin, 2, CHAR_LENGTH(timeBegin) - 4) || '-28 BC' -- No leap year calculation
WHEN SUBSTRING(timeBegin, 2) LIKE '%-3-##'
THEN SUBSTRING(timeBegin, 2, CHAR_LENGTH(timeBegin) - 4) || '-31 BC'
WHEN SUBSTRING(timeBegin, 2) LIKE '%-4-##'
THEN SUBSTRING(timeBegin, 2, CHAR_LENGTH(timeBegin) - 4) || '-30 BC'
WHEN SUBSTRING(timeBegin, 2) LIKE '%-5-##'
THEN SUBSTRING(timeBegin, 2, CHAR_LENGTH(timeBegin) - 4) || '-31 BC'
WHEN SUBSTRING(timeBegin, 2) LIKE '%-6-##'
THEN SUBSTRING(timeBegin, 2, CHAR_LENGTH(timeBegin) - 4) || '-30 BC'
WHEN SUBSTRING(timeBegin, 2) LIKE '%-7-##'
THEN SUBSTRING(timeBegin, 2, CHAR_LENGTH(timeBegin) - 4) || '-31 BC'
WHEN SUBSTRING(timeBegin, 2) LIKE '%-8-##'
THEN SUBSTRING(timeBegin, 2, CHAR_LENGTH(timeBegin) - 4) || '-31 BC'
WHEN SUBSTRING(timeBegin, 2) LIKE '%-9-##'
THEN SUBSTRING(timeBegin, 2, CHAR_LENGTH(timeBegin) - 4) || '-30 BC'
WHEN SUBSTRING(timeBegin, 2) LIKE '%-10-##'
THEN SUBSTRING(timeBegin, 2, CHAR_LENGTH(timeBegin) - 4) || '-31 BC'
WHEN SUBSTRING(timeBegin, 2) LIKE '%-11-##'
THEN SUBSTRING(timeBegin, 2, CHAR_LENGTH(timeBegin) - 4) || '-30 BC'
WHEN SUBSTRING(timeBegin, 2) LIKE '%-0#-##'
THEN REPLACE(SUBSTRING(timeBegin, 2, CHAR_LENGTH(timeBegin) - 7), '#', '9') || '-09-30 BC'
WHEN SUBSTRING(timeBegin, 2) LIKE '%-12-##' OR SUBSTRING(timeBegin, 2) LIKE '%-_#-##'
THEN REPLACE(SUBSTRING(timeBegin, 2, CHAR_LENGTH(timeBegin) - 7), '#', '9') || '-12-31 BC'
ELSE REPLACE(SUBSTRING(timeBegin, 2), '#', '9')
END
ELSE REPLACE(REPLACE(REPLACE(timeBegin, '-0#', '-01'), '-##', '-01'), '#', '0')
END AS timeBegin,
CASE WHEN timeEnd IS NULL AND timeBegin IS NOT NULL
THEN '9999-12-12'
WHEN timeEnd LIKE '-%' -- Handle negative (= BC) dates separately
THEN REPLACE(REPLACE(REPLACE(SUBSTRING(timeEnd, 2), '-0#', '-01'), '-##', '-01'), '#', '0') || ' BC'
WHEN timeEnd LIKE '%-[0-1]#'
THEN SUBSTRING(timeEnd, 1, CHAR_LENGTH(timeEnd) - 1) || '9'
WHEN timeEnd LIKE '%-1-3#' OR timeEnd LIKE '%-1-##'
THEN SUBSTRING(timeEnd, 1, CHAR_LENGTH(timeEnd) - 2) || '31'
WHEN timeEnd LIKE '%-2-2#' OR timeEnd LIKE '%-2-##'
THEN SUBSTRING(timeEnd, 1, CHAR_LENGTH(timeEnd) - 2)|| '28' -- No leap year calculation
WHEN timeEnd LIKE '%-3-3#' OR timeEnd LIKE '%-3-##'
THEN SUBSTRING(timeEnd, 1, CHAR_LENGTH(timeEnd) - 2)|| '31'
WHEN timeEnd LIKE '%-4-3#' OR timeEnd LIKE '%-4-##'
THEN SUBSTRING(timeEnd, 1, CHAR_LENGTH(timeEnd) - 2) || '30'
WHEN timeEnd LIKE '%-5-3#' OR timeEnd LIKE '%-5-##'
THEN SUBSTRING(timeEnd, 1, CHAR_LENGTH(timeEnd) - 2) || '31'
WHEN timeEnd LIKE '%-6-3#' OR timeEnd LIKE '%-6-##'
THEN SUBSTRING(timeEnd, 1, CHAR_LENGTH(timeEnd) - 2) || '30'
WHEN timeEnd LIKE '%-7-3#' OR timeEnd LIKE '%-7-##'
THEN SUBSTRING(timeEnd, 1, CHAR_LENGTH(timeEnd) - 2) || '31'
WHEN timeEnd LIKE '%-8-3#' OR timeEnd LIKE '%-8-##'
THEN SUBSTRING(timeEnd, 1, CHAR_LENGTH(timeEnd) - 2) || '31'
WHEN timeEnd LIKE '%-9-3#' OR timeEnd LIKE '%-9-##'
THEN SUBSTRING(timeEnd, 1, CHAR_LENGTH(timeEnd) - 2) || '30'
WHEN timeEnd LIKE '%-10-3#' OR timeEnd LIKE '%-10-##'
THEN SUBSTRING(timeEnd, 1, CHAR_LENGTH(timeEnd) - 2) || '31'
WHEN timeEnd LIKE '%-11-3#' OR timeEnd LIKE '%-11-##'
THEN SUBSTRING(timeEnd, 1, CHAR_LENGTH(timeEnd) - 2) || '30'
WHEN timeEnd LIKE '%-0#-##'
THEN SUBSTRING(timeEnd, 1, CHAR_LENGTH(timeEnd) - 5) || '09-30'
WHEN timeEnd LIKE '%-1#-##' OR timeEnd LIKE '%-##-##'
THEN SUBSTRING(REPLACE(timeEnd, '#', '9'), 1, CHAR_LENGTH(timeEnd) - 5) || '12-31'
ELSE REPLACE(timeEnd, '#', '9')
END AS timeEnd
FROM
(SELECT t1.*,
(SELECT Col4 FROM Tbl t2 WHERE (t1.Col1 = t2.Col2 OR t1.Col2 = t2.Col1)
AND t2.Col3 = 'timeBegin') AS timeBegin,
(SELECT Col4 FROM Tbl t2 WHERE (t1.Col1 = t2.Col2 OR t1.Col2 = t2.Col1)
AND t2.Col3 = 'timeEnd') AS timeEnd
FROM Tbl t1
WHERE t1.Col3 NOT IN ('timeBegin', 'timeEnd')) subquery
ORDER BY CAST(SUBSTRING(Col1, 3) AS INT)
这是一个SQL Fiddle Demo,它显示了它与发布的示例产生类似的结果。