我在PostgreSQL的TEXT类型列中有一些数据,需要对其进行字符替换。具体来说,我想把方括号替换为花括号。难点在于:我只想替换嵌套深度不超过两层的括号(最外层的那对括号算作第一层)。这些字符串可能很长,所以我认为正则表达式(regexp_replace 函数)可能是可行的办法,但我不擅长正则表达式。以下是一个这样的值的示例:
[0,0,0,[12,2],0,0,[12,[1,2,3]],12,0,[12,2,[2]],12,0,12,0,0]
所以我希望这个字符串改为:
{0,0,0,{12,2},0,0,{12,[1,2,3]},12,0,{12,2,[2]},12,0,12,0,0}
提前致谢!
答案 0 :(得分:3)
用正则表达式来做这件事会比较痛苦,因为PostgreSQL的正则表达式风格可能不支持递归。
对于最多2级的嵌套深度,请检查以下双重替换是否有效(我无法测试):
regexp_replace(
regexp_replace('str', E'\\[(([^][]|\\[([^][]|\\[[^][]*\\])*\\])*)\\]', E'{\\1}', 'g')
, E'\\[(([^][]|\\[([^][]|\\[[^][]*\\])*\\])*)\\]', E'{\\1}', 'g')
这个想法是在两遍中匹配并替换最外面的[]
。
请参阅regex101上的示例:
pass 1:{0,0,0,[12,2],0,0,[12,[1,2,3]],12,0,[12,2,[2]],12,0,12,0,0}
pass 2:{0,0,0,{12,2},0,0,{12,[1,2,3]},12,0,{12,2,[2]},12,0,12,0,0}
\[[^][]*\]
(未转义形式)匹配一个 [...] 实例:
\[
一个左方括号,
[^][]*
后跟任意数量的非方括号字符,
\]
后跟一个右方括号。
注意:如果字符串始终以 [ 开头、以 ] 结尾,并且只表示一个0级实例(没有被 ][ 分隔),那么第一次(内层)的 regexp_replace 也可以通过替换开头 ^ 处的 [ 和结尾 $ 处的 ] 来完成,即把
E'^\\[(.*)\\]$'
替换为
E'{\\1}'
在此处添加嵌套最多4个深度级别的示例:
\[([^][]| # outer
\[([^][]| # lvl 1
\[([^][]| # lvl 2
\[([^][]| # lvl 3
\[[^][]*\] # lvl 4
)*\]
)*\]
)*\]
)*\]
将外层 [] 内的内容包装成捕获组(capture group)之后,4个层级的模式将变为:
\[(([^][]|\[([^][]|\[([^][]|\[([^][]|\[[^][]*\])*\])*\])*\])*)\]
与 regexp_replace 一起使用时,可能需要对 [] 进行额外转义:
\\[(([^][]|\\[([^][]|\\[([^][]|\\[([^][]|\\[[^][]*\\])*\\])*\\])*\\])*)\\]
这个模式可以像第一个模式一样分两遍使用,并替换为 E'{\\1}'。
答案 1 :(得分:3)
这很难看,但它有效(并且避免了正则表达式的复杂性;-)我希望我能够涵盖所有的角落案例......
-- replbracket(_source text) -> text
--
-- Rewrites square brackets as curly braces for the two outermost nesting
-- levels only; brackets nested three or more levels deep are left as-is.
--   '[0,[1,[2]]]'  ->  '{0,{1,[2]}}'
--
-- The function scans _source left to right: on each iteration it locates the
-- nearest '[' and the nearest ']' in the not-yet-consumed tail, copies the
-- text before whichever comes first into the result, emits the replacement
-- for that bracket, and drops the consumed prefix from _source.
--
-- Declared STRICT: with a NULL argument every position() call and every
-- comparison in the loop evaluates to NULL, so the EXIT condition is never
-- true and the loop would never terminate. STRICT makes the function return
-- NULL immediately instead of being executed.
CREATE OR REPLACE FUNCTION replbracket( _source text ) returns text
AS $func$
DECLARE
    pos_end   INTEGER;  -- position of the next ']' in the remaining input (0 = none)
    pos_begin INTEGER;  -- position of the next '[' in the remaining input (0 = none)
    level     INTEGER;  -- current bracket nesting depth
    result    text;     -- output accumulated so far
BEGIN
    result = '';
    level = 0;
    LOOP
        pos_begin = position('[' IN _source);
        pos_end   = position(']' IN _source);

        -- No brackets left: done. If only one kind is left, push the missing
        -- one past the other so the comparison below picks the existing one.
        IF (pos_begin < 1 AND pos_end < 1) THEN
            EXIT;
        ELSIF (pos_begin < 1) THEN
            pos_begin = pos_end + 1;
        ELSIF (pos_end < 1) THEN
            pos_end = pos_begin + 1;
        END IF;

        IF (pos_begin < pos_end) THEN
            -- Next bracket is an opener: depth is incremented BEFORE the test,
            -- so levels 1 and 2 become '{'.
            result = result || LEFT(_source, pos_begin - 1);
            level = level + 1;
            IF (level <= 2) THEN
                result = result || '{';
            ELSE
                result = result || '[';
            END IF;
            _source = SUBSTR(_source, pos_begin + 1);
        ELSE
            -- Next bracket is a closer: depth is decremented BEFORE the test,
            -- hence the stricter "< 2" to match the opener's "<= 2".
            result = result || LEFT(_source, pos_end - 1);
            level = level - 1;
            IF (level < 2) THEN
                result = result || '}';
            ELSE
                result = result || ']';
            END IF;
            _source = SUBSTR(_source, pos_end + 1);
        END IF;
    END LOOP;

    -- Append whatever follows the last bracket.
    result = result || _source;
    RETURN result;
END
$func$ LANGUAGE plpgsql STRICT;
答案 2 :(得分:3)
纯属好玩,这里给出一个完全用SQL实现的解决方案。它使用CTE只是为了书写清晰,你也可以改用FROM中的子查询;这里没有用到递归CTE。
编辑:添加了一个简化的、更快的SQL版本,一个PL/Python版本,以及一个C版本。C版本最快——大约快250倍。
-- repl(text) -> text   (pure SQL, CTE-based variant)
--
-- Rewrites '[' / ']' as '{' / '}' for the two outermost nesting levels and
-- leaves deeper brackets untouched, e.g. '[0,[1,[2]]]' -> '{0,{1,[2]}}'.
CREATE OR REPLACE FUNCTION repl(text)
RETURNS text
LANGUAGE sql
AS $$
    WITH
    exploded (pos, ch) AS (
        -- Split the input into one row per character and number the rows.
        -- NOTE(review): row_number() OVER () has no ORDER BY, so the numbering
        -- follows whatever order the set-returning function emits rows in;
        -- on PostgreSQL 9.4+ this can be replaced with UNNEST ... WITH ORDINALITY.
        SELECT
            row_number() OVER (),
            piece
        FROM regexp_split_to_table($1, '') AS piece
    ),
    leveled (ch, pos, depth) AS (
        -- Running bracket depth at each character: +1 for every '[' and
        -- -1 for every ']' seen up to and including this position.
        SELECT
            ch,
            pos,
            sum(CASE
                    WHEN ch = '[' THEN 1
                    WHEN ch = ']' THEN -1
                    ELSE 0
                END) OVER running
        FROM exploded
        WINDOW running AS (ORDER BY pos)
    ),
    rewritten (ch, pos) AS (
        -- Swap the bracket for a brace when it sits in the two outermost
        -- levels. The threshold for ']' is one lower than for '[' because the
        -- running depth already includes the closing bracket's own -1.
        SELECT
            CASE
                WHEN depth <= 2 AND ch = '[' THEN '{'
                WHEN depth <= 1 AND ch = ']' THEN '}'
                ELSE ch
            END,
            pos
        FROM leveled
    )
    -- Glue the characters back together in their original order.
    SELECT string_agg(ch, '' ORDER BY pos)
    FROM rewritten;
$$;
然而,它比其他解决方案慢。replbracket 进行10,000次迭代需要950毫秒。去掉CTE并改用 unnest ... with ordinality 之后,这个SQL方案可以加速到大约1400毫秒:
-- repl(text) -> text   (pure SQL, subquery + WITH ORDINALITY variant)
--
-- Rewrites '[' / ']' as '{' / '}' for the two outermost nesting levels and
-- leaves deeper brackets untouched, e.g. '[0,[1,[2]]]' -> '{0,{1,[2]}}'.
-- NOTE(review): declared VOLATILE although the body only calls string and
-- aggregate functions on its argument; confirm whether that is intentional.
CREATE OR REPLACE FUNCTION repl(text)
RETURNS text
LANGUAGE sql
VOLATILE
AS $$
    SELECT string_agg(out_ch, '' ORDER BY pos)
    FROM (
        -- Swap brackets for braces in the two outermost levels. The ']'
        -- threshold is one lower because the running depth already includes
        -- the closing bracket's own -1.
        SELECT
            CASE
                WHEN ch = '[' AND depth <= 2 THEN '{'
                WHEN ch = ']' AND depth <= 1 THEN '}'
                ELSE ch
            END AS out_ch,
            pos
        FROM (
            -- One row per input character with its 1-based position and the
            -- running bracket depth up to and including that character.
            SELECT
                ch,
                pos,
                sum(CASE
                        WHEN ch = '[' THEN 1
                        WHEN ch = ']' THEN -1
                        ELSE 0
                    END) OVER (ORDER BY pos) AS depth
            FROM unnest(regexp_split_to_array($1, '')) WITH ORDINALITY AS chars (ch, pos)
        ) AS leveled
    ) AS rewritten
$$;
如果你追求速度,请使用合适的过程语言——或者C。下面是PL/Python2的版本:
-- replpy(instr text) -> text   (PL/Python2 variant)
--
-- Rewrites '[' / ']' as '{' / '}' for the two outermost nesting levels and
-- leaves deeper brackets untouched, e.g. '[0,[1,[2]]]' -> '{0,{1,[2]}}'.
--
-- Declared STRICT so a NULL argument yields NULL; without it the body would
-- receive None and fail when trying to iterate over it.
create or replace function replpy(instr text) returns text language plpythonu strict as $$
def pyrepl(instr):
    # Generator: yields the input one character at a time, with brackets in
    # the two outermost nesting levels swapped for braces.
    level = 0
    for ch in instr:
        if ch == '[':
            # Depth is raised before the test, so levels 1 and 2 become '{'.
            level += 1
            if level <= 2:
                yield '{'
            else:
                yield '['
        elif ch == ']':
            # Depth is lowered after the test, so the closer is judged at the
            # same level as its matching opener.
            if level <= 2:
                yield '}'
            else:
                yield ']'
            level -= 1
        else:
            yield ch

return ''.join(pyrepl(instr))
$$;
需要160毫秒。
好吧,虽然已经有点多此一举了,我们还是用C来实现一遍。完整的源代码(作为扩展)见原文中的链接,下面是其中的.c文件:
#include "postgres.h"
#include "fmgr.h"
#include "utils/builtins.h"
PG_MODULE_MAGIC;
PG_FUNCTION_INFO_V1(replc);
Datum replc(PG_FUNCTION_ARGS);

/*
 * replc(text) -> text
 *
 * Rewrites '[' / ']' as '{' / '}' for the two outermost nesting levels and
 * leaves deeper brackets untouched, e.g. "[0,[1,[2]]]" -> "{0,{1,[2]}}".
 *
 * The replacement is done in place on a C-string copy of the argument, which
 * is then converted back to a text datum. If the brackets do not balance, a
 * WARNING reporting the final nesting count is raised and the (partially
 * rewritten) string is still returned.
 */
PGDLLEXPORT Datum
replc(PG_FUNCTION_ARGS)
{
    /* Working copy of the input as a NUL-terminated C string. */
    char *const copy = text_to_cstring(PG_GETARG_TEXT_PP(0));
    int         nesting = 0;
    char       *cursor;

    for (cursor = copy; *cursor != '\0'; cursor++)
    {
        if (*cursor == '[')
        {
            /* Count the opener first, so levels 1 and 2 are rewritten. */
            nesting++;
            if (nesting <= 2)
                *cursor = '{';
        }
        else if (*cursor == ']')
        {
            /* Test before decrementing, so the closer is judged at the
             * same level as its matching opener. */
            if (nesting <= 2)
                *cursor = '}';
            nesting--;
        }
    }

    /* Non-zero means the brackets did not pair up (negative when there were
     * more closers than openers). */
    if (nesting != 0)
        ereport(WARNING,
                (errmsg("Opening and closing []s did not match, got %d extra [s", nesting)));

    PG_RETURN_DATUM(CStringGetTextDatum(copy));
}
运行时间:10,000次迭代耗时8毫秒。足够好了:它比最初的版本快250倍,而且这还是在包含了强制使用子查询的开销的情况下。