我正在尝试拆分一些文字。基本上我想分开一级括号,比如"('1','a',NULL),(2,'b')"
=> ["('1','a',NULL)", "(2,'b')]"
,但我需要知道里面可能引用的字符串。它至少需要满足以下py.tests:
from splitter import split_text
def test_normal():
assert split_text("('1'),('2')") == ["('1')", "('2')"]
assert split_text("(1),(2),(3)") == ["(1)", "(2)", "(3)"]
def test_complex():
assert split_text("('1','a'),('2','b')") == ["('1','a')", "('2','b')"]
assert split_text("('1','a',NULL),(2,'b')") == ["('1','a',NULL)", "(2,'b')"]
def test_apostrophe():
assert split_text("('\\'1','a'),('2','b')") == ["('\\'1','a')", "('2','b')"]
def test_coma_in_string():
assert split_text("('1','a,c'),('2','b')") == ["('1','a,c')", "('2','b')"]
def test_bracket_in_string():
assert split_text("('1','a)c'),('2','b')") == ["('1','a)c')", "('2','b')"]
def test_bracket_and_coma_in_string():
assert split_text("('1','a),(c'),('2','b')") == ["('1','a),(c')", "('2','b')"]
def test_bracket_and_coma_in_string_apostrophe():
assert split_text("('1','a\\'),(c'),('2','b')") == ["('1','a\\'),(c')", "('2','b')"]
我尝试了以下内容:
1)正则表达式
这看起来是最好的解决方案,但不幸的是我没有想出满足所有测试的任何东西。
我最好的尝试是:
def split_text(text):
return re.split('(?<=\)),(?=\()', text)
但显然,这相当简单,并且test_bracket_and_coma_in_string
和test_bracket_and_coma_in_string_apostrophe
失败。
2)有限状态机的解决方案
我试图自己编写FSM代码:
OUTSIDE, IN_BRACKETS, IN_STRING, AFTER_BACKSLASH = range(4)
def split_text(text):
state = OUTSIDE
read = []
result = []
for character in text:
if state == OUTSIDE:
if character == ',':
result.append(''.join(read))
read = []
elif character == '(':
read.append(character)
state = IN_BRACKETS
else:
read.append(character)
elif state == IN_BRACKETS:
read.append(character)
if character == ')':
state = OUTSIDE
elif character == "'":
state = IN_STRING
elif state == IN_STRING:
read.append(character)
if character == "'":
state = IN_BRACKETS
elif character == '\\':
state = AFTER_BACKSLASH
elif state == AFTER_BACKSLASH:
read.append(character)
state = IN_STRING
result.append(''.join(read)) # The rest of string
return result
它有效,通过所有测试,但非常慢。
3)pyparsing
from pyparsing import QuotedString, ZeroOrMore, Literal, Group, Suppress, Word, nums
null_value = Literal('NULL')
number_value = Word(nums)
string_value = QuotedString("'", escChar='\\', unquoteResults=False)
value = null_value | number_value | string_value
one_bracket = Group(Literal('(') + value + ZeroOrMore(Literal(',') + value) + Literal(')'))
all_brackets = one_bracket + ZeroOrMore(Suppress(',') + one_bracket)
def split_text(text):
parse_result = all_brackets.parseString(text)
return [''.join(a) for a in parse_result]
同样通过所有测试,但令人惊讶的是它比解决方案#2 甚至更慢。
如何使解决方案快速而强大?我有这种感觉,我错过了一些明显的东西。
答案 0 :(得分:3)
一种方法是使用支持List<Stream<Integer>> lists = Arrays.asList(list1.stream(), list2.stream(), list3.stream());
List<Integer> result = StreamUtils.interleave(Selectors.roundRobin(), lists).collect(Collectors.toList());
System.out.println(result);
功能的较新regex
模块:
(*SKIP)(*FAIL)
分解它说:
import regex as re
def split_text(text):
rx = r"""'.*?(?<!\\)'(*SKIP)(*FAIL)|(?<=\)),(?=\()"""
return re.split(rx, text)
答案 1 :(得分:0)
我做了这个,它适用于给定的测试。
tests = ["('1'),('2')",
"(1),(2),(3)",
"('1','a'),('2','b')",
"('1','a',NULL),(2,'b')",
"('\\'1','a'),('2','b')",
"('1','a,c'),('2','b')",
"('1','a)c'),('2','b')",
"('1','a),(c'),('2','b')",
"('1','a\\'),(c'),('2','b')"]
for text in tests:
tmp = ''
res = []
bracket = 0
quote = False
for idx,i in enumerate(text):
if i=="'":
if text[idx-1]!='\\':
quote = not quote
tmp += i
elif quote:
tmp += i
elif i==',':
if bracket: tmp += i
else: pass
else:
if i=='(': bracket += 1
elif i==')': bracket -= 1
if bracket: tmp += i
else:
tmp += i
res.append(tmp)
tmp = ''
print res
输出:
["('1')", "('2')"]
['(1)', '(2)', '(3)']
["('1','a')", "('2','b')"]
["('1','a',NULL)", "(2,'b')"]
["('\\'1','a')", "('2','b')"]
["('1','a,c')", "('2','b')"]
["('1','a)c')", "('2','b')"]
["('1','a),(c')", "('2','b')"]
["('1','a\\'),(c')", "('2','b')"]
代码有改进的余地,欢迎编辑。 :)
答案 2 :(得分:0)
这是正则表达式,似乎可以工作并通过所有测试。在真实数据上运行它比在Python中实现的有限状态机快6倍。
PATTERN = re.compile(
r"""
\( # Opening bracket
(?:
# String
(?:'(?:
(?:\\')|[^'] # Either escaped apostrophe, or other character
)*'
)
|
# or other literal not containing right bracket
[^')]
)
(?:, # Zero or more of them separated with comma following the first one
# String
(?:'(?:
(?:\\')|[^'] # Either escaped apostrophe, or other character
)*'
)
|
# or other literal
[^')]
)*
\) # Closing bracket
""",
re.VERBOSE)
def split_text(text):
return PATTERN.findall(text)