我正在尝试提取 .txt 文件中嵌套括号内的所有字符串(包括括号本身)。所用的示例 .txt 文件请参阅原帖中的链接。
我已经尝试并写出了三段不同的代码,但它们似乎都无法提取所有嵌套的括号,只能提取其中的一部分。如果有人能指出我哪里做错了,将对我有很大帮助!
这是我到目前为止完成的三个代码:
import re
from os.path import join
def balanced_braces(args):
    """Collect the contents of every outermost parenthesised group.

    Each item of *args* (for example each line of an open file) is scanned
    on its own, so a group that starts in one item and ends in another is
    not captured.  The outermost pair of parentheses is dropped from each
    result, nested parentheses are kept, and surrounding whitespace is
    stripped.

    Args:
        args: An iterable of strings.

    Returns:
        A list with one string per completed outermost group, in the order
        the groups were closed.
    """
    parts = []
    for arg in args:
        if '(' not in arg:
            continue
        chars = []
        depth = 0
        for c in arg:
            if c == '(':
                # Only nested parentheses belong to the captured text.
                if depth > 0:
                    chars.append(c)
                depth += 1
            elif c == ')':
                if depth == 0:
                    # Unmatched ')': ignore it so the counter never goes
                    # negative and hides the groups that follow.
                    continue
                depth -= 1
                if depth > 0:
                    chars.append(c)
                else:
                    parts.append(''.join(chars).strip())
                    chars = []
            elif depth > 0:
                chars.append(c)
    return parts
with open('lan sample text file.txt','r') as fd:
    # The open file is passed directly: iterating it yields one line at a
    # time, so balanced_braces() receives the file line by line.
    t1 = balanced_braces(fd)
print(t1)
输出:
['"xE\'", PUT(xx.xxxx.),"\'"', '"TRUuuuth"', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.", '"xE\'", PUT(xx.xxxx.),"\'"', '"CUuuiiiiuth"', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv."]
from pyparsing import nestedExpr
# Parser for arbitrarily nested '(' ... ')' groups.
matchedParens = nestedExpr('(',')')
with open('lan sample text file.txt','r') as fd:
    # Every line is searched separately, so a group whose parentheses span
    # several lines is not reported as a single match.
    for words in fd:
        for e in matchedParens.searchString(words):
            print(e)
输出:
[['"xE\'"', ',', 'PUT', ['xx.xxxx.'], ',', '"\'"']]
[['"TRUuuuth"']]
[['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'gff', '&jfjfsj_jfjfj.']]
[['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'lec', '&jgjsd_vnv.']]
[['"xE\'"', ',', 'PUT', ['xx.xxxx.'], ',', '"\'"']]
[['"CUuuiiiiuth"']]
[['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'gff', '&jfjfsj_jfjfj.']]
[['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'lec', '&jgjsd_vnv.']]
def parse_segments(source, recurse=False):
    """Split *source* into text outside and inside outermost parentheses.

    The result lists, in this order:

    1. every non-empty stretch of text that precedes an outermost group,
    2. whatever is left after the last completed group (this includes the
       text of a group that was opened but never closed),
    3. the contents of each outermost group without its bounding
       parentheses.

    Args:
        source: The string to scan.
        recurse: When true, the contents of each group are split again by
            the same rules and the pieces are added instead of the raw
            contents.

    Returns:
        A list of strings.
    """
    depth = 0       # number of '(' that are currently unclosed
    start_pos = 0   # start of the text not yet copied to ``finished``
    open_pos = 0    # index of the '(' that opened the current outer group
    finished = []
    segments = []
    for cur_pos, character in enumerate(source):
        if character == '(':
            if depth == 0:
                open_pos = cur_pos
            depth += 1
        elif character == ')' and depth:
            # An unmatched ')' is skipped by the ``depth`` guard above so it
            # cannot push the counter negative and corrupt later groups.
            depth -= 1
            if depth == 0:
                segments.append(source[open_pos:cur_pos + 1])
                clean = source[start_pos:open_pos]
                if clean:
                    finished.append(clean)
                start_pos = cur_pos + 1
    if start_pos != len(source):
        # Keep anything that follows the last completed group.
        finished.append(source[start_pos:])
    for item in segments:
        # Drop the bounding parentheses.
        pruned = item[1:-1]
        if recurse:
            finished.extend(parse_segments(pruned, recurse))
        else:
            finished.append(pruned)
    return finished
with open('lan sample text file.txt','r') as fd:
    # Lines are processed one at a time, so parentheses that span several
    # lines are never paired with each other.
    for words in fd:
        t = parse_segments(words)
        print(t)
输出:
['kkkkk;\n']
['\n']
[' select xx', ' jdfjhf:jhfjj from xxxx_x_xx_L ;\n', '"xE\'", PUT(xx.xxxx.),"\'"']
['quit; \n']
['\n']
['/* 1.xxxxx FROM xxxx_x_Ex_x */ \n']
['proc sql; ', ';\n', '"TRUuuuth"']
['hhhjhfjs as fdsjfsj:\n']
['select * from djfkjd to jfkjs\n']
['(\n']
['SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj\n']
['\tFROM &xxx..xxx_xxx_xxE\n']
["where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and \n"]
[' ', ')\n', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv."]
[' );\n']
['\n']
['\n']
['jjjjjj;\n']
['\n']
[' select xx', ' jdfjhf:jhfjj from xxxx_x_xx_L ;\n', '"xE\'", PUT(xx.xxxx.),"\'"']
['quit; \n']
['\n']
['/* 1.xxxxx FROM xxxx_x_Ex_x */ \n']
['proc sql; ', ';\n', '"CUuuiiiiuth"']
['hhhjhfjs as fdsjfsj:\n']
['select * from djfkjd to jfkjs\n']
['(SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj\n']
['\tFROM &xxx..xxx_xxx_xxE\n']
["where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and \n"]
[' ', ')\n', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv."]
[' );']
我无法获得的预期输出应如下所示:
("xE'", PUT(xx.xxxx.),"'")
("TRUuuuth")
(
SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj
FROM &xxx..xxx_xxx_xxE
where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and
(xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))
)
("xE'", PUT(xx.xxxx.),"'")
("CUuuiiiiuth")
(SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj
FROM &xxx..xxx_xxx_xxE
where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and
(xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))(( ))
)
实施DarrylG的代码:
def parse(text):
    """Return only the parenthesised parts of *text*, parentheses included.

    Characters inside any open parenthesis are copied unchanged, including
    nested parentheses and newlines.  Outside parentheses everything is
    dropped except a newline that follows a kept non-newline character, so
    consecutive groups end up on separate lines.  A ')' with no matching
    '(' is dropped.

    Args:
        text: The string to filter.

    Returns:
        The filtered text as a single string.
    """
    result = []
    parens_open = 0
    for char in text:
        if char == '(':
            parens_open += 1
            result.append(char)
        elif char == ')' and parens_open:
            parens_open -= 1
            result.append(char)
        elif char == '\n' and result and result[-1] != '\n':
            # Separator newline; never the first character and never doubled
            # outside parentheses.
            result.append(char)
        elif parens_open:
            result.append(char)
    return ''.join(result)
# Keyword spellings that mark a parenthesised chunk as worth printing.
checkhere = {"Select", "From", "select", "from", "SELECT", "FROM"}


def _top_level_groups(text):
    """Split *text* into its outermost '(' ... ')' groups, parentheses kept."""
    groups = []
    depth = 0
    start = 0
    for pos, char in enumerate(text):
        if char == '(':
            if depth == 0:
                start = pos
            depth += 1
        elif char == ')' and depth:
            depth -= 1
            if depth == 0:
                groups.append(text[start:pos + 1])
    return groups


with open('lan sample text file.txt', 'r') as fd:
    txt = fd.read()
result = parse(txt)
# parse() returns a single string; iterating over it directly would yield
# one character at a time and no keyword could ever match.  Split it into
# whole groups first and print each matching group once.
for chunk in _top_level_groups(result):
    if any(x in chunk for x in checkhere):
        print(chunk)
答案 0 :(得分:0)
以下代码的输出与您期望的结果一致。
def parse(text):
    """Return only the parenthesised parts of *text*, parentheses included.

    Characters inside any open parenthesis are copied unchanged, including
    nested parentheses and newlines.  An empty, un-nested pair ``()`` is
    removed entirely.  Outside parentheses everything is dropped except a
    newline that follows a kept non-newline character, so consecutive
    groups end up on separate lines.  A ')' with no matching '(' is
    dropped.

    Args:
        text: The string to filter.

    Returns:
        The filtered text as a single string.
    """
    result = []
    parens_open = 0
    for char in text:
        if char == '(':
            parens_open += 1
            result.append(char)
        elif char == ')' and parens_open:
            if parens_open == 1 and result[-1] == '(':
                # Empty un-nested parentheses, i.e. '()': discard the '('
                # that was already stored instead of adding the ')'.
                result.pop()
            else:
                result.append(char)
            parens_open -= 1
        elif char == '\n' and result and result[-1] != '\n':
            # Keep at most one newline between groups.
            result.append(char)
        elif parens_open:
            result.append(char)
    return ''.join(result)
with open('test.txt', 'r') as fd:
    # Read the whole file at once so groups spanning several lines stay
    # intact.
    txt = fd.read()
result = parse(txt)
print(result)
输出
("xE'", PUT(xx.xxxx.),"'")
("TRUuuuth")
(
SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj
FROM &xxx..xxx_xxx_xxE
where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and
(xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))
)
("xE'", PUT(xx.xxxx.),"'")
("CUuuiiiiuth")
(SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj
FROM &xxx..xxx_xxx_xxE
where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and
(xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))(( ))
)
以下代码针对更新后的问题(检索特定字段)给出解答。
def find_field(field, text):
    """Extract every parenthesised group of *text* that starts with *field*.

    A group qualifies when its opening '(' is followed, after optional
    whitespace, by *field* (compared case-insensitively and taken
    literally, not as a regular expression).  Each group is returned as
    ``'(' + field`` followed by the text up to and including the ')' that
    closes that opening parenthesis; nested parentheses are kept as they
    are.  If the closing ')' is missing, the group runs to the end of
    *text*.

    Args:
        field: The keyword expected right after the opening parenthesis.
        text: The string to search.

    Returns:
        The matching groups joined by newlines, or an empty string when
        nothing matches.
    """
    pattern = re.compile(r'\(\s*' + re.escape(field), flags=re.IGNORECASE)
    groups = []
    for m in pattern.finditer(text):
        begin = m.end()
        end = len(text)
        # The match already consumed the group's own '(', so start at 1.
        depth = 1
        for pos in range(begin, len(text)):
            char = text[pos]
            if char == '(':
                depth += 1
            elif char == ')':
                depth -= 1
                if depth == 0:
                    end = pos + 1   # include the closing parenthesis
                    break
        groups.append('(' + field + text[begin:end])
    return '\n'.join(groups)
# Test by retrieving select field
with open('test.txt', 'r') as fd:
    txt = fd.read()
print(find_field("SELECT", txt))
输出
(SELECT((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and
(xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))
(SELECT
((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and
(xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))