获取嵌套括号中的所有文本(Python)

时间:2019-12-03 00:38:21

标签: python python-3.x nested parentheses

我正在尝试提取.txt文件中嵌套括号内的所有字符串(连同括号本身)。所用的示例.txt文件请参见here。

我已经尝试了三种不同的代码,但它们似乎都无法提取所有嵌套的括号,只能提取嵌套括号的一部分。如果有人能指出我哪里做错了,将对我非常有帮助!

这是我到目前为止完成的三个代码:

  • 第一次尝试:
import re
from os.path import join

def balanced_braces(args):
    """Collect the inner text of top-level parenthesised groups.

    Every string in *args* is scanned on its own with a fresh depth counter,
    so a group is only reported when it opens and closes within the same
    string.  The outermost pair of parentheses is dropped, nested parentheses
    are kept verbatim, and surrounding whitespace is stripped.

    Args:
        args: An iterable of strings (e.g. an open text file, which yields
            one line per iteration).

    Returns:
        A list with the stripped inner text of each completed top-level
        group, in order of appearance.
    """
    parts = []
    for arg in args:
        if '(' not in arg:
            continue
        chars = []
        depth = 0
        for c in arg:
            if c == '(':
                # Only nested openers belong to the captured text.
                if depth:
                    chars.append(c)
                depth += 1
            elif c == ')':
                if not depth:
                    # Stray ')' with nothing open: ignore it so the counter
                    # never goes negative and hides later groups.
                    continue
                depth -= 1
                if depth:
                    chars.append(c)
                else:
                    parts.append(''.join(chars).strip())
                    chars = []
            elif depth:
                chars.append(c)
    return parts

# The open file is passed as-is: iterating it hands balanced_braces one line
# at a time, so the file must stay open while the function runs.
with open('lan sample text file.txt','r') as fd:
    t1 = balanced_braces(fd)

# t1 is a fully built list by now, so it can be printed after the file closes.
print(t1)

输出:

['"xE\'", PUT(xx.xxxx.),"\'"', '"TRUuuuth"', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.", '"xE\'", PUT(xx.xxxx.),"\'"', '"CUuuiiiiuth"', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv."]
  • 第二次尝试:
from pyparsing import nestedExpr

# Grammar for text enclosed in '(' ... ')' built with pyparsing's nestedExpr.
matchedParens = nestedExpr('(',')')

# Each line is searched independently, so only groups that open and close on
# the same line can be matched.
with open('lan sample text file.txt','r') as fd:
    for words in fd:
        for e in matchedParens.searchString(words):
            print(e)

输出:

[['"xE\'"', ',', 'PUT', ['xx.xxxx.'], ',', '"\'"']]
[['"TRUuuuth"']]
[['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'gff', '&jfjfsj_jfjfj.']]
[['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'lec', '&jgjsd_vnv.']]
[['"xE\'"', ',', 'PUT', ['xx.xxxx.'], ',', '"\'"']]
[['"CUuuiiiiuth"']]
[['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'gff', '&jfjfsj_jfjfj.']]
[['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'lec', '&jgjsd_vnv.']]
  • 第三次尝试:
def parse_segments(source, recurse=False):
    """Split *source* into text outside and inside top-level parentheses.

    The returned list first holds the non-empty pieces of text found outside
    any top-level parenthesised group (in order), then the contents of each
    top-level group with its bounding parentheses removed.

    Args:
        source: The string to scan.
        recurse: When true, the content of each group is itself split with
            this function and the resulting pieces are added instead of the
            raw content.

    Returns:
        A list of strings as described above.  Text following an unclosed
        '(' is returned as one trailing "outside" piece, parenthesis included.
    """
    depth = 0          # number of currently open parentheses
    start_pos = 0      # where the current "outside" run of text begins
    opened = False     # True while inside a top-level group
    open_pos = 0       # index of the '(' that opened the current group

    finished = []
    segments = []

    for cur_pos, character in enumerate(source):
        if character == '(':
            depth += 1
            if not opened:
                open_pos = cur_pos
                opened = True
        elif character == ')' and depth:
            # A ')' with nothing open is ordinary text; counting it would
            # push the depth negative and corrupt the next group.
            depth -= 1

        if opened and depth == 0:
            # The top-level group just closed on this character.
            segments.append(source[open_pos:cur_pos + 1])
            clean = source[start_pos:open_pos]
            if clean:
                finished.append(clean)
            opened = False
            start_pos = cur_pos + 1

    if start_pos != len(source):
        # Whatever follows the last complete group (or an unclosed one).
        finished.append(source[start_pos:])

    for item in segments:
        # Drop the bounding parentheses.
        pruned = item[1:-1]
        if recurse:
            finished.extend(parse_segments(pruned, recurse))
        else:
            finished.append(pruned)

    return finished

# Split every line of the sample file on its own and print the pieces.
# Because each line is handled separately, groups spanning several lines
# are never seen as a whole.
with open('lan sample text file.txt','r') as fd:
    for words in fd:
        t = parse_segments(words)
        print(t)

输出:

['kkkkk;\n']
['\n']
['  select xx', ' jdfjhf:jhfjj from xxxx_x_xx_L ;\n', '"xE\'", PUT(xx.xxxx.),"\'"']
['quit; \n']
['\n']
['/* 1.xxxxx FROM xxxx_x_Ex_x */ \n']
['proc sql; ', ';\n', '"TRUuuuth"']
['hhhjhfjs as fdsjfsj:\n']
['select * from djfkjd to jfkjs\n']
['(\n']
['SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj\n']
['\tFROM &xxx..xxx_xxx_xxE\n']
["where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and \n"]
['      ', ')\n', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv."]
[' );\n']
['\n']
['\n']
['jjjjjj;\n']
['\n']
['  select xx', ' jdfjhf:jhfjj from xxxx_x_xx_L ;\n', '"xE\'", PUT(xx.xxxx.),"\'"']
['quit; \n']
['\n']
['/* 1.xxxxx FROM xxxx_x_Ex_x */ \n']
['proc sql; ', ';\n', '"CUuuiiiiuth"']
['hhhjhfjs as fdsjfsj:\n']
['select * from djfkjd to jfkjs\n']
['(SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj\n']
['\tFROM &xxx..xxx_xxx_xxE\n']
["where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and \n"]
['      ', ')\n', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv."]
[' );']

我无法获得的预期输出应如下所示:

("xE'", PUT(xx.xxxx.),"'")
("TRUuuuth")
(
SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj
    FROM &xxx..xxx_xxx_xxE
where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and
      (xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))
 )
("xE'", PUT(xx.xxxx.),"'")
("CUuuiiiiuth")
(SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj
    FROM &xxx..xxx_xxx_xxE
where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and
      (xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))(( ))
 )

实施DarrylG的代码:

def parse(text):
    """Return only the parenthesised parts of *text*, parentheses included.

    Characters outside any parentheses are dropped, except that a newline is
    kept whenever the previously kept character is not already a newline.
    A ')' with no matching '(' is discarded.
    """
    kept = []
    depth = 0

    for ch in text:
        if ch == '(':
            depth += 1
        elif ch == ')' and depth:
            depth -= 1
        elif ch == '\n' and kept and kept[-1] != '\n':
            pass  # separator newline: keep it regardless of depth
        elif not depth:
            continue  # plain text (or stray ')') outside parentheses
        kept.append(ch)

    return ''.join(kept)


# Keyword spellings looked for in the extracted text.
checkhere = {"Select", "From", "select", "from", "SELECT", "FROM"}


with open('lan sample text file.txt', 'r') as fd:
    txt = fd.read()
    result = parse(txt)
    # parse() returns a single string, so looping over it directly yields
    # one character at a time and a multi-letter keyword can never be found
    # "in" it.  Filter the extracted text line by line instead, printing a
    # matching line once even if it contains several keywords.
    # NOTE(review): "chunk" is taken to mean one line of the extracted text;
    # confirm that this is the granularity wanted.
    for chunk in result.splitlines():
        if any(x in chunk for x in checkhere):
            print(chunk)

1 个答案:

答案 0 :(得分:0)

以下代码的输出与您期望的结果一致

def parse(text):
  """Return only the parenthesised parts of *text*, parentheses included.

  Text outside parentheses is dropped, except that a newline is kept when
  the previously kept character is not a newline.  An empty, un-nested pair
  '()' is removed entirely, and a ')' with no matching '(' is discarded.
  """
  out = []
  depth = 0

  for ch in text:
    if ch == '(':
      depth += 1
      out.append(ch)
      continue

    if ch == ')' and depth:
      # An opener immediately followed by its closer at the top level
      # is an empty group: drop the opener instead of adding the closer.
      if depth == 1 and out[-1] == '(':
        out.pop()
      else:
        out.append(ch)
      depth -= 1
      continue

    # At most one newline in a row between kept pieces of text.
    separator = ch == '\n' and out and out[-1] != '\n'
    if separator or depth:
      out.append(ch)

  return ''.join(out)

# Read the whole file at once so groups spanning several lines stay intact.
with open('test.txt', 'r') as fd:
  txt = fd.read()

# The file content is already in memory; extract and show the groups.
result = parse(txt)
print(result)

输出

("xE'", PUT(xx.xxxx.),"'")
("TRUuuuth")
(
SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj
    FROM &xxx..xxx_xxx_xxE
where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and
      (xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))
 )
("xE'", PUT(xx.xxxx.),"'")
("CUuuiiiiuth")
(SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj
    FROM &xxx..xxx_xxx_xxE
where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and
      (xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))(( ))
 )

以下代码针对更新后的问题:检索以特定字段开头的括号内容。

def find_field(field, text):
  """Extract the parenthesised group that follows each "(field" in *text*.

  An occurrence is a '(' followed by optional whitespace and then *field*,
  matched case-insensitively and literally (regex metacharacters in *field*
  have no special meaning).  For every occurrence the output gets
  '(' + field, followed by the first parenthesised group found after the
  field, nested parentheses included.  Text outside that group is skipped,
  except that a newline is kept when the previous output item is not a
  newline.  An empty '()' found first is dropped and ends the scan for that
  occurrence.

  The '(' that precedes the field is not counted as open, so the scan stops
  at the end of the first inner group, not at the ')' closing the field's
  own enclosing group.

  Args:
    field: Literal keyword to look for right after an opening parenthesis.
    text: The text to search.

  Returns:
    The pieces for all occurrences joined into one string, occurrences
    separated by a newline; '' when the field does not occur.
  """
  # Raw string keeps the regex escapes intact; re.escape makes the field
  # literal, matching the way it is echoed into the output below.
  pattern = re.compile(r'\(\s*' + re.escape(field), flags=re.IGNORECASE)

  result = []
  for m in pattern.finditer(text):
    parens_open = 0
    result.append('\n(' + field if result else '(' + field)

    # Start right after the matched field; starting one character later
    # would lose a '(' that directly follows the field.
    for char in text[m.end():]:
      if char == '(':
        parens_open += 1
        result.append(char)
      elif char == ')' and parens_open:
        if parens_open == 1 and result[-1] == '(':
          result.pop()  # Removes empty parens
        else:
          result.append(char)
        parens_open -= 1
        if parens_open == 0:
          break         # the group that followed the field is complete
      elif char == '\n' and result and result[-1] != '\n':
        result.append(char)
      elif parens_open:
        result.append(char)

  return ''.join(result)

# Demo: pull the groups that follow "(SELECT" out of the test file.
with open('test.txt', 'r') as fd:
  txt = fd.read()

# The text is fully read, so the search can run after the file is closed.
print(find_field("SELECT", txt))

输出

(SELECT((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and
      (xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))
(SELECT
((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and
      (xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))