打印Penn Tree Bank格式的平衡括号

时间:2015-02-05 02:39:03

标签: python recursion tree nlp

我想按 Penn Treebank 的风格打印解析树,但始终无法让括号正确配对(令人沮丧——解析器本身工作正常,只是输出的格式不对!)。关于这类递归函数,有什么提示或技巧吗?

这是我当前的打印方法;我以解析结果的起始(start)节点作为参数来调用这个函数。

def print_tree(current_node, parents, side):
    """Print the subtree rooted at ``current_node`` as indented brackets.

    Output is written to stdout with ``print``; nothing is returned.

    Args:
        current_node: Node to print. The code reads its ``is_terminal``,
            ``lhs``, ``word_label``, ``left_child`` and ``right_child``
            attributes.
        parents: Nesting level of ``current_node``. It is incremented by one
            on every recursive call and is used both as the indentation
            level and, for a right-hand terminal, as the number of closing
            parentheses to emit.
        side: ``'left'`` selects the left-terminal format; any other value
            is handled by the ``else`` branch (recursive calls pass
            ``'right'``).
    """
    # Emits the raw nesting level on a line of its own before the node.
    print(parents)
    # Indentation: one chunk per level, written without a trailing newline.
    for i in range(parents):
        print("     ", " ", end="")
    # NOTE(review): ``is_terminal`` is read as an attribute, not called; if it
    # is a method on the node class this test is always true -- confirm.
    if current_node.is_terminal:
        if side == 'left':
            # Left terminal: a complete "(LABEL word )" on its own line.
            print("(" + current_node.lhs, current_node.word_label + " )")
        else: 
            # Right terminal: print the leaf, then one ")" per nesting level.
            print("(" + current_node.lhs, current_node.word_label +")", "", end ="")
            # This closes a bracket for EVERY ancestor, whether or not that
            # ancestor still has children to print -- the source of the
            # unbalanced parentheses in the output.
            for i in range(parents):
                print(")", "", end="")
            print()
    else:
        # Non-terminal: only the opening bracket and label are printed here.
        print("( " +current_node.lhs)
    # Recurse into whichever children exist, one level deeper.
    if current_node.left_child != None:
        print_tree(current_node.left_child, parents+1, 'left')
    if current_node.right_child != None:
        print_tree(current_node.right_child, parents+1, 'right')

我得到的结果:

( TOP
              ( S_VP
                     (VB 'List' )
                     ( NP
                            ( NP
                                   (DT 'the' )
                                   (NNS 'flights') ) ) ) ) ) 
                            ( PP
                                   ( PP
                                          (IN 'from' )
                                          (NP_NNP 'Baltimore') ) ) ) ) ) ) 
                                   ( PP
                                          (TO 'to' )
                                          ( NP
                                                 (NP_NNP 'Seattle' )
                                                 ( NP
                                                        ( NP
                                                               (DT 'that' )
                                                               (NN 'stop') ) ) ) ) ) ) ) ) ) 
                                                        ( PP
                                                               (IN 'in' )
                                                               (NP_NNP 'Minneapolis') ) ) ) ) ) ) ) ) ) 
              (PUNC '.') ) ) 

期望的结果:

( TOP
              ( S_VP
                     (VB 'List' )
                     ( NP
                            ( NP
                                   (DT 'the' )
                                   (NNS 'flights') )
                            ( PP
                                   ( PP
                                          (IN 'from' )
                                          (NP_NNP 'Baltimore')  )
                                   ( PP
                                          (TO 'to' )
                                          ( NP
                                                 (NP_NNP 'Seattle' ))
                                                 ( NP
                                                        ( NP
                                                               (DT 'that' )
                                                               (NN 'stop') )
                                                        ( PP
                                                               (IN 'in' )
                                                               (NP_NNP 'Minneapolis')))))
              (PUNC '.') ) ))

我试图想办法把它作为递归/缩进次数的函数来做,但是它并没有取得多大的成功。

1 个答案:

答案 0 :(得分:0)

在每个终端节点(叶子)上,您都为所有父节点打印了 `)`;实际上只应该为连续处于右侧的父节点打印。

我建议将parents重命名为depth并添加right_depth参数。

编辑:在尝试了一番之后,我认为把打印工作交给树本身来完成会更好:

class Node:
    """Binary parse-tree node that renders itself as indented brackets.

    A node is either a terminal (it carries a ``word_label`` and has no
    children) or a non-terminal (it carries a left and a right child).
    """

    # Default indentation unit; repeated once per nesting level.
    INDENT = "    "

    __slots__ = ["lhs", "word_label", "left_child", "right_child"]

    def __init__(self, lhs, *args):
        """Create a terminal or a non-terminal node.

        Args:
            lhs: Label of the node.
            *args: Either one value (the word label of a terminal) or two
                values (the left and right child of a non-terminal).

        Raises:
            ValueError: If the number of extra arguments is not 1 or 2.
        """
        self.lhs = lhs
        num_args = len(args)
        if num_args == 1:
            self.word_label = args[0]
            self.left_child = None
            self.right_child = None
        elif num_args == 2:
            self.word_label = None
            self.left_child, self.right_child = args
        else:
            raise ValueError("should have one arg (word_label: str) or two args (left: Node and right: Node)")

    def is_terminal(self):
        """Return True when the node carries a word label."""
        return self.word_label is not None

    def tree_str(self, depth=0, indent=None):
        """Return the bracketed rendering of this subtree.

        Every node starts on a new line, so the returned string begins with
        a newline character.

        Args:
            depth: Nesting level of this node; controls its indentation.
            indent: Indentation unit; defaults to ``INDENT``.

        Returns:
            The rendering as a single string.
        """
        if indent is None:
            indent = self.INDENT
        padding = indent * depth
        if self.is_terminal():
            return "\n{}({} '{}' )".format(padding, self.lhs, self.word_label)
        # Skip absent children instead of failing on ``None.tree_str``.
        children = "".join(
            child.tree_str(depth + 1, indent)
            for child in (self.left_child, self.right_child)
            if child is not None
        )
        return "\n{}( {}{} )".format(padding, self.lhs, children)

    def __str__(self):
        """Return the rendering without the leading newline.

        ``tree_str`` prefixes every node with a newline; dropping the first
        one keeps ``print(node)`` from starting with a blank line.
        """
        return self.tree_str().lstrip("\n")

然后是一些语法上的辅助函数:

def make_leaf_type(name):
    """Build a constructor for terminal nodes labelled ``name``.

    Args:
        name: Label passed as the first argument of every ``Node`` the
            returned function creates; also used as the function's name.

    Returns:
        A one-argument function ``fn(x)`` that returns ``Node(name, x)``.
    """
    def fn(x):
        return Node(name, x)

    # Set both names: repr() reports __qualname__, which would otherwise
    # stay "make_leaf_type.<locals>.fn" for every generated constructor.
    fn.__name__ = name
    fn.__qualname__ = name
    return fn

# Bind one leaf constructor per tag as a module-level name (VB, DT, ...).
# globals() is used rather than locals(): writing through locals() only takes
# effect at module scope and silently does nothing inside a function.
for leaf_type in ("VB", "DT", "NNS", "IN", "NP_NNP", "TO", "NN", "PUNC"):
    globals()[leaf_type] = make_leaf_type(leaf_type)

def make_node_type(name):
    """Build a constructor for non-terminal nodes labelled ``name``.

    Args:
        name: Label passed as the first argument of every ``Node`` the
            returned function creates; also used as the function's name.

    Returns:
        A two-argument function ``fn(l, r)`` that returns
        ``Node(name, l, r)``.
    """
    def fn(l, r):
        return Node(name, l, r)

    # Set both names: repr() reports __qualname__, which would otherwise
    # stay "make_node_type.<locals>.fn" for every generated constructor.
    fn.__name__ = name
    fn.__qualname__ = name
    return fn

# Bind one non-terminal constructor per label as a module-level name.
# globals() is used rather than locals(): writing through locals() only takes
# effect at module scope and silently does nothing inside a function.
for node_type in ("TOP", "S_VP", "NP", "PP"):
    globals()[node_type] = make_node_type(node_type)

所以我可以创建树,

# Example parse, built with the generated constructors; each call nests the
# same way the printed brackets do. Pairs of leaves are kept on one line.
tree = TOP(
    S_VP(
        VB('List'),
        NP(
            NP(DT('the'), NNS('flights')),
            PP(
                PP(IN('from'), NP_NNP('Baltimore')),
                PP(
                    TO('to'),
                    NP(
                        NP_NNP('Seattle'),
                        NP(
                            NP(DT('that'), NN('stop')),
                            PP(IN('in'), NP_NNP('Minneapolis')),
                        ),
                    ),
                ),
            ),
        ),
    ),
    PUNC('.'),
)

然后像这样打印:

>>> print(tree)
( TOP
    ( S_VP
        (VB 'List' )
        ( NP
            ( NP
                (DT 'the' )
                (NNS 'flights' ) )
            ( PP
                ( PP
                    (IN 'from' )
                    (NP_NNP 'Baltimore' ) )
                ( PP
                    (TO 'to' )
                    ( NP
                        (NP_NNP 'Seattle' )
                        ( NP
                            ( NP
                                (DT 'that' )
                                (NN 'stop' ) )
                            ( PP
                                (IN 'in' )
                                (NP_NNP 'Minneapolis' ) ) ) ) ) ) ) )
    (PUNC '.' ) )

我认为这才是实际想要的结果。