Recursive dependency parsing and feature extraction

Date: 2019-02-01 09:22:10

Tags: python parsing dependencies feature-extraction

I am working in linguistics and have run into trouble while preparing data for machine learning. Here I have a sentence:

The Tibetan dialect off the province of Purik forms a link between Baltī and Ladakhī .

When I pass it through a dependency parser, it gives me the dependency tree in the form of a dictionary:

{(3, 'ROOT', 'S'): [
    {(1, 'det', 'L'): []},
    {(2, 'compound', 'L'): []},
    {(6, 'nmod', 'R'): [
        {(4, 'case', 'L'): []},
        {(5, 'det', 'L'): []},
        {(9, 'nmod', 'R'): [
            {(7, 'case', 'L'): []},
            {(8, 'amod', 'L'): []},
            {(11, 'dep', 'R'): [
                {(10, 'det', 'L'): []},
                {(13, 'nmod', 'R'): [
                    {(12, 'case', 'L'): []},
                    {(14, 'cc', 'R'): []},
                    {(15, 'conj', 'R'): []}]}]}]}]},
    {(16, 'punct', 'R'): []}]}

When I traverse the dictionary, I can extract all of its nodes. I get the following result:

(3, 'ROOT', 'S')
(1, 'det', 'L')
(2, 'compound', 'L')
(6, 'nmod', 'R')
(4, 'case', 'L')
(5, 'det', 'L')
(9, 'nmod', 'R')
(7, 'case', 'L')
(8, 'amod', 'L')
(11, 'dep', 'R')
(10, 'det', 'L')
(13, 'nmod', 'R')
(12, 'case', 'L')
(14, 'cc', 'R')
(15, 'conj', 'R')
(16, 'punct', 'R')

The problem here is that I lose the parent information: I can no longer tell which node is a child of which node. What I actually need is to keep track of each parent node, so that I can extract every node that has children, treat it as the root of its own subtree, and pass that part of the sentence back to the dependency parser to obtain a new root (head word).
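
For example, I imagine the traversal would have to carry the parent along, something like this rough sketch over the dictionary shown above (walk_with_parent is just a name I made up):

def walk_with_parent(tree, parent=None):
    # Same depth-first walk as handle_tree below, but the parent key is
    # carried along so the parent/child relation is not lost.
    for node, children in tree.items():
        yield (parent, node)
        for child in children:
            # recurse with the current node as the new parent
            yield from walk_with_parent(child, parent=node)

# for parent, node in walk_with_parent(parsed_tree):
#     print(parent, '->', node)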

From this dictionary I can get the keys as tuples, but my dependency parser expects its input as a string.
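
What I picture is rebuilding that string from the token indices stored in the tuples, roughly like this (subtree_indices and subtree_text are names I made up; tokens would be parse['sentences'][0]['tokens'] from CoreNLP, where token['index'] is 1-based, just like the indices in my tree):

def subtree_indices(tree):
    # Collect every token index that occurs in a subtree.
    indices = []
    for (n, rel, p), children in tree.items():
        indices.append(n)
        for child in children:
            indices.extend(subtree_indices(child))
    return indices


def subtree_text(tree, tokens):
    # Map token indices back to words and join them in sentence order.
    words = {t['index']: t['word'] for t in tokens}
    return ' '.join(words[i] for i in sorted(subtree_indices(tree)))

Since I set 'tokenize.whitespace' to 'True', joining the words with single spaces should tokenize back into the same tokens when the fragment is re-parsed.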

Here is my complete code:

import json
import os
import xml.etree.ElementTree as ET
import itertools

from stanfordcorenlp import StanfordCoreNLP


def feature_extraction():
    fes = []
    CORE_NLP_DIR = '/Users/ahmed/Desktop/Pycharm/stanford-library'
    PARSER = StanfordCoreNLP(CORE_NLP_DIR, memory='8g', lang='en')
    props = {'annotators': 'tokenize,pos,lemma,depparse', 'tokenize.whitespace': 'True', 'ssplit.isOneSentence': 'True'}

    for f in os.listdir('/Users/ahmed/Desktop/Pycharm/test/fulltext/fulltext/')[0:1]:
        root = ET.parse('/Users/ahmed/Desktop/Pycharm/test/fulltext/fulltext/' + f).getroot()
        for sent in root.iter('{http://framenet.icsi.berkeley.edu}sentence'):
            sent_text = sent.find('{http://framenet.icsi.berkeley.edu}text').text
            annotations = sent.findall('{http://framenet.icsi.berkeley.edu}annotationSet')
            sent_annotation_info = []
            for annotation in annotations:
                layers = annotation.findall('{http://framenet.icsi.berkeley.edu}layer')
                annotation_info = []
                for layer in layers:
                    target = ''
                    if layer.get('name') == 'fe':
                        labels = layer.findall('{http://framenet.icsi.berkeley.edu}label')
                        fes = [(label.get('name'), label.get('start'), label.get('end'),
                                sent_text[int(label.get('start')):int(label.get('end')) + 1]) for label in labels]
                    elif layer.get('name') == 'target':
                        label = layer.find('{http://framenet.icsi.berkeley.edu}label')
                        target = (label.get('name'), label.get('start'), label.get('end'),
                                  sent_text[int(label.get('start')):int(label.get('end')) + 1])
                    if fes and target:
                        annotation_info.append((target, fes))
                    # print(annotation_info)
                sent_annotation_info.append(annotation_info)
            if sent_text != '\n':
                parse = json.loads(PARSER.annotate(sent_text, properties=props))

                dependencies = get_dependencies(parse['sentences'][0]['basicDependencies'])
                parsed_tree = parse_tree(dependencies)

                extract_features(sent_annotation_info, parsed_tree, parse, sent_text)
                handle_tree(parsed_tree)
                print("- =" * 10)
                # print(list(powerset(node_list)))
    PARSER.close()


def extract_features(annotations, tree, parse, sent_text):
    if annotations:
        for annotation in annotations:
            for ((t, s, e, w), fe) in annotation:
                for token in parse['sentences'][0]['tokens']:
                    # Match the annotated target span against the token's
                    # character offsets (the FrameNet 'end' offset is inclusive,
                    # CoreNLP's characterOffsetEnd is exclusive, hence the +1).
                    if token['characterOffsetBegin'] == int(s) and token['characterOffsetEnd'] == int(e) + 1:
                        print('#' * 25)
                        print(sent_text)
                        print(tree)
                        print('- ' * 10)


def node_feature_extraction(data):
    fes = []
    CORE_NLP_DIR = '/Users/ahmed/Desktop/Pycharm/stanford-library'
    PARSER = StanfordCoreNLP(CORE_NLP_DIR, memory='8g', lang='en')
    props = {'annotators': 'tokenize,pos,lemma,depparse', 'tokenize.whitespace': 'True', 'ssplit.isOneSentence': 'True'}

    root = ET.parse(data).getroot()
    for sent in root.iter('{http://framenet.icsi.berkeley.edu}sentence'):
        sent_text = sent.find('{http://framenet.icsi.berkeley.edu}text').text
        annotations = sent.findall('{http://framenet.icsi.berkeley.edu}annotationSet')
        sent_annotation_info = []
        for annotation in annotations:
            layers = annotation.findall('{http://framenet.icsi.berkeley.edu}layer')
            annotation_info = []
            for layer in layers:
                target = ''
                if layer.get('name') == 'fe':
                    labels = layer.findall('{http://framenet.icsi.berkeley.edu}label')
                    fes = [(label.get('name'), label.get('start'), label.get('end'),
                            sent_text[int(label.get('start')):int(label.get('end')) + 1]) for label in labels]
                elif layer.get('name') == 'target':
                    label = layer.find('{http://framenet.icsi.berkeley.edu}label')
                    target = (label.get('name'), label.get('start'), label.get('end'),
                              sent_text[int(label.get('start')):int(label.get('end')) + 1])
                if fes and target:
                    annotation_info.append((target, fes))
                # print(annotation_info)
            sent_annotation_info.append(annotation_info)
        if sent_text != '\n':
            parse = json.loads(PARSER.annotate(sent_text, properties=props))

            dependencies = get_dependencies(parse['sentences'][0]['basicDependencies'])
            parsed_tree = parse_tree(dependencies)

            extract_features(sent_annotation_info, parsed_tree, parse, sent_text)
            handle_tree(parsed_tree)
            print("- =" * 10)
    PARSER.close()


def get_dependencies(dependencies):
    # Map each dependent token index to its (relation, governor index) pair.
    stanford_dep_sent = {}
    for dependency_info in dependencies:
        relation_name = dependency_info['dep']
        gov_index = int(dependency_info['governor'])
        stanford_dep_sent[dependency_info['dependent']] = (relation_name, gov_index)
    return stanford_dep_sent


def parse_tree(dependencies):
    # Find the token whose relation is ROOT and build the tree from it.
    for d in dependencies:
        rel, g = dependencies[d]
        if rel == 'ROOT':
            return build_node(d, 'ROOT', 'S', dependencies)


def find_dependents2(h, dependencies):
    # Collect all (dependent index, relation) pairs governed by head h.
    dependents = []
    for d in dependencies:
        (rel, g) = dependencies[d]
        if g == h:
            dependents.append((d, rel))
    return dependents


def build_node(n, rel, p, dependencies):
    # Recursively build a {(index, relation, position): [children]} node;
    # position is 'L' or 'R' depending on whether the dependent comes
    # before or after its head in the sentence.
    dependents = find_dependents2(n, dependencies)
    if not dependents:
        return {(n, rel, p): []}
    children = []
    for (d, rell) in dependents:
        pp = 'R' if d > n else 'L'
        children.append(build_node(d, rell, pp, dependencies))
    return {(n, rel, p): children}


def handle_tree(tree):
    # Depth-first traversal: handle each node, then recurse into its
    # children. The parent is not passed down, which is exactly where
    # the parent/child information is lost.
    for node in tree:
        handle_node(node)
        for child in tree[node]:
            handle_tree(child)
            # node_feature_extraction(child)  # where I would re-parse the subtree


def handle_node(node):
    print(node)


def powerset(iterable):
    s = list(iterable)
    return itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(len(s) + 1))


if __name__ == "__main__":
    feature_extraction()
    # dependency_tree_parse()
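
Putting it together, I think the missing piece would be something along these lines (again only a sketch, reusing the hypothetical subtree_text from above):

def reparse_subtrees(tree, tokens, parser, props):
    # Every node that has children is treated as the root of its own
    # subtree; its token span is turned back into a string and re-parsed
    # to obtain a new root (head word) for that fragment.
    for node, children in tree.items():
        if children:
            fragment = subtree_text({node: children}, tokens)
            subparse = json.loads(parser.annotate(fragment, properties=props))
            # ... the new head of the fragment would be read off here
        for child in children:
            reparse_subtrees(child, tokens, parser, props)

Is this the right way to approach it, or is there a cleaner way to keep the parent information during the traversal?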

0 Answers