I work in linguistics, and I have run into trouble preparing data for machine learning. Here is an example sentence:
The Tibetan dialect off the province of Purik forms a link between Baltī and Ladakhī .
When I pass it through the dependency parser, it gives me the dependency tree in the form of a dictionary:
{(3, 'ROOT', 'S'): [
    {(1, 'det', 'L'): []},
    {(2, 'compound', 'L'): []},
    {(6, 'nmod', 'R'): [
        {(4, 'case', 'L'): []},
        {(5, 'det', 'L'): []},
        {(9, 'nmod', 'R'): [
            {(7, 'case', 'L'): []},
            {(8, 'amod', 'L'): []},
            {(11, 'dep', 'R'): [
                {(10, 'det', 'L'): []},
                {(13, 'nmod', 'R'): [
                    {(12, 'case', 'L'): []},
                    {(14, 'cc', 'R'): []},
                    {(15, 'conj', 'R'): []}
                ]}
            ]}
        ]}
    ]},
    {(16, 'punct', 'R'): []}
]}
Traversing the dictionary recursively, I can extract all of the node keys. Each key is a tuple of (token index, dependency relation, 'L'/'R' for whether the token sits to the left or right of its head). I get the following result:
(3, 'ROOT', 'S')
(1, 'det', 'L')
(2, 'compound', 'L')
(6, 'nmod', 'R')
(4, 'case', 'L')
(5, 'det', 'L')
(9, 'nmod', 'R')
(7, 'case', 'L')
(8, 'amod', 'L')
(11, 'dep', 'R')
(10, 'det', 'L')
(13, 'nmod', 'R')
(12, 'case', 'L')
(14, 'cc', 'R')
(15, 'conj', 'R')
(16, 'punct', 'R')
The problem is that this flat traversal loses the structure: I no longer know which node is the child of which. What I really need is to keep track of each node's head during the traversal, so that I can take a node that has children, treat it as the root (head word) of a sub-phrase, and pass that part of the sentence back to the dependency parser to get a new parse. From this dictionary I can retrieve the keys as tuples, but my dependency parser expects its input as a string.
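To make the goal concrete, here is a rough sketch of the kind of traversal I am after (walk_with_parent is just a placeholder name, not something in my code): it walks the same nested dictionary but yields every node together with the key of its head, so the parent link is not lost:

def walk_with_parent(tree, parent=None):
    # Yield (parent_key, node_key) for every node in the nested dictionary,
    # so each node stays attached to its head.
    for node, children in tree.items():
        yield parent, node
        for child in children:
            yield from walk_with_parent(child, parent=node)

On the dictionary above this would yield (None, (3, 'ROOT', 'S')) first, then ((3, 'ROOT', 'S'), (1, 'det', 'L')), and so on.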
Here is my complete code:
import json
import os
import xml.etree.ElementTree as ET
import itertools
from stanfordcorenlp import StanfordCoreNLP

def feature_extraction():
    # Walk the FrameNet full-text XML files, collect (target, frame elements)
    # annotations for each sentence, then run the sentence through CoreNLP.
    fes = []
    CORE_NLP_DIR = '/Users/ahmed/Desktop/Pycharm/stanford-library'
    PARSER = StanfordCoreNLP(CORE_NLP_DIR, memory='8g', lang='en')
    props = {'annotators': 'tokenize,pos,lemma,depparse',
             'tokenize.whitespace': 'True',
             'ssplit.isOneSentence': 'True'}
    for f in os.listdir('/Users/ahmed/Desktop/Pycharm/test/fulltext/fulltext/')[0:1]:
        root = ET.parse('/Users/ahmed/Desktop/Pycharm/test/fulltext/fulltext/' + f).getroot()
        for sent in root.iter('{http://framenet.icsi.berkeley.edu}sentence'):
            sent_text = sent.find('{http://framenet.icsi.berkeley.edu}text').text
            annotations = sent.findall('{http://framenet.icsi.berkeley.edu}annotationSet')
            sent_annotation_info = []
            for annotation in annotations:
                layers = annotation.findall('{http://framenet.icsi.berkeley.edu}layer')
                annotation_info = []
                for layer in layers:
                    target = ''
                    if layer.get('name') == 'fe':
                        labels = layer.findall('{http://framenet.icsi.berkeley.edu}label')
                        fes = [(label.get('name'), label.get('start'), label.get('end'),
                                sent_text[int(label.get('start')):int(label.get('end')) + 1])
                               for label in labels]
                    elif layer.get('name') == 'target':
                        label = layer.find('{http://framenet.icsi.berkeley.edu}label')
                        target = (label.get('name'), label.get('start'), label.get('end'),
                                  sent_text[int(label.get('start')):int(label.get('end')) + 1])
                    if fes and target:
                        annotation_info.append((target, fes))
                sent_annotation_info.append(annotation_info)
            if sent_text != '\n':
                parse = json.loads(PARSER.annotate(sent_text, properties=props))
                dependencies = get_dependencies(parse['sentences'][0]['basicDependencies'])
                parsed_tree = parse_tree(dependencies)
                extract_features(sent_annotation_info, parsed_tree, parse, sent_text)
                handle_tree(parsed_tree)
                print("- =" * 10)
    PARSER.close()

def extract_features(annotations, tree, parse, sent_text):
    # For every annotated target, find the CoreNLP token whose character
    # offsets match the target span, then dump the sentence and its tree.
    if annotations:
        for annotation in annotations:
            for ((t, s, e, w), fe) in annotation:
                for token in parse['sentences'][0]['tokens']:
                    if token['characterOffsetBegin'] == int(s) and token['characterOffsetEnd'] == int(e) + 1:
                        print('#' * 25)
                        print(sent_text)
                        print(tree)
                        print('- ' * 10)

def node_feature_extraction(data):
    # Same pipeline as feature_extraction(), but for a single XML file
    # passed in as `data` instead of a whole directory.
    fes = []
    CORE_NLP_DIR = '/Users/ahmed/Desktop/Pycharm/stanford-library'
    PARSER = StanfordCoreNLP(CORE_NLP_DIR, memory='8g', lang='en')
    props = {'annotators': 'tokenize,pos,lemma,depparse',
             'tokenize.whitespace': 'True',
             'ssplit.isOneSentence': 'True'}
    root = ET.parse(data).getroot()
    for sent in root.iter('{http://framenet.icsi.berkeley.edu}sentence'):
        sent_text = sent.find('{http://framenet.icsi.berkeley.edu}text').text
        annotations = sent.findall('{http://framenet.icsi.berkeley.edu}annotationSet')
        sent_annotation_info = []
        for annotation in annotations:
            layers = annotation.findall('{http://framenet.icsi.berkeley.edu}layer')
            annotation_info = []
            for layer in layers:
                target = ''
                if layer.get('name') == 'fe':
                    labels = layer.findall('{http://framenet.icsi.berkeley.edu}label')
                    fes = [(label.get('name'), label.get('start'), label.get('end'),
                            sent_text[int(label.get('start')):int(label.get('end')) + 1])
                           for label in labels]
                elif layer.get('name') == 'target':
                    label = layer.find('{http://framenet.icsi.berkeley.edu}label')
                    target = (label.get('name'), label.get('start'), label.get('end'),
                              sent_text[int(label.get('start')):int(label.get('end')) + 1])
                if fes and target:
                    annotation_info.append((target, fes))
            sent_annotation_info.append(annotation_info)
        if sent_text != '\n':
            parse = json.loads(PARSER.annotate(sent_text, properties=props))
            dependencies = get_dependencies(parse['sentences'][0]['basicDependencies'])
            parsed_tree = parse_tree(dependencies)
            extract_features(sent_annotation_info, parsed_tree, parse, sent_text)
            handle_tree(parsed_tree)
            print("- =" * 10)
    PARSER.close()

def get_dependencies(dependencies):
    # Map each dependent token index to a (relation, governor index) pair.
    stanford_dep_sent = {}
    for dependency_info in dependencies:
        relation_name = dependency_info['dep']
        gov_index = int(dependency_info['governor'])
        stanford_dep_sent[dependency_info['dependent']] = (relation_name, gov_index)
    return stanford_dep_sent

def parse_tree(dependencies):
    # Find the token governed by ROOT and build the tree down from it.
    for d in dependencies:
        rel, g = dependencies[d]
        if rel == 'ROOT':
            return build_node(d, 'ROOT', 'S', dependencies)

def find_dependents2(h, dependencies):
    # Return an (index, relation) pair for every token whose governor is h.
    dependents = []
    for d in dependencies:
        (rel, g) = dependencies[d]
        if g == h:
            dependents.append((d, rel))
    return dependents

def build_node(n, rel, p, dependencies):
    # Recursively build {(index, relation, side): [child subtrees]}.
    dependents = find_dependents2(n, dependencies)
    if not dependents:
        return {(n, rel, p): []}
    children = []
    for (d, rell) in dependents:
        # 'R' if the dependent follows its head, 'L' if it precedes it.
        pp = 'R' if d > n else 'L'
        children.append(build_node(d, rell, pp, dependencies))
    return {(n, rel, p): children}

def handle_tree(tree):
    # Depth-first walk that prints every node key; the link to the parent
    # is lost here, which is exactly the problem described above.
    for node in tree:
        handle_node(node)
        for child in tree[node]:
            handle_tree(child)
            # node_feature_extraction(child)

def handle_node(node):
    print(node)

def powerset(iterable):
    # Every subset of `iterable`, from the empty set up to the full set.
    s = list(iterable)
    return itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(len(s) + 1))

if __name__ == "__main__":
    feature_extraction()
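
And this is roughly what I imagine for turning a subtree back into a string that I can feed to the parser again (subtree_indices and subtree_text are placeholder names; I am assuming every token in parse['sentences'][0]['tokens'] has an 'index' and a 'word' field, which matches the JSON CoreNLP returns to me):

def subtree_indices(tree):
    # Collect the token indices of every node in a (sub)tree.
    indices = []
    for (idx, rel, side), children in tree.items():
        indices.append(idx)
        for child in children:
            indices.extend(subtree_indices(child))
    return indices

def subtree_text(tree, tokens):
    # Rebuild the surface string for a subtree from the CoreNLP token list,
    # so it can be passed back to PARSER.annotate() as a plain string.
    words = {t['index']: t['word'] for t in tokens}
    return ' '.join(words[i] for i in sorted(subtree_indices(tree)))

Is there a clean way to keep the head/child links while traversing, so that this kind of subtree extraction works reliably?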