解析嵌套字典(allen nlp hierplane_tree)

时间:2020-12-19 14:21:49

标签: python json allennlp

我正在尝试解析 AllenNLP 预测器返回的 JSON 对象。我找到了一个可以查找所有子节点值的辅助函数,但我真正想做的是:给定一个实体(例如 “man”),从依存树的 JSON 对象中取出与该实体关联的属性。


依存树中包含与该实体关联的 wearing、blue、shirt 等词。如何在该结构中取回与 man 关联的 JSON 块?我不确定应该如何修改我的辅助函数,或者另写一个函数,以便从 JSON 输出中获取该块。任何帮助或建议都将不胜感激。

AllenNLP 代码:

from allennlp.predictors.predictor import Predictor
# NOTE(review): presumably imported only for its side effect of registering
# the structured-prediction models with AllenNLP — confirm.
import allennlp_models.structured_prediction

# Sentence whose dependency structure we want to inspect.
text = "When I was walking to the park yesterday, I saw a man wearing a blue shirt."

# Load the predictor from the published biaffine dependency parser archive.
predictor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/biaffine-dependency-parser-ptb-2020.04.06.tar.gz"
)

# Run the parser and keep only the 'hierplane_tree' entry of the prediction.
tree = predictor.predict(sentence=text)['hierplane_tree']


"""Extract nested values from a JSON tree."""

def json_extract(obj, key):
    """Collect every scalar value stored under ``key`` in a nested structure.

    The structure is walked depth-first, visiting dict entries in their
    insertion order and list items in sequence.

    Args:
        obj: A nested combination of dicts and lists (e.g. decoded JSON).
            Any other type yields an empty result.
        key: The dictionary key whose values should be collected.

    Returns:
        A list of the matching values in traversal order. Values that are
        themselves dicts or lists are descended into rather than collected,
        even when they are stored under ``key``.
    """
    found = []

    def extract(node):
        """Walk ``node`` and append matching values to ``found``."""
        if isinstance(node, dict):
            for k, v in node.items():
                if isinstance(v, (dict, list)):
                    extract(v)
                elif k == key:
                    found.append(v)
        elif isinstance(node, list):
            for item in node:
                extract(item)

    extract(obj)
    return found


# Collect the value of every `word` key found anywhere in the parsed tree.
children = json_extract(tree, 'word')

['walking', 'When', 'I', 'was', 'to', 'park', 'the', 'yesterday', ',', 'saw', 'I', 'man', 'a', 'wearing', 'shirt', 'a', 'blue', '.']

期望提取的 JSON(当我提供 “man” 时希望得到的内容):

{'word': 'man',
        'nodeType': 'dep',
        'attributes': ['NOUN'],
        'link': 'dep',
        'spans': [{'start': 51, 'end': 55}],
        'children': [{'word': 'a',
          'nodeType': 'det',
          'attributes': ['DET'],
          'link': 'det',
          'spans': [{'start': 49, 'end': 51}]},
         {'word': 'wearing',
          'nodeType': 'dep',
          'attributes': ['VERB'],
          'link': 'dep',
          'spans': [{'start': 55, 'end': 63}],
          'children': [{'word': 'shirt',
            'nodeType': 'dep',
            'attributes': ['NOUN'],
            'link': 'dep',
            'spans': [{'start': 70, 'end': 76}],
            'children': [{'word': 'a',
              'nodeType': 'dep',
              'attributes': ['DET'],
              'link': 'dep',
              'spans': [{'start': 63, 'end': 65}]},
             {'word': 'blue',
              'nodeType': 'dep',
              'attributes': ['ADJ'],
              'link': 'dep',
              'spans': [{'start': 65, 'end': 70}]}]}]}]}

JSON 输出:

{'text': 'When I was walking to the park yesterday , I saw a man wearing a blue shirt .',
 'root': {'word': 'walking',
  'nodeType': 'root',
  'attributes': ['VERB'],
  'link': 'root',
  'spans': [{'start': 11, 'end': 19}],
  'children': [{'word': 'When',
    'nodeType': 'dep',
    'attributes': ['ADV'],
    'link': 'dep',
    'spans': [{'start': 0, 'end': 5}]},
   {'word': 'I',
    'nodeType': 'nsubj',
    'attributes': ['PRON'],
    'link': 'nsubj',
    'spans': [{'start': 5, 'end': 7}]},
   {'word': 'was',
    'nodeType': 'aux',
    'attributes': ['AUX'],
    'link': 'aux',
    'spans': [{'start': 7, 'end': 11}]},
   {'word': 'to',
    'nodeType': 'prep',
    'attributes': ['ADP'],
    'link': 'prep',
    'spans': [{'start': 19, 'end': 22}],
    'children': [{'word': 'park',
      'nodeType': 'pobj',
      'attributes': ['NOUN'],
      'link': 'pobj',
      'spans': [{'start': 26, 'end': 31}],
      'children': [{'word': 'the',
        'nodeType': 'det',
        'attributes': ['DET'],
        'link': 'det',
        'spans': [{'start': 22, 'end': 26}]}]}]},
   {'word': 'yesterday',
    'nodeType': 'tmod',
    'attributes': ['NOUN'],
    'link': 'tmod',
    'spans': [{'start': 31, 'end': 41}]},
   {'word': ',',
    'nodeType': 'dep',
    'attributes': ['PUNCT'],
    'link': 'dep',
    'spans': [{'start': 41, 'end': 43}],
    'children': [{'word': 'saw',
      'nodeType': 'dep',
      'attributes': ['VERB'],
      'link': 'dep',
      'spans': [{'start': 45, 'end': 49}],
      'children': [{'word': 'I',
        'nodeType': 'nsubj',
        'attributes': ['PRON'],
        'link': 'nsubj',
        'spans': [{'start': 43, 'end': 45}]},
       {'word': 'man',
        'nodeType': 'dep',
        'attributes': ['NOUN'],
        'link': 'dep',
        'spans': [{'start': 51, 'end': 55}],
        'children': [{'word': 'a',
          'nodeType': 'det',
          'attributes': ['DET'],
          'link': 'det',
          'spans': [{'start': 49, 'end': 51}]},
         {'word': 'wearing',
          'nodeType': 'dep',
          'attributes': ['VERB'],
          'link': 'dep',
          'spans': [{'start': 55, 'end': 63}],
          'children': [{'word': 'shirt',
            'nodeType': 'dep',
            'attributes': ['NOUN'],
            'link': 'dep',
            'spans': [{'start': 70, 'end': 76}],
            'children': [{'word': 'a',
              'nodeType': 'dep',
              'attributes': ['DET'],
              'link': 'dep',
              'spans': [{'start': 63, 'end': 65}]},
             {'word': 'blue',
              'nodeType': 'dep',
              'attributes': ['ADJ'],
              'link': 'dep',
              'spans': [{'start': 65, 'end': 70}]}]}]}]}]}]},
   {'word': '.',
    'nodeType': 'punct',
    'attributes': ['PUNCT'],
    'link': 'punct',
    'spans': [{'start': 76, 'end': 78}]}]},
 'nodeTypeToStyle': {'root': ['color5', 'strong'],
  'dep': ['color5', 'strong'],
  'nsubj': ['color1'],
  'nsubjpass': ['color1'],
  'csubj': ['color1'],
  'csubjpass': ['color1'],
  'pobj': ['color2'],
  'dobj': ['color2'],
  'iobj': ['color2'],
  'mark': ['color2'],
  'pcomp': ['color2'],
  'xcomp': ['color2'],
  'ccomp': ['color2'],
  'acomp': ['color2'],
  'aux': ['color3'],
  'cop': ['color3'],
  'det': ['color3'],
  'conj': ['color3'],
  'cc': ['color3'],
  'prep': ['color3'],
  'number': ['color3'],
  'possesive': ['color3'],
  'poss': ['color3'],
  'discourse': ['color3'],
  'expletive': ['color3'],
  'prt': ['color3'],
  'advcl': ['color3'],
  'mod': ['color4'],
  'amod': ['color4'],
  'tmod': ['color4'],
  'quantmod': ['color4'],
  'npadvmod': ['color4'],
  'infmod': ['color4'],
  'advmod': ['color4'],
  'appos': ['color4'],
  'nn': ['color4'],
  'neg': ['color0'],
  'punct': ['color0']},
 'linkToPosition': {'nsubj': 'left',
  'nsubjpass': 'left',
  'csubj': 'left',
  'csubjpass': 'left',
  'pobj': 'right',
  'dobj': 'right',
  'iobj': 'right',
  'pcomp': 'right',
  'xcomp': 'right',
  'ccomp': 'right',
  'acomp': 'right'}}

1 个答案:

这段代码无疑还需要优化和清理,但它确实能让您针对感兴趣的词(本例中为 man)解析 AllenNLP 返回的依存树。希望这对其他人也有帮助。

对于这段文本,通过提供键/值对(键为 word,值为 man),你可以得到:


def get_entity_attributes(obj, key, value):
    """Find every dict in a nested structure whose ``key`` equals ``value``.

    The structure is walked depth-first in pre-order, so a matching node is
    reported before any matching node nested inside it.

    Args:
        obj: A nested combination of dicts and lists (e.g. decoded JSON).
        key: The dictionary key to inspect on every dict node.
        value: The value that ``node[key]`` must equal for the node to match.

    Returns:
        A list of the matching dict objects themselves (not copies), each
        still carrying its full subtree. Empty when nothing matches.
    """
    matches = []

    def extract(node):
        """Walk ``node`` and append matching dicts to ``matches``."""
        if isinstance(node, dict):
            # Test every dict, wherever it sits, so that a node stored
            # directly as a dict value (such as the tree root) can match too.
            if key in node and node[key] == value:
                matches.append(node)
            for child in node.values():
                if isinstance(child, (dict, list)):
                    extract(child)
        elif isinstance(node, list):
            # Non-container items (e.g. the strings inside an attribute
            # list) are ignored by the recursive call instead of being
            # indexed with ``key``.
            for item in node:
                extract(item)

    extract(obj)
    return matches

def parse_attributes(obj, key):
    """Return all scalar values stored under ``key`` beneath ``obj``.

    The traversal is depth-first: dict entries are visited in insertion
    order and list items in sequence.

    Args:
        obj: A nested combination of dicts and lists, such as one node of a
            dependency tree together with its children.
        key: The dictionary key whose values should be gathered.

    Returns:
        A list of the values found, in traversal order. A value that is
        itself a dict or list is searched recursively instead of being
        returned, even if it is stored under ``key``.
    """
    collected = []

    def extract(node):
        """Walk ``node`` and append matching values to ``collected``."""
        if isinstance(node, dict):
            for k, v in node.items():
                if isinstance(v, (dict, list)):
                    extract(v)
                elif k == key:
                    collected.append(v)
        elif isinstance(node, list):
            for item in node:
                extract(item)

    extract(obj)
    return collected

# Create list of word tokens after removing stopwords
def get_clean_list(entities, vocab=None):
    """Return the words that are neither stop words nor punctuation.

    Args:
        entities: An iterable of word strings.
        vocab: Optional mapping from a word to an object exposing boolean
            ``is_stop`` and ``is_punct`` attributes. When omitted, the
            module-level ``nlp.vocab`` is used; ``nlp`` is not defined in
            this snippet (presumably a loaded spaCy pipeline — confirm).

    Returns:
        A new list with the kept words in their original order.
    """
    filtered_sentence = []

    for word in entities:
        # Resolve the vocabulary lazily so that an empty input never needs
        # the global ``nlp`` object.
        lexeme = (nlp.vocab if vocab is None else vocab)[word]
        if not lexeme.is_stop and not lexeme.is_punct:
            filtered_sentence.append(word)
    return filtered_sentence


# Sentence to run through the dependency parser.
text = "When I was walking to the park yesterday, I saw a man wearing a blue shirt."
tree = predictor.predict(sentence=text)

# Look up every node whose "word" field equals the entity of interest.
key = "word"
entity = "man"
entities = get_entity_attributes(tree, key, entity)

for ent in entities:
    # Only report nodes whose nodeType is 'dep'; nodes without a nodeType
    # are skipped instead of raising KeyError.
    if ent.get('nodeType') != 'dep':
        continue
    # Gather the words under this node once and reuse the cleaned list for
    # both report lines.
    attributes = parse_attributes(ent, key)
    clean_attributes = get_clean_list(attributes)
    print(f'entity: {entity} Attributes: {clean_attributes}')
    # NOTE(review): this line prints the same list as the one above;
    # presumably it was meant to use a different source — confirm.
    print(f'entity: {entity} Action Attributes: {clean_attributes}')


entity: man Attributes: ['wearing', 'shirt', 'blue']