将python dict转换为三元组列表时出现问题?

时间:2016-10-18 14:56:07

标签: python json parsing pandas dictionary

我有以下python dict:

{'token_list': [{'quote_level': '0', 'affected_by_negation': 'no', 'token_list': [{'quote_level': '0', 'affected_by_negation': 'no', 'token_list': [{'id': '21', 'analysis_list': [{'tag': 'GNUS3S--', 'lemma': 'Robert Downey Jr', 'original_form': 'Robert Downey Jr'}], 'form': 'Robert Downey Jr', 'type': 'phrase', 'syntactic_tree_relation_list': [{'type': 'isSubject', 'id': '17'}], 'separation': '_', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}, 'quote_level': '0', 'token_list': [{'id': '16', 'analysis_list': [{'tag': 'NPUU-N-', 'sense_id_list': [{'sense_id': '__12123288058840445720'}], 'lemma': 'Robert Downey Jr', 'original_form': 'Robert Downey Jr'}], 'sense_list': [{'info': 'sementity/class=instance@type=Top>Person>FullName@confidence=unknown', 'form': 'Robert Downey Jr', 'id': '__12123288058840445720'}], 'form': 'Robert Downey Jr', 'type': 'multiword', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}, 'separation': '_', 'quote_level': '0', 'topic_list': {'entity_list': [{'form': 'Robert Downey Jr', 'sementity': {'type': 'Top>Person>FullName', 'confidence': 'unknown', 'class': 'instance'}, 'id': '__12123288058840445720'}]}, 'head': '15', 'inip': '0', 'affected_by_negation': 'no', 'endp': '15'}], 'head': '16', 'inip': '0', 'affected_by_negation': 'no', 'endp': '15'}, {'id': '17', 'analysis_list': [{'tag': 'VI-S3PPA-N-N9', 'lemma': 'top', 'original_form': 'has topped'}], 'form': 'has topped', 'type': 'multiword', 'syntactic_tree_relation_list': [{'type': 'iof_isSubject', 'id': '21'}, {'type': 'iof_isDirectObject', 'id': '24'}], 'separation': '1', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}, 'quote_level': '0', 'head': '4', 'inip': '17', 'affected_by_negation': 'no', 'endp': '26'}, {'id': '24', 'analysis_list': [{'tag': 'GN-S3D--', 'lemma': 'list', 'original_form': "Forbes magazine's annual list"}], 'form': "Forbes magazine's annual list", 'type': 
'phrase', 'syntactic_tree_relation_list': [{'type': 'isDirectObject', 'id': '17'}], 'separation': '1', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}, 'quote_level': '0', 'token_list': [{'id': '22', 'analysis_list': [{'tag': 'GN-S3---', 'lemma': 'magazine', 'original_form': 'Forbes magazine'}], 'form': 'Forbes magazine', 'type': 'phrase', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}, 'separation': '1', 'quote_level': '0', 'token_list': [{'quote_level': '0', 'topic_list': {'entity_list': [{'form': 'Forbes', 'semld_list': ['sumo:LastName'], 'sementity': {'type': 'Top>Person>LastName', 'fiction': 'nonfiction', 'id': 'ODENTITY_LAST_NAME', 'class': 'instance'}, 'id': '4a3369b337'}, {'form': 'Forbes', 'semld_list': ['sumo:River'], 'sementity': {'type': 'Top>Location>GeographicalEntity>WaterForm>River', 'fiction': 'nonfiction', 'id': 'ODENTITY_RIVER', 'class': 'instance'}, 'id': '9752b8b5ee'}, {'sementity': {'type': 'Top>Product>CulturalProduct>Printing>Magazine', 'fiction': 'nonfiction', 'id': 'ODENTITY_MAGAZINE', 'class': 'instance'}, 'semgeo_list': [{'country': {'form': 'United States', 'standard_list': [{'value': 'US', 'id': 'ISO3166-1-a2'}, {'value': 'USA', 'id': 'ISO3166-1-a3'}], 'id': 'beac1b545b'}, 'continent': {'form': 'AmĂŠrica', 'id': '33fc13e6dd'}}], 'semtheme_list': [{'type': 'Top>SocialSciences>Economy', 'id': 'ODTHEME_ECONOMY'}], 'semld_list': ['sumo:Magazine'], 'form': 'Forbes', 'id': 'db0f9829ff'}]}, 'analysis_list': [{'tag': 'NP-S-N-', 'sense_id_list': [{'sense_id': 'db0f9829ff'}], 'lemma': 'Forbes', 'original_form': 'Forbes'}, {'tag': 'NP-S-N-', 'sense_id_list': [{'sense_id': '9752b8b5ee'}], 'lemma': 'Forbes', 'original_form': 'Forbes'}, {'tag': 'NPUS-N-', 'sense_id_list': [{'sense_id': '4a3369b337'}], 'lemma': 'Forbes', 'original_form': 'Forbes'}], 'separation': '1', 'sense_list': [{'info': 
'sementity/class=instance@fiction=nonfiction@id=ODENTITY_LAST_NAME@type=Top>Person>LastName\tsemld_list=sumo:LastName', 'form': 'Forbes', 'id': '4a3369b337'}, {'info': 'sementity/class=instance@fiction=nonfiction@id=ODENTITY_RIVER@type=Top>Location>GeographicalEntity>WaterForm>River\tsemld_list=sumo:River', 'form': 'Forbes', 'id': '9752b8b5ee'}, {'info': 'sementity/class=instance@fiction=nonfiction@id=ODENTITY_MAGAZINE@type=Top>Product>CulturalProduct>Printing>Magazine\tsemgeo_list/continent=AmĂŠrica#id:33fc13e6dd@country=United States#id:beac1b545b#ISO3166-1-a2:US#ISO3166-1-a3:USA\tsemld_list=sumo:Magazine\tsemtheme_list/id=ODTHEME_ECONOMY@type=Top>SocialSciences>Economy', 'form': 'Forbes', 'id': 'db0f9829ff'}], 'inip': '28', 'form': 'Forbes', 'affected_by_negation': 'no', 'endp': '33', 'id': '6', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}}, {'quote_level': '0', 'analysis_list': [{'tag': 'NC-S-N5', 'sense_id_list': [{'sense_id': 'a0a1a5401f'}], 'lemma': 'magazine', 'original_form': 'magazine'}], 'separation': '1', 'sense_list': [{'info': 'sementity/class=class@fiction=nonfiction@id=ODENTITY_MAGAZINE@type=Top>Product>CulturalProduct>Printing>Magazine\tsemld_list=sumo:Magazine', 'form': 'magazine', 'id': 'a0a1a5401f'}], 'inip': '35', 'form': 'magazine', 'affected_by_negation': 'no', 'endp': '42', 'id': '7', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}}], 'head': '7', 'inip': '28', 'affected_by_negation': 'no', 'endp': '42'}, {'quote_level': '0', 'analysis_list': [{'tag': 'WN-', 'lemma': "'s", 'original_form': "'s"}], 'separation': 'A', 'inip': '43', 'form': "'s", 'affected_by_negation': 'no', 'endp': '44', 'id': '14', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}}, {'id': '23', 'analysis_list': [{'tag': 'GN-S3---', 'lemma': 'list', 'original_form': 'annual list'}], 'form': 'annual list', 'type': 'phrase', 'style': {'isBold': 'no', 'isTitle': 'no', 
'isItalics': 'no', 'isUnderlined': 'no'}, 'separation': '1', 'quote_level': '0', 'token_list': [{'quote_level': '0', 'analysis_list': [{'tag': 'AP-N5', 'lemma': 'annual', 'original_form': 'annual'}], 'separation': '1', 'inip': '46', 'form': 'annual', 'affected_by_negation': 'no', 'endp': '51', 'id': '10', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}}, {'quote_level': '0', 'analysis_list': [{'tag': 'NC-S-N5', 'lemma': 'list', 'original_form': 'list'}], 'separation': '1', 'inip': '53', 'form': 'list', 'affected_by_negation': 'no', 'endp': '56', 'id': '11', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}}], 'head': '11', 'inip': '46', 'affected_by_negation': 'no', 'endp': '56'}], 'head': '23', 'inip': '28', 'affected_by_negation': 'no', 'endp': '56'}], 'separation': '_', 'analysis_list': [{'tag': 'Z-----------', 'lemma': '*', 'original_form': "Robert Downey Jr has topped Forbes magazine's annual list"}], 'inip': '0', 'form': "Robert Downey Jr has topped Forbes magazine's annual list", 'type': 'phrase', 'endp': '56', 'id': '25', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}}, {'quote_level': '0', 'analysis_list': [{'tag': '1D--', 'lemma': '.', 'original_form': '.'}], 'separation': 'A', 'inip': '57', 'form': '.', 'affected_by_negation': 'no', 'endp': '57', 'id': '12', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}}], 'separation': 'A', 'inip': '0', 'endp': '57', 'type': 'sentence', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}, 'id': '18'}], 'status': {'credits': '1', 'remaining_credits': '39848', 'code': '0', 'msg': 'OK'}}

如何把所有 analysis_list 键及其对应的值提取到一个新的元组中?例如:

((NPUU-N-, Robert Downey Jr, Robert Downey Jr), (NPUU-N-, Robert Downey Jr, Robert Downey Jr), (VI-S3PPA-N-N9, top, has topped), (GN-S3D--, list, Forbes magazine's annual list), (GN-S3---, magazine, Forbes magazine), (NP-S-N-, Forbes, Forbes), ..., (1D--, ., .))

我用pandas尝试了以下内容:

输入:

# Asker's attempt: normalize the top-level 'token_list' entries into `df`.
# NOTE(review): json_normalize is presumably pandas' json_normalize; its import is not shown here.
df = json_normalize(data['token_list'])
# Convert the 'token_list' column to a dict. This rebinds `data`, so the original dict is no longer reachable by that name.
data = df['token_list'].to_dict()
# Keep only the values view; the nested token dicts inside it are still unflattened.
data=data.values()
print(data)

输出:

dict_values([[{'quote_level': '0', 'analysis_list': [{'tag': 'Z-----------', 'lemma': '*', 'original_form': "Robert Downey Jr has topped Forbes magazine's annual list"}], 'token_list': [{'id': '21', 'analysis_list': [{'tag': 'GNUS3S--', 'lemma': 'Robert Downey Jr', 'original_form': 'Robert Downey Jr'}], 'form': 'Robert Downey Jr', 'type': 'phrase', 'syntactic_tree_relation_list': [{'type': 'isSubject', 'id': '17'}], 'separation': '_', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}, 'quote_level': '0', 'token_list': [{'id': '16', 'analysis_list': [{'tag': 'NPUU-N-', 'sense_id_list': [{'sense_id': '__12123288058840445720'}], 'lemma': 'Robert Downey Jr', 'original_form': 'Robert Downey Jr'}], 'sense_list': [{'info': 'sementity/class=instance@type=Top>Person>FullName@confidence=unknown', 'form': 'Robert Downey Jr', 'id': '__12123288058840445720'}], 'form': 'Robert Downey Jr', 'type': 'multiword', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}, 'separation': '_', 'quote_level': '0', 'topic_list': {'entity_list': [{'form': 'Robert Downey Jr', 'sementity': {'type': 'Top>Person>FullName', 'confidence': 'unknown', 'class': 'instance'}, 'id': '__12123288058840445720'}]}, 'head': '15', 'inip': '0', 'affected_by_negation': 'no', 'endp': '15'}], 'head': '16', 'inip': '0', 'affected_by_negation': 'no', 'endp': '15'}, {'id': '17', 'analysis_list': [{'tag': 'VI-S3PPA-N-N9', 'lemma': 'top', 'original_form': 'has topped'}], 'form': 'has topped', 'type': 'multiword', 'syntactic_tree_relation_list': [{'type': 'iof_isSubject', 'id': '21'}, {'type': 'iof_isDirectObject', 'id': '24'}], 'separation': '1', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}, 'quote_level': '0', 'head': '4', 'inip': '17', 'affected_by_negation': 'no', 'endp': '26'}, {'id': '24', 'analysis_list': [{'tag': 'GN-S3D--', 'lemma': 'list', 'original_form': "Forbes magazine's annual list"}], 'form': "Forbes 
magazine's annual list", 'type': 'phrase', 'syntactic_tree_relation_list': [{'type': 'isDirectObject', 'id': '17'}], 'separation': '1', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}, 'quote_level': '0', 'token_list': [{'id': '22', 'analysis_list': [{'tag': 'GN-S3---', 'lemma': 'magazine', 'original_form': 'Forbes magazine'}], 'form': 'Forbes magazine', 'type': 'phrase', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}, 'separation': '1', 'quote_level': '0', 'token_list': [{'quote_level': '0', 'topic_list': {'entity_list': [{'form': 'Forbes', 'semld_list': ['sumo:LastName'], 'sementity': {'type': 'Top>Person>LastName', 'fiction': 'nonfiction', 'id': 'ODENTITY_LAST_NAME', 'class': 'instance'}, 'id': '4a3369b337'}, {'form': 'Forbes', 'semld_list': ['sumo:River'], 'sementity': {'type': 'Top>Location>GeographicalEntity>WaterForm>River', 'fiction': 'nonfiction', 'id': 'ODENTITY_RIVER', 'class': 'instance'}, 'id': '9752b8b5ee'}, {'sementity': {'type': 'Top>Product>CulturalProduct>Printing>Magazine', 'fiction': 'nonfiction', 'id': 'ODENTITY_MAGAZINE', 'class': 'instance'}, 'id': 'db0f9829ff', 'semgeo_list': [{'country': {'form': 'United States', 'standard_list': [{'value': 'US', 'id': 'ISO3166-1-a2'}, {'value': 'USA', 'id': 'ISO3166-1-a3'}], 'id': 'beac1b545b'}, 'continent': {'form': 'AmĂŠrica', 'id': '33fc13e6dd'}}], 'semld_list': ['sumo:Magazine'], 'semtheme_list': [{'type': 'Top>SocialSciences>Economy', 'id': 'ODTHEME_ECONOMY'}], 'form': 'Forbes'}]}, 'analysis_list': [{'tag': 'NP-S-N-', 'sense_id_list': [{'sense_id': 'db0f9829ff'}], 'lemma': 'Forbes', 'original_form': 'Forbes'}, {'tag': 'NP-S-N-', 'sense_id_list': [{'sense_id': '9752b8b5ee'}], 'lemma': 'Forbes', 'original_form': 'Forbes'}, {'tag': 'NPUS-N-', 'sense_id_list': [{'sense_id': '4a3369b337'}], 'lemma': 'Forbes', 'original_form': 'Forbes'}], 'id': '6', 'sense_list': [{'info': 
'sementity/class=instance@fiction=nonfiction@id=ODENTITY_LAST_NAME@type=Top>Person>LastName\tsemld_list=sumo:LastName', 'form': 'Forbes', 'id': '4a3369b337'}, {'info': 'sementity/class=instance@fiction=nonfiction@id=ODENTITY_RIVER@type=Top>Location>GeographicalEntity>WaterForm>River\tsemld_list=sumo:River', 'form': 'Forbes', 'id': '9752b8b5ee'}, {'info': 'sementity/class=instance@fiction=nonfiction@id=ODENTITY_MAGAZINE@type=Top>Product>CulturalProduct>Printing>Magazine\tsemgeo_list/continent=AmĂŠrica#id:33fc13e6dd@country=United States#id:beac1b545b#ISO3166-1-a2:US#ISO3166-1-a3:USA\tsemld_list=sumo:Magazine\tsemtheme_list/id=ODTHEME_ECONOMY@type=Top>SocialSciences>Economy', 'form': 'Forbes', 'id': 'db0f9829ff'}], 'inip': '28', 'form': 'Forbes', 'affected_by_negation': 'no', 'endp': '33', 'separation': '1', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}}, {'quote_level': '0', 'analysis_list': [{'tag': 'NC-S-N5', 'sense_id_list': [{'sense_id': 'a0a1a5401f'}], 'lemma': 'magazine', 'original_form': 'magazine'}], 'id': '7', 'sense_list': [{'info': 'sementity/class=class@fiction=nonfiction@id=ODENTITY_MAGAZINE@type=Top>Product>CulturalProduct>Printing>Magazine\tsemld_list=sumo:Magazine', 'form': 'magazine', 'id': 'a0a1a5401f'}], 'inip': '35', 'form': 'magazine', 'affected_by_negation': 'no', 'endp': '42', 'separation': '1', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}}], 'head': '7', 'inip': '28', 'affected_by_negation': 'no', 'endp': '42'}, {'quote_level': '0', 'analysis_list': [{'tag': 'WN-', 'lemma': "'s", 'original_form': "'s"}], 'id': '14', 'inip': '43', 'form': "'s", 'affected_by_negation': 'no', 'endp': '44', 'separation': 'A', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}}, {'id': '23', 'analysis_list': [{'tag': 'GN-S3---', 'lemma': 'list', 'original_form': 'annual list'}], 'form': 'annual list', 'type': 'phrase', 'style': {'isBold': 'no', 'isTitle': 
'no', 'isItalics': 'no', 'isUnderlined': 'no'}, 'separation': '1', 'quote_level': '0', 'token_list': [{'quote_level': '0', 'analysis_list': [{'tag': 'AP-N5', 'lemma': 'annual', 'original_form': 'annual'}], 'id': '10', 'inip': '46', 'form': 'annual', 'affected_by_negation': 'no', 'endp': '51', 'separation': '1', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}}, {'quote_level': '0', 'analysis_list': [{'tag': 'NC-S-N5', 'lemma': 'list', 'original_form': 'list'}], 'id': '11', 'inip': '53', 'form': 'list', 'affected_by_negation': 'no', 'endp': '56', 'separation': '1', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}}], 'head': '11', 'inip': '46', 'affected_by_negation': 'no', 'endp': '56'}], 'head': '23', 'inip': '28', 'affected_by_negation': 'no', 'endp': '56'}], 'id': '25', 'type': 'phrase', 'inip': '0', 'form': "Robert Downey Jr has topped Forbes magazine's annual list", 'affected_by_negation': 'no', 'endp': '56', 'separation': '_', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}}, {'quote_level': '0', 'analysis_list': [{'tag': '1D--', 'lemma': '.', 'original_form': '.'}], 'id': '12', 'inip': '57', 'form': '.', 'affected_by_negation': 'no', 'endp': '57', 'separation': 'A', 'style': {'isBold': 'no', 'isTitle': 'no', 'isItalics': 'no', 'isUnderlined': 'no'}}]])

另外我试过了:

# Asker's second attempt: collect the 'analysis_list' value of each element of `data` that contains that key.
# Only the immediate elements of `data` are examined; nothing nested inside them (e.g. inner 'token_list' entries) is visited.
myvalues = [i['analysis_list'] for i in data if 'analysis_list' in i]
print(myvalues)

但是,这么多的键和值让我感到困惑。这是从该字典生成元组的推荐方法吗?我正在考虑使用 pandas 或其他替代方法……

1 个答案:

答案 0 :(得分:1)

您可以使用此代码:

def gettuples(data, level=0):
    """Recursively yield every ``analysis_list`` entry found in *data*.

    Walks arbitrarily nested dicts and lists. Whenever a dict has an
    ``'analysis_list'`` key, each element of that list is yielded (in order),
    and the walk then continues into all of the dict's values so that nested
    ``token_list`` structures are visited as well.

    Args:
        data: A dict, a list, or any other value. Values that are neither a
            dict nor a list are ignored.
        level: Unused; kept only so existing callers that pass it keep working.

    Yields:
        The elements of each ``analysis_list`` encountered, parents before
        their nested children.
    """
    if isinstance(data, dict):
        # Yield ALL analyses of this token, not just the first: a token can
        # carry several (e.g. one per candidate sense). `or ()` makes a
        # missing, empty or None list yield nothing instead of raising.
        yield from data.get('analysis_list') or ()
        for val in data.values():
            yield from gettuples(val)
    elif isinstance(data, list):
        for val in data:
            yield from gettuples(val)

# Build one (tag, lemma, original_form) triple per analysis entry, matching
# the tuple layout the question asks for.
result = [(obj['tag'], obj['lemma'], obj['original_form']) for obj in gettuples(data)]
print(result)

可以在 repl.it 上查看它的运行效果。