Python:比较两个json文件并替换相似的字符串

时间:2018-08-16 09:39:06

标签: python json

我必须在Python中创建一个脚本,该脚本允许我替换json文件中的字符串。该文件包含专利信息,例如:

{
  "US-8163793-B2": {
    "publication_date": "20120424",
    "priority_date": "20090420",
    "family_id": "42261969",
    "country_code": "US",
    "ipc_code": "C07D417/14",
    "cpc_code": "C07D471/04",
    "assignee_name": "Hoffman-La Roche Inc.",
    "title": "Proline derivatives",
    "abstract": "The invention relates to a compound of formula (I) wherein A, R 1 -R 6  are as defined in the description and in the claims. The compound of formula (I) can be used as a medicament."
  }

但是,大约有15,000个条目。为了规范化此文档,在执行单词嵌入之前,我使用了在发现的术语中包含标签的软件。输出看起来像这样:

 "Row_1" : {
  "COMPANY": [
    {
      "hitCount": 1,
      "sourceTitle": "",
      "sourceID": "",
      "docTitle": "",
      "docID": "Row_1",
      "hitID": "COMP642",
      "name": "Roche",
      "frag_vector_array": [
        "16#Hoffman-La {!Roche!} Inc."
      ],
      "totnosyns": 1,
      "goodSynCount": 1,
      "nonambigsyns": 1,
      "score": 1,
      "hit_loc_vector": [
        16
      ],
      "word_pos_array": [
        2
      ],
      "exact_string": "16#90-95",
      "exact_array": [
        {
          "fls": [
            16,
            90,
            95
          ]
        }
      ],
      "entityType": "COMPANY",
      "realSynList": [
        "Roche"
      ],
      "dictSynList": [
        "roche"
      ],
      "kvp": {
        "entityType": "COMPANY"
      },
      "rejected": false,
      "entityMeta": {
        "_ext_name": "Wikipedia",
        "_ext_uri": "http://en.wikipedia.org/wiki/Roche",
        "_termite_id": "TCP000392"
      },
      "section_vector": [
        8
      ],
      "dependencyMet": true,
      "fuzzyMatches": 0,
      "sectionMeta": {
        "8": "assignee_name|"
      }
    }
  ]
}

此输出也是一个json文件,将用作字典。

我需要的是:每次在专利文件中出现"name"(例如"Roche")这样的术语时,就用对应的"hitID"(例如"COMP642")来代替。

我在Python方面非常陌生,因此任何帮助或阅读建议都会有很大帮助。

谢谢!

编辑

到目前为止尝试了什么

 with open(file, "rb") as datafile:
        # NOTE(review): this is an excerpt — the enclosing function definition
        # and several names used below (termite_dict_list, mesh_*_dict,
        # entity_type_encoder, normalize_abstract, abstract_to_words) are
        # defined outside this fragment. The paste's indentation is also
        # inconsistent (`with` at col 1, `return` at col 4) — likely mangled.
        json_data = json.loads(datafile.read().decode("utf-8"))  # type: object

        # Assumes json_data is a list of paper dicts — iterating a top-level
        # dict here would yield keys, not dicts. TODO confirm the file shape.
        for paper in json_data:

            termite_dict = dict()
            termite_dict_all_per_pmid = list()  # one dict per occurrence to normalize
            pmid = int(paper["docID"])
            abstract = paper["abstract"]

            # Per-entity-type collectors for this paper's Termite annotations.
            gene_list = list()
            indication_mesh_list = list()
            drug_list = list()
            mirna_list = list()
            company_list = list()
            bioproc_list = list()
            protype_list = list()

            if "termiteTags" in paper:
                for termite_tag in paper["termiteTags"]:
                    type_entry = termite_tag["entityType"]

                    termite_dict = dict()
                    name = termite_tag["name"]
                    # "exact_string" is a comma-separated list of spans, one
                    # per entry of "frag_vector_array" (parsed further below
                    # as "<sentence>#<from>-<to>").
                    exact_tag_locations = termite_tag["exact_string"].split(",")
                    relevant_tag_locations = list()
                    words_to_replace = list()

                    # process and store termite annotations
                    if type_entry == "GENE":
                        gene_list.append({"Gene": termite_tag["hitID"]})
                    elif type_entry == "INDICATION":
                        info = termite_tag["entityMeta"]
                        if "mesh_tree" in info:
                            # A tag may carry several ';'-separated MeSH tree
                            # numbers; unresolvable ones are skipped.
                            for e in list(filter(None, termite_tag["entityMeta"]["mesh_tree"].split(";"))):
                                try:
                                    mesh_id = mesh_tree_nr_to_id_dict[e]
                                    mesh_name = mesh_id_to_name_dict[mesh_id]
                                    indication_mesh_list.append({"name": mesh_name, "id": mesh_id, "key": e})
                                except KeyError:
                                    continue
                        elif "_ext_uri" in info:
                            # Fallback: extract the MeSH id from the external
                            # URI's "term=" query parameter.
                            url = termite_tag["entityMeta"]["_ext_uri"]
                            try:
                                mesh_id = url.split("term=")[1]
                                mesh_name = mesh_id_to_name_dict[mesh_id]
                                mesh_tree_nr = name_to_mesh_id_dict[mesh_name]
                                indication_mesh_list.append({"name": mesh_name, "id": mesh_id, "key": mesh_tree_nr})
                            except KeyError:
                                print("Issue with Mesh key indication")
                    elif type_entry == "DRUG":
                        drug_list.append(termite_tag["name"])
                    elif type_entry == "MIRNA":
                        mirna_list.append(termite_tag["hitID"])
                    elif type_entry == "COMPANY":
                        company_list.append(termite_tag["name"])
                    elif type_entry == "BIOPROC":
                        bioproc_list.append(termite_tag["name"])
                    elif type_entry == "PROTYP":
                        protype_list.append(termite_tag["name"])

                    # store info for positions with words to normalize in abstract text
                    for hit_number, hit in enumerate(termite_tag["frag_vector_array"]):
                        hit = hit.replace("\n", " ")

                        # The matched surface form is wrapped in {!...!}
                        # (or, in a second format, {*...*}) in the fragment.
                        try:
                            match = re.match(r"^.*{!(.*)!}.*$", hit)
                            match_word = match.group(1)
                        except AttributeError:
                            try:
                                match = re.match(r"^.*{\*(.*)\*\}.*$", hit)
                                match_word = match.group(1)
                            except AttributeError:
                                # NOTE(review): bug — when neither pattern
                                # matches, match_word keeps its value from the
                                # previous hit (NameError on the first one);
                                # this branch should probably `continue`.
                                print(hit)

                        # Only record occurrences whose surface form differs
                        # from the canonical name (case-insensitive), i.e.
                        # text that actually needs replacing.
                        if match_word.lower() != name.lower():
                            exact_locus = exact_tag_locations[hit_number]
                            if not exact_locus.startswith("-"):
                                # sentence 0 is paper title
                                if not exact_locus.startswith("0"):
                                    relevant_tag_locations.append(exact_tag_locations[hit_number])
                                    words_to_replace.append(match_word)
                                    termite_dict["norm"] = name
                                    termite_dict["replace"] = match_word
                                    # Span encoded as "<sentence>#<from>-<to>".
                                    fr, t = exact_locus.split("#")[1].split("-")
                                    termite_dict["from"] = int(fr)
                                    termite_dict["to"] = int(t)
                                    termite_dict["len"] = int(t) - int(fr)
                                    termite_dict["entityCode"] = entity_type_encoder[termite_tag["entityType"]]
                                    termite_dict_all_per_pmid.append(termite_dict)
                                    termite_dict = dict()

            # abstract normalization and bag of words calculations
            if len(termite_dict_all_per_pmid) > 0:
                # Deterministic replacement order: by start position, then
                # longest span first, then entity code.
                sorted_termite_dict_all_per_pmid = sorted(termite_dict_all_per_pmid,
                                                          key=lambda k: (k['from'], -k["len"], k["entityCode"]))
                normalized_abstract = normalize_abstract(sorted_termite_dict_all_per_pmid, abstract)
                termite_dict["Norm_Abstract"] = normalized_abstract
                cleaned_abstract_text = abstract_to_words(normalized_abstract)
                termite_dict["bag_of_words"] = list(set(cleaned_abstract_text))

            termite_dict["docID"] = pmid

            if "keywords" in paper:
                keywords = [w.strip() for w in paper["keywords"].split(";")]
                mesh_list = list()

                for word in keywords:
                    # Title-case single all-lowercase words so they can match
                    # the MeSH name dictionary.
                    if len(word.split(" ")) == 1 and len(word) > 0 and word[0].islower():
                        word = word.title()
                    if word in name_to_mesh_id_dict:
                        mesh_id = name_to_mesh_id_dict[word]
                        try:
                            mesh_list.append([word, mesh_id, mesh_id_to_tree_nr_dict[mesh_id]])
                        except KeyError:
                            # No tree number known for this MeSH id.
                            mesh_list.append([word, mesh_id, ""])
                termite_dict["MeshHeadings"] = mesh_list

            # Attach only the non-empty per-type collections to the record.
            if len(gene_list) > 0:
                termite_dict["Genes"] = gene_list
            if len(indication_mesh_list) > 0:
                termite_dict["Indications"] = indication_mesh_list
            if len(drug_list) > 0:
                termite_dict["Drug"] = drug_list
            if len(mirna_list) > 0:
                termite_dict["MIRNA"] = mirna_list
            if len(company_list) > 0:
                termite_dict["Company"] = company_list
            if len(bioproc_list) > 0:
                termite_dict["Bioproc"] = bioproc_list
            if len(protype_list) > 0:
                termite_dict["Protyp"] = protype_list

            # add meta list to be able to query for gene and indication co-occurrence
            meta_list = list()
            if "Indications" in termite_dict:
                meta_list.extend([indi["key"] for indi in termite_dict["Indications"]])
            if "Genes" in termite_dict:
                meta_list.extend([gene["Gene"] for gene in termite_dict["Genes"]])
            if len(meta_list) > 0:
                termite_dict["all_genes_indications"] = meta_list

            termite_dict_list.append(termite_dict)
    return termite_dict_list

1 个答案:

答案 0 :(得分:0)

如果我正确理解了要求,我想您希望的是:当公司数据中的"name"出现在专利数据的"assignee_name"中的某个位置时,就用公司数据中相应的"hitID"替换专利数据里的"assignee_name"。

几个循环应该可以解决问题(尽管我确信有一种更优雅的方法)。当然,如果您需要更复杂的方法来确定公司数据中的"name"是否真的与专利数据中的"assignee_name"相匹配,则可以在此方法中添加一些正则表达式等,但是这应该使您指向正确的方向。

import json

# Patent data: patent number -> metadata, including the assignee's full name.
patents = json.loads("""{
        "US-8163793-B2": {
            "publication_date": "20120424",
            "assignee_name": "Hoffman-La Roche Inc."
        },
        "US-1234567-A1": {
            "publication_date": "20010101",
            "assignee_name": "ABC Inc."
        }
    }""")

# Company tagging output: row -> list of COMPANY hits, each with a canonical
# "hitID" and the matched "name".
companies = json.loads("""{
        "Row_1": {
            "COMPANY": [
                {
                    "hitID": "COMP642",
                    "name": "Roche"
                }
            ]
        },
        "Row_2": {
            "COMPANY": [
                {
                    "hitID": "COMP123",
                    "name": "ABC"
                }
            ]
        }
    }""")

# Loop through every company hit in every row. The original only looked at
# COMPANY[0]; real rows can carry several hits (hitCount > 1), so iterate the
# whole list. .get(...) also tolerates rows without a "COMPANY" key.
for row in companies.values():
    for hit in row.get('COMPANY', []):
        company_id = hit['hitID']
        company_name = hit['name']

        # Update patents where the company "name" is contained in the
        # patent's "assignee_name"; replace the name with the hit ID.
        for patent in patents.values():
            if company_name in patent['assignee_name']:
                patent['assignee_name'] = company_id

print(patents)

# OUTPUT (use json.dump to write to file if needed)
#
# {
#     'US-1234567-A1': {'assignee_name': 'COMP123', 'publication_date': '20010101'},
#     'US-8163793-B2': {'assignee_name': 'COMP642', 'publication_date': '20120424'}
# }