我必须在Python中创建一个脚本,该脚本允许我替换json文件中的字符串。该文件包含专利信息,例如:
{
"US-8163793-B2": {
"publication_date": "20120424",
"priority_date": "20090420",
"family_id": "42261969",
"country_code": "US",
"ipc_code": "C07D417/14",
"cpc_code": "C07D471/04",
"assignee_name": "Hoffman-La Roche Inc.",
"title": "Proline derivatives",
"abstract": "The invention relates to a compound of formula (I) wherein A, R 1 -R 6 are as defined in the description and in the claims. The compound of formula (I) can be used as a medicament."
}
但是,大约有15,000个条目。为了规范化此文档,在执行单词嵌入之前,我使用了在发现的术语中包含标签的软件。输出看起来像这样:
"Row_1" : {
"COMPANY": [
{
"hitCount": 1,
"sourceTitle": "",
"sourceID": "",
"docTitle": "",
"docID": "Row_1",
"hitID": "COMP642",
"name": "Roche",
"frag_vector_array": [
"16#Hoffman-La {!Roche!} Inc."
],
"totnosyns": 1,
"goodSynCount": 1,
"nonambigsyns": 1,
"score": 1,
"hit_loc_vector": [
16
],
"word_pos_array": [
2
],
"exact_string": "16#90-95",
"exact_array": [
{
"fls": [
16,
90,
95
]
}
],
"entityType": "COMPANY",
"realSynList": [
"Roche"
],
"dictSynList": [
"roche"
],
"kvp": {
"entityType": "COMPANY"
},
"rejected": false,
"entityMeta": {
"_ext_name": "Wikipedia",
"_ext_uri": "http://en.wikipedia.org/wiki/Roche",
"_termite_id": "TCP000392"
},
"section_vector": [
8
],
"dependencyMet": true,
"fuzzyMatches": 0,
"sectionMeta": {
"8": "assignee_name|"
}
}
]
}
此输出也是一个json文件,将用作字典。
我需要的是：每当专利文件中出现字典输出里的"name"值（例如"Roche"）这样的术语时，就用对应的"hitID"值（例如"COMP642"）来替换它。
我在Python方面非常陌生,因此任何帮助或阅读建议都会有很大帮助。
谢谢!
编辑
到目前为止尝试了什么
# NOTE(review): this is a fragment of a larger function — the enclosing `def`
# is not visible here (see the top-level `return` at the bottom), and the
# original indentation was lost when the code was pasted. It also relies on
# names defined elsewhere: `file`, `json`, `re`, `termite_dict_list`,
# `mesh_tree_nr_to_id_dict`, `mesh_id_to_name_dict`, `name_to_mesh_id_dict`,
# `mesh_id_to_tree_nr_dict`, `entity_type_encoder`, `normalize_abstract`,
# and `abstract_to_words`. Comments below describe intent per visible line.
# Read the whole input file as UTF-8 and parse it into Python objects.
with open(file, "rb") as datafile:
json_data = json.loads(datafile.read().decode("utf-8")) # type: object
# Iterate over each paper record in the parsed JSON.
# NOTE(review): if `json_data` is a dict (as the question's patent file is),
# this iterates over its KEYS (strings), and `paper["docID"]` below would
# fail — presumably the input here is a LIST of paper dicts; confirm.
for paper in json_data:
# Per-paper accumulators: one dict per tag occurrence, plus typed entity lists.
termite_dict = dict()
termite_dict_all_per_pmid = list()
pmid = int(paper["docID"])
abstract = paper["abstract"]
gene_list = list()
indication_mesh_list = list()
drug_list = list()
mirna_list = list()
company_list = list()
bioproc_list = list()
protype_list = list()
# Only papers that were annotated by the tagger carry "termiteTags".
if "termiteTags" in paper:
for termite_tag in paper["termiteTags"]:
type_entry = termite_tag["entityType"]
termite_dict = dict()
name = termite_tag["name"]
# "exact_string" holds comma-separated "sentence#from-to" spans,
# one per entry of "frag_vector_array" below.
exact_tag_locations = termite_tag["exact_string"].split(",")
relevant_tag_locations = list()
words_to_replace = list()
# process and store termite annotations
if type_entry == "GENE":
gene_list.append({"Gene": termite_tag["hitID"]})
elif type_entry == "INDICATION":
info = termite_tag["entityMeta"]
# Preferred path: resolve each MeSH tree number to an id and name.
if "mesh_tree" in info:
for e in list(filter(None, termite_tag["entityMeta"]["mesh_tree"].split(";"))):
try:
mesh_id = mesh_tree_nr_to_id_dict[e]
mesh_name = mesh_id_to_name_dict[mesh_id]
indication_mesh_list.append({"name": mesh_name, "id": mesh_id, "key": e})
except KeyError:
# Unknown tree number — skip this entry silently.
continue
# Fallback: extract the MeSH id from the external URI ("...term=<id>").
elif "_ext_uri" in info:
url = termite_tag["entityMeta"]["_ext_uri"]
try:
mesh_id = url.split("term=")[1]
mesh_name = mesh_id_to_name_dict[mesh_id]
mesh_tree_nr = name_to_mesh_id_dict[mesh_name]
indication_mesh_list.append({"name": mesh_name, "id": mesh_id, "key": mesh_tree_nr})
except KeyError:
print("Issue with Mesh key indication")
elif type_entry == "DRUG":
drug_list.append(termite_tag["name"])
elif type_entry == "MIRNA":
mirna_list.append(termite_tag["hitID"])
elif type_entry == "COMPANY":
company_list.append(termite_tag["name"])
elif type_entry == "BIOPROC":
bioproc_list.append(termite_tag["name"])
elif type_entry == "PROTYP":
protype_list.append(termite_tag["name"])
# store info for positions with words to normalize in abstract text
for hit_number, hit in enumerate(termite_tag["frag_vector_array"]):
hit = hit.replace("\n", " ")
# The matched surface form is wrapped either in {!...!} or {*...*}.
try:
match = re.match(r"^.*{!(.*)!}.*$", hit)
match_word = match.group(1)
except AttributeError:
try:
match = re.match(r"^.*{\*(.*)\*\}.*$", hit)
match_word = match.group(1)
except AttributeError:
# NOTE(review): if BOTH patterns fail, `match_word` may be
# unbound (first iteration) or stale (later iterations),
# so the comparison below can raise NameError or reuse the
# previous hit's word — should `continue` here instead.
print(hit)
# Only record hits whose surface form differs from the canonical name.
if match_word.lower() != name.lower():
exact_locus = exact_tag_locations[hit_number]
# A leading "-" marks an invalid/rejected locus — skip it.
if not exact_locus.startswith("-"):
# sentence 0 is paper title
# NOTE(review): startswith("0") would also exclude loci in
# sentences 01..09 if such zero-padded numbers occur — confirm.
if not exact_locus.startswith("0"):
relevant_tag_locations.append(exact_tag_locations[hit_number])
words_to_replace.append(match_word)
# One replacement record: canonical name, surface form,
# character span, and a numeric code for the entity type
# (used as a tie-breaker when sorting below).
termite_dict["norm"] = name
termite_dict["replace"] = match_word
fr, t = exact_locus.split("#")[1].split("-")
termite_dict["from"] = int(fr)
termite_dict["to"] = int(t)
termite_dict["len"] = int(t) - int(fr)
termite_dict["entityCode"] = entity_type_encoder[termite_tag["entityType"]]
termite_dict_all_per_pmid.append(termite_dict)
termite_dict = dict()
# abstract normalization and bag of words calculations
if len(termite_dict_all_per_pmid) > 0:
# Sort by start position, then longest span first, then entity code,
# so overlapping replacements are applied deterministically.
sorted_termite_dict_all_per_pmid = sorted(termite_dict_all_per_pmid,
key=lambda k: (k['from'], -k["len"], k["entityCode"]))
normalized_abstract = normalize_abstract(sorted_termite_dict_all_per_pmid, abstract)
termite_dict["Norm_Abstract"] = normalized_abstract
cleaned_abstract_text = abstract_to_words(normalized_abstract)
# Deduplicate tokens for the bag-of-words representation.
termite_dict["bag_of_words"] = list(set(cleaned_abstract_text))
termite_dict["docID"] = pmid
# Map semicolon-separated free-text keywords to MeSH headings when possible.
if "keywords" in paper:
keywords = [w.strip() for w in paper["keywords"].split(";")]
mesh_list = list()
for word in keywords:
# Title-case single lowercase words to match MeSH naming.
if len(word.split(" ")) == 1 and len(word) > 0 and word[0].islower():
word = word.title()
if word in name_to_mesh_id_dict:
mesh_id = name_to_mesh_id_dict[word]
try:
mesh_list.append([word, mesh_id, mesh_id_to_tree_nr_dict[mesh_id]])
except KeyError:
# No tree number known for this id — store an empty placeholder.
mesh_list.append([word, mesh_id, ""])
termite_dict["MeshHeadings"] = mesh_list
# Attach only the non-empty entity lists to keep the output document sparse.
if len(gene_list) > 0:
termite_dict["Genes"] = gene_list
if len(indication_mesh_list) > 0:
termite_dict["Indications"] = indication_mesh_list
if len(drug_list) > 0:
termite_dict["Drug"] = drug_list
if len(mirna_list) > 0:
termite_dict["MIRNA"] = mirna_list
if len(company_list) > 0:
termite_dict["Company"] = company_list
if len(bioproc_list) > 0:
termite_dict["Bioproc"] = bioproc_list
if len(protype_list) > 0:
termite_dict["Protyp"] = protype_list
# add meta list to be able to query for gene and indication co-occurrence
meta_list = list()
if "Indications" in termite_dict:
meta_list.extend([indi["key"] for indi in termite_dict["Indications"]])
if "Genes" in termite_dict:
meta_list.extend([gene["Gene"] for gene in termite_dict["Genes"]])
if len(meta_list) > 0:
termite_dict["all_genes_indications"] = meta_list
termite_dict_list.append(termite_dict)
# Return the accumulated per-paper dicts (list defined in the enclosing scope).
return termite_dict_list
答案 0（得分：0）
如果我正确理解了你的需求，我想你是想：当公司数据中的"name"出现在专利数据的"assignee_name"里时，就用公司数据中相应的"hitID"去替换专利数据中的"assignee_name"。
几个循环应该可以解决问题(尽管我确信有一种更优雅的方法)。当然,如果您需要更复杂的方法来确定公司数据中的"name"
是否真的与专利数据中的"assignee_name"
相匹配,则可以在此方法中添加一些正则表达式等,但是这应该使您指向正确的方向。
import json

# Sample patent records (the real file holds roughly 15,000 of these).
patents = json.loads("""{
"US-8163793-B2": {
"publication_date": "20120424",
"assignee_name": "Hoffman-La Roche Inc."
},
"US-1234567-A1": {
"publication_date": "20010101",
"assignee_name": "ABC Inc."
}
}""")

# Sample tagger output: each row carries one COMPANY hit with its id and name.
companies = json.loads("""{
"Row_1": {
"COMPANY": [
{
"hitID": "COMP642",
"name": "Roche"
}
]
},
"Row_2": {
"COMPANY": [
{
"hitID": "COMP123",
"name": "ABC"
}
]
}
}""")

# Walk every company hit; wherever its name is a substring of a patent's
# "assignee_name", overwrite that field with the hit ID. A fancier match
# (regex, fuzzy) can be dropped in at the `in` test if needed.
for row in companies.values():
    first_hit = row['COMPANY'][0]
    hit_id = first_hit['hitID']
    hit_name = first_hit['name']
    for record in patents.values():
        if hit_name in record['assignee_name']:
            record['assignee_name'] = hit_id

print(patents)
# Expected result (use json.dump to persist it to a file):
#
# {
#   'US-8163793-B2': {'publication_date': '20120424', 'assignee_name': 'COMP642'},
#   'US-1234567-A1': {'publication_date': '20010101', 'assignee_name': 'COMP123'}
# }