我有一个包含 448 条记录的 .json 文件。文件的格式如下(这里以其中 2 条记录作为示例数据):
{
"_id" : ObjectId("5a5faa4f8b91277fde0212b1"),
"geo_accession" : [
"GSE86910"
],
"title" : [
"RNA-seq transcriptonal profiling in human primary adult erythroid progenitor celression"
],
"summary" : [
"The developing erythroid cerythroid cells, and performed RNA-seq transcriptional profiling analysis."
],
"num_samples" : 6,
"overall_design" : [
"Human primary adult erythroblasts were generated ex vivo from extracted for RNA-seq analysis."
],
"samples" : {
"GSM2310252" : {
"title" : "RNAseq_A5-ProE-shNT-rep1",
"treatment_protocol_ch1" : "NA",
"source_name_ch1" : "Human primary adult proerythroblasts (ProEs)",
"organism_ch1" : "Homo sapiens",
"library_strategy" : "RNA-Seq"
},
"GSM2310253" : {
"title" : "RNAseq_A5-ProE-shNT-rep2",
"treatment_protocol_ch1" : "NA",
"source_name_ch1" : "Human primary adult proerythroblasts (ProEs)",
"organism_ch1" : "Homo sapiens",
"library_strategy" : "RNA-Seq"
},
"GSM2310254" : {
"title" : "RNAseq_A5-ProE-shTFAM-rep1",
"treatment_protocol_ch1" : "NA",
"source_name_ch1" : "Human primary adult proerythroblasts (ProEs)",
"organism_ch1" : "Homo sapiens",
"library_strategy" : "RNA-Seq"
},
"GSM2310255" : {
"title" : "RNAseq_A5-ProE-shTFAM-rep2",
"treatment_protocol_ch1" : "NA",
"source_name_ch1" : "Human primary adult proerythroblasts (ProEs)",
"organism_ch1" : "Homo sapiens",
"library_strategy" : "RNA-Seq"
},
"GSM2310256" : {
"title" : "RNAseq_A5-ProE-shPHB2-rep1",
"treatment_protocol_ch1" : "NA",
"source_name_ch1" : "Human primary adult proerythroblasts (ProEs)",
"organism_ch1" : "Homo sapiens",
"library_strategy" : "RNA-Seq"
},
"GSM2310257" : {
"title" : "RNAseq_A5-ProE-shPHB2-rep2",
"treatment_protocol_ch1" : "NA",
"source_name_ch1" : "Human primary adult proerythroblasts (ProEs)",
"organism_ch1" : "Homo sapiens",
"library_strategy" : "RNA-Seq"
}
},
"geo_signal" : {}
}
{
"_id" : ObjectId("5a5faa4f8b91277fde0212b6"),
"geo_accession" : [
"GSE83592"
],
"title" : [
"JQ1 +/- Vemurafenib in BRAF mutant melanoma (A375)"
],
"summary" : [
"The apoptotic genes significantly down-regulated."
],
"num_samples" : 2,
"overall_design" : [
"dsf"
],
"samples" : {
"GSM2210563" : {
"title" : "16L",
"source_name_ch1" : "A375 cell line",
"organism_ch1" : "Homo sapiens",
"library_strategy" : "RNA-Seq"
},
"GSM2210564" : {
"title" : "16R",
"source_name_ch1" : "A375 cell line",
"organism_ch1" : "Homo sapiens",
"library_strategy" : "RNA-Seq"
},
},
"geo_signal" : {}
}
我本身可以接受这种格式,但 json.load 显然无法解析这种格式,并报出了如下错误:
raise ValueError(errmsg("Extra data", s, end, len(s)))
ValueError: Extra data: line 2 column 1
是否可以把所有这些记录转换成 JSON 格式的记录列表,如下所示:
[
{
"_id" : ObjectId("5a5faa4f8b91277fde0212b1"),
"geo_accession" : [
"GSE86910"
],
"title" : [
"RNA-seq transcriptonal profiling in human primary adult erythroid progenitor celression"
],
"summary" : [
"The developing erythroid cerythroid cells, and performed RNA-seq transcriptional profiling analysis."
],
"num_samples" : 6,
"overall_design" : [
"Human primary adult erythroblasts were generated ex vivo from extracted for RNA-seq analysis."
],
"samples" : {
"GSM2310252" : {
"title" : "RNAseq_A5-ProE-shNT-rep1",
"treatment_protocol_ch1" : "NA",
"source_name_ch1" : "Human primary adult proerythroblasts (ProEs)",
"organism_ch1" : "Homo sapiens",
"library_strategy" : "RNA-Seq"
},
"GSM2310253" : {
"title" : "RNAseq_A5-ProE-shNT-rep2",
"treatment_protocol_ch1" : "NA",
"source_name_ch1" : "Human primary adult proerythroblasts (ProEs)",
"organism_ch1" : "Homo sapiens",
"library_strategy" : "RNA-Seq"
},
"GSM2310254" : {
"title" : "RNAseq_A5-ProE-shTFAM-rep1",
"treatment_protocol_ch1" : "NA",
"source_name_ch1" : "Human primary adult proerythroblasts (ProEs)",
"organism_ch1" : "Homo sapiens",
"library_strategy" : "RNA-Seq"
},
"GSM2310255" : {
"title" : "RNAseq_A5-ProE-shTFAM-rep2",
"treatment_protocol_ch1" : "NA",
"source_name_ch1" : "Human primary adult proerythroblasts (ProEs)",
"organism_ch1" : "Homo sapiens",
"library_strategy" : "RNA-Seq"
},
"GSM2310256" : {
"title" : "RNAseq_A5-ProE-shPHB2-rep1",
"treatment_protocol_ch1" : "NA",
"source_name_ch1" : "Human primary adult proerythroblasts (ProEs)",
"organism_ch1" : "Homo sapiens",
"library_strategy" : "RNA-Seq"
},
"GSM2310257" : {
"title" : "RNAseq_A5-ProE-shPHB2-rep2",
"treatment_protocol_ch1" : "NA",
"source_name_ch1" : "Human primary adult proerythroblasts (ProEs)",
"organism_ch1" : "Homo sapiens",
"library_strategy" : "RNA-Seq"
}
},
"geo_signal" : {}
}
{
"_id" : ObjectId("5a5faa4f8b91277fde0212b6"),
"geo_accession" : [
"GSE83592"
],
"title" : [
"JQ1 +/- Vemurafenib in BRAF mutant melanoma (A375)"
],
"summary" : [
"The apoptotic genes significantly down-regulated."
],
"num_samples" : 2,
"overall_design" : [
"dsf"
],
"samples" : {
"GSM2210563" : {
"title" : "16L",
"source_name_ch1" : "A375 cell line",
"organism_ch1" : "Homo sapiens",
"library_strategy" : "RNA-Seq"
},
"GSM2210564" : {
"title" : "16R",
"source_name_ch1" : "A375 cell line",
"organism_ch1" : "Homo sapiens",
"library_strategy" : "RNA-Seq"
},
},
"geo_signal" : {}
}
]
最好使用python。感谢。
答案 0 :(得分:0)
先用正则表达式把文件内容处理成有效的 JSON,然后再进行解析。
import re
import json
def replaceFun(match):
    """Return the text captured inside ``ObjectId(...)``.

    Intended as the replacement callback for ``re.sub``. The captured group
    still carries its surrounding double quotes, so the Mongo shell call
    ``ObjectId("abc")`` is rewritten to the plain JSON string ``"abc"``.

    Args:
        match: A regex match object whose group 1 is the quoted id.

    Returns:
        The content of capture group 1.
    """
    return match.group(1)


def parse_records(text):
    """Parse back-to-back Mongo shell documents into a list of dicts.

    The input is a sequence of ``{...}`` objects written one after another
    (no enclosing list, no separating commas), which ``json.loads`` rejects
    with "Extra data". The text is first cleaned up so that each object is
    strict JSON, then the objects are decoded one at a time.

    Args:
        text: The raw file content.

    Returns:
        A list with one dict per document; empty if ``text`` holds none.

    Raises:
        ValueError: If a document is still not valid JSON after the cleanup.
    """
    # Strict JSON forbids a trailing comma before a closing brace.
    text = re.sub(r",\s*}", "\n}", text)
    # ObjectId("...") is Mongo shell syntax, not JSON; keep only the quoted
    # id. The pattern stops at the closing quote so it cannot run on to a
    # later ")" on the same line.
    text = re.sub(r'ObjectId\(("[^"]*")\)', replaceFun, text)

    decoder = json.JSONDecoder()
    records = []
    pos = 0
    end = len(text)
    while True:
        # raw_decode does not skip leading whitespace itself.
        while pos < end and text[pos].isspace():
            pos += 1
        if pos >= end:
            break
        # Decode exactly one object and continue from where it ended.
        record, pos = decoder.raw_decode(text, pos)
        records.append(record)
    return records


if __name__ == "__main__":
    with open("file/test.json", "r") as f:
        records = parse_records(f.read())
    for v in records:
        print(v)