I'm writing a Python script to recursively traverse each file and create a JSON object from files that look like this:
target_id length eff_length est_counts tpm
ENST00000619216.1 68 33.8839 2.83333 4.64528
ENST00000473358.1 712 428.88 0 0
ENST00000469289.1 535 306.32 0 0
ENST00000607096.1 138 69.943 0 0
ENST00000417324.1 1187 844.464 0 0
ENST00000461467.1 590 342.551 3.44007 0.557892
ENST00000335137.3 918 588.421 0 0
ENST00000466430.5 2748 2405.46 75.1098 1.73463
ENST00000495576.1 1319 976.464 11.1999 0.637186
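For reference, the script below only uses the first and last columns of each row (target_id and tpm); splitting one of the lines above on tabs shows which indexes those end up at:

row = "ENST00000619216.1\t68\t33.8839\t2.83333\t4.64528"
fields = row.split('\t')
print(fields[0], fields[4])   # -> ENST00000619216.1 4.64528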
Here is my script:
import glob
import os
import json

# define datasets
# Dataset name
datasets = ['pnoc']

# open file in append mode
f = open('mydict', 'a')

# define a new object
data = {}

# traverse through folders of datasets
for d in datasets:
    samples = glob.glob(d + "/data" + "/*.tsv")
    for s in samples:
        # get the SampleName without extension and path
        fname = os.path.splitext(os.path.basename(s))[0]
        # split the basename to get sample name and norm method
        sname, keyword, norm = fname.partition('.')
        # determine Normalization method based on filename
        if norm == "abundance":
            norm = "kallisto"
        elif norm == "rsem_genes.results":
            norm = "rsem_genes"
        else:
            norm = "rsem_isoforms"
        # read each file
        with open(s) as samp:
            next(samp)
            for line in samp:
                sp = line.split('\t')
                data.setdefault(sname, []).append({"ID": sp[0], "Expression": sp[4]})

json.dump(data, f)
f.close()
I want the JSON object to be structured along the following lines:
# 20000 Sample names, 3 Normalization methods and 60000 IDs in each file.
DatasetName1 {
  SampleName1 {
    Type {
      Normalization1 {
        { ID1: value, Expression: value },
        { ID2: value, Expression: value },
        ...
        { ID60000: value, Expression: value }
      },
      Normalization2 {
        { ID1: value, Expression: value },
        { ID2: value, Expression: value },
        ...
        { ID60000: value, Expression: value }
      },
      Normalization3 {
        { ID1: value, Expression: value },
        { ID2: value, Expression: value },
        ...
        { ID60000: value, Expression: value }
      }
    }
  },
  SampleName2 {
    Type {
      Normalization1 {
        { ID1: value, Expression: value },
        { ID2: value, Expression: value },
        ...
        { ID60000: value, Expression: value }
      },
      Normalization2 {
        { ID1: value, Expression: value },
        { ID2: value, Expression: value },
        ...
        { ID60000: value, Expression: value }
      },
      Normalization3 {
        { ID1: value, Expression: value },
        { ID2: value, Expression: value },
        ...
        { ID60000: value, Expression: value }
      }
    }
  },
  ...
  SampleName20000 {
    Type {
      Normalization1 {
        { ID1: value, Expression: value },
        { ID2: value, Expression: value },
        ...
        { ID60000: value, Expression: value }
      },
      Normalization2 {
        { ID1: value, Expression: value },
        { ID2: value, Expression: value },
        ...
        { ID60000: value, Expression: value }
      },
      Normalization3 {
        { ID1: value, Expression: value },
        { ID2: value, Expression: value },
        ...
        { ID60000: value, Expression: value }
      }
    }
  }
}
So my question is: when converting the text files to JSON, how do I set up these nesting levels in the JSON output?
Thanks!
Answer 0 (score: 0)
First, you should use a defaultdict instead of setting the default value over and over again.
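A minimal sketch of the difference (the sample name and record here are just taken from the data above):

from collections import defaultdict

# with a plain dict you have to supply the default on every access
plain = {}
plain.setdefault("sample1", []).append({"ID": "ENST00000619216.1", "Expression": 4.64528})

# with a defaultdict the empty list is created automatically on first access
auto = defaultdict(list)
auto["sample1"].append({"ID": "ENST00000619216.1", "Expression": 4.64528})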
Second, I think the structure you proposed is a bit off; you should use arrays in a few places (in the JSON-like structure below):
{
  DatasetName1: {
    SampleName1: {
      Type: {
        Normalization1: [
          { ID1: value, Expression: value },
          { ID2: value, Expression: value },
          ...
          { ID60000: value, Expression: value }
        ],
        Normalization2: [
          { ID1: value, Expression: value },
          { ID2: value, Expression: value },
          ...
          { ID60000: value, Expression: value }
        ],
        Normalization3: [
          { ID1: value, Expression: value },
          { ID2: value, Expression: value },
          ...
          { ID60000: value, Expression: value }
        ]
      }
    },
    SampleName2: {
      Type: {
        Normalization1: [
          { ID1: value, Expression: value },
          { ID2: value, Expression: value },
          ...
          { ID60000: value, Expression: value }
        ],
        Normalization2: [
          { ID1: value, Expression: value },
          { ID2: value, Expression: value },
          ...
          { ID60000: value, Expression: value }
        ],
        Normalization3: [
          { ID1: value, Expression: value },
          { ID2: value, Expression: value },
          ...
          { ID60000: value, Expression: value }
        ]
      }
    },
    ...
    SampleName20000: {
      Type: {
        Normalization1: [
          { ID1: value, Expression: value },
          { ID2: value, Expression: value },
          ...
          { ID60000: value, Expression: value }
        ],
        Normalization2: [
          { ID1: value, Expression: value },
          { ID2: value, Expression: value },
          ...
          { ID60000: value, Expression: value }
        ],
        Normalization3: [
          { ID1: value, Expression: value },
          { ID2: value, Expression: value },
          ...
          { ID60000: value, Expression: value }
        ]
      }
    }
  },
  DatasetName2: {
    ...
  },
  ...
}
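In Python those levels are just nested dicts (with a list at the innermost level), and json.dump reproduces the nesting for free. A minimal sketch, using one dataset/sample/normalization from the question:

import json

results = {
    "pnoc": {                                  # dataset level
        "SampleName1": {                       # sample level
            "Type": {                          # fixed "Type" level
                "kallisto": [                  # normalization level
                    {"ID": "ENST00000619216.1", "Expression": 4.64528},
                ],
            },
        },
    },
}
print(json.dumps(results, indent=2))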
So the code you end up with (untested) should look something like this (assuming your normalization-method logic is correct):
from glob import glob
from os import path
from json import dump
from collections import defaultdict

# define datasets, and result dict
datasets, results = ['pnoc'], defaultdict(dict)

# open file in append mode
with open('mydict', 'a') as f:
    # traverse through folders of datasets
    for d in datasets:
        for s in glob(d + "/data" + "/*.tsv"):
            # get the basename without extension and path
            fname = path.splitext(path.basename(s))[0]
            # split the basename to get sample name and norm method
            sname, keyword, norm = fname.partition('.')
            # determine norm method based on filename
            if norm == "abundance":
                norm = "kallisto"
            elif norm == "rsem_genes.results":
                norm = "rsem_genes"
            else:
                norm = "rsem_isoforms"
            # reuse the entry for this sample if another normalization
            # method was already read, otherwise create a fresh one
            sample = results[d].setdefault(sname, {"Type": defaultdict(list)})
            # read each file
            with open(s) as samp:
                next(samp)  # skip the header line of the file
                # loop through each line and extract the ID and TPM
                for (tid, _, __, ___, tpm) in (line.split('\t') for line in samp):
                    # add this record to the list for the respective normalization method
                    sample['Type'][norm].append({"ID": tid, "Expression": float(tpm)})
    dump(results, f)
This will save the results in JSON format.
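To sanity-check the output you could load the file back and walk down the levels; the sample name below is hypothetical:

import json

with open('mydict') as f:
    results = json.load(f)

# dataset -> sample -> "Type" -> normalization method -> list of records
for record in results["pnoc"]["SampleName1"]["Type"]["kallisto"][:3]:
    print(record["ID"], record["Expression"])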