Python:将文本文件转换为多级 JSON

时间:2016-08-10 16:45:52

标签: python json

我正在用 Python 编写一个脚本,递归遍历每个文件,并根据文件内容创建一个 JSON 对象。输入文件如下所示:

target_id   length  eff_length  est_counts  tpm
ENST00000619216.1   68  33.8839 2.83333 4.64528
ENST00000473358.1   712 428.88  0   0
ENST00000469289.1   535 306.32  0   0
ENST00000607096.1   138 69.943  0   0
ENST00000417324.1   1187    844.464 0   0
ENST00000461467.1   590 342.551 3.44007 0.557892
ENST00000335137.3   918 588.421 0   0
ENST00000466430.5   2748    2405.46 75.1098 1.73463
ENST00000495576.1   1319    976.464 11.1999 0.637186

这是我的脚本:

import glob
import os
import json

# Names of the dataset folders to scan; each one is expected to contain a
# "data" sub-folder holding the per-sample .tsv files.
datasets = ['pnoc']

# Accumulates the parsed rows: sample name -> list of {"ID", "Expression"}.
data = {}

# traverse through folders of datasets
for d in datasets:
    samples = glob.glob(d + "/data" + "/*.tsv")
    for s in samples:
        # get the SampleName without extension and path
        fname = os.path.splitext(os.path.basename(s))[0]

        # split the basename at the first '.' to get sample name and the
        # remainder that identifies the normalization method
        sname, keyword, norm = fname.partition('.')

        # determine Normalization method based on filename
        # NOTE: `norm` is computed here but not yet used in the output below.
        if norm == "abundance":
            norm = "kallisto"
        elif norm == "rsem_genes.results":
            norm = "rsem_genes"
        else:
            norm = "rsem_isoforms"

        # read each file
        with open(s) as samp:
            # skip the header row; the default keeps an empty file from
            # raising StopIteration
            next(samp, None)
            for line in samp:
                # ignore blank lines (e.g. a trailing newline at end of file)
                if not line.strip():
                    continue
                # drop the line terminator so the last column (tpm) does not
                # carry a trailing "\n" into the output
                sp = line.rstrip('\r\n').split('\t')
                data.setdefault(sname, []).append({"ID": sp[0], "Expression": sp[4]})

# Serialise exactly once, after everything has been read. Dumping inside the
# loop re-wrote the whole (growing) dict for every input row, and appending
# several JSON documents to one file does not produce valid JSON, so the file
# is opened in write mode and replaced on every run.
with open('mydict', 'w') as f:
    json.dump(data, f)

我希望得到的 JSON 对象结构大致如下:

# 20000 Sample names, 3 Normalization methods and 60000 IDs in each file.
DatasetName1 {
    SampleName1 {
        Type {
            Normalization1 {
                { ID1: value, Expression: value },
                { ID2: value, Expression: value },
                ...
                { ID60000: value, Expression: value }
            },
            Normalization2 {
                { ID1: value, Expression: value },
                { ID2: value, Expression: value },
                ...
                { ID60000: value, Expression: value }
            },
            Normalization3 {
                { ID1: value, Expression: value },
                { ID2: value, Expression: value },
                ...
                { ID60000: value, Expression: value }
            }
        }   
    },
    SampleName2 {
        Type {
            Normalization1 {
                { ID1: value, Expression: value },
                { ID2: value, Expression: value },
                ...
                { ID60000: value, Expression: value }
            },
            Normalization2 {
                { ID1: value, Expression: value },
                { ID2: value, Expression: value },
                ...
                { ID60000: value, Expression: value }
            },
            Normalization3 {
                { ID1: value, Expression: value },
                { ID2: value, Expression: value },
                ...
                { ID60000: value, Expression: value }
            }
        }   
    },
    ...
    SampleName20000{
        Type {
            Normalization1 {
                { ID1: value, Expression: value },
                { ID2: value, Expression: value },
                ...
                { ID60000: value, Expression: value }
            },
            Normalization2 {
                { ID1: value, Expression: value },
                { ID2: value, Expression: value },
                ...
                { ID60000: value, Expression: value }
            },
            Normalization3 {
                { ID1: value, Expression: value },
                { ID2: value, Expression: value },
                ...
                { ID60000: value, Expression: value }
            }
        }
    }
}

所以我的问题是 - 将文本文件转换为JSON时,如何在JSON输出中设置级别?

谢谢!

1 个答案:

答案 0 :(得分:0)

首先,您应该使用defaultdict而不是一遍又一遍地设置默认值。

其次,我认为你提出的结构有些问题,应该在其中使用数组(以下为类 JSON 的结构):

{
    DatasetName1: {
        SampleName1: {
            Type: {
                Normalization1: [
                    { ID1: value, Expression: value },
                    { ID2: value, Expression: value },
                    ...
                    { ID60000: value, Expression: value }
                ],
                Normalization2: [
                    { ID1: value, Expression: value },
                    { ID2: value, Expression: value },
                    ...
                    { ID60000: value, Expression: value }
                ],
                Normalization3: [
                    { ID1: value, Expression: value },
                    { ID2: value, Expression: value },
                    ...
                    { ID60000: value, Expression: value }
                ]
            }
        },
        SampleName2: {
            Type: {
                Normalization1: [
                    { ID1: value, Expression: value },
                    { ID2: value, Expression: value },
                    ...
                    { ID60000: value, Expression: value }
                ],
                Normalization2: [
                    { ID1: value, Expression: value },
                    { ID2: value, Expression: value },
                    ...
                    { ID60000: value, Expression: value }
                ],
                Normalization3: [
                    { ID1: value, Expression: value },
                    { ID2: value, Expression: value },
                    ...
                    { ID60000: value, Expression: value }
                ]
            }
        },
        ...
        SampleName20000: {
            Type: {
                Normalization1: [
                    { ID1: value, Expression: value },
                    { ID2: value, Expression: value },
                    ...
                    { ID60000: value, Expression: value }
                ],
                Normalization2: [
                    { ID1: value, Expression: value },
                    { ID2: value, Expression: value },
                    ...
                    { ID60000: value, Expression: value }
                ],
                Normalization3: [
                    { ID1: value, Expression: value },
                    { ID2: value, Expression: value },
                    ...
                    { ID60000: value, Expression: value }
                ]
            }
        }
    },
    DatasetName2: {
        ...
    },
    ...
}

所以最终的代码(未经测试)应该类似下面这样(前提是你根据文件名判断归一化方法的逻辑是正确的):

from glob import glob
from os import path
from json import dump
from collections import defaultdict

# Names of the dataset folders to scan; each one is expected to contain a
# "data" sub-folder holding the per-sample .tsv files.
datasets = ['pnoc']


def normalization_method(suffix):
    """Return the normalization label for a file-name suffix.

    ``suffix`` is everything after the first '.' of the extension-less
    basename, e.g. "abundance" or "rsem_genes.results". Any suffix other
    than those two is labelled "rsem_isoforms".
    """
    if suffix == "abundance":
        return "kallisto"
    if suffix == "rsem_genes.results":
        return "rsem_genes"
    return "rsem_isoforms"


def collect_expression(dataset_names):
    """Build the nested dataset -> sample -> "Type" -> normalization mapping.

    For every name in ``dataset_names`` all files matching
    ``<name>/data/*.tsv`` are read. The first line of each file is skipped as
    a header; from every other non-blank line the first column is stored as
    "ID" and the fifth column, converted to float, as "Expression".

    Returns a dict shaped like::

        {dataset: {sample: {"Type": {normalization: [{"ID", "Expression"}, ...]}}}}

    Raises ValueError if the fifth column is not a number and IndexError if
    a non-blank line has fewer than five tab-separated columns.
    """
    collected = defaultdict(dict)

    for dataset in dataset_names:
        for tsv_path in glob(dataset + "/data" + "/*.tsv"):
            # get the basename without extension and path
            fname = path.splitext(path.basename(tsv_path))[0]

            # split at the first '.' into sample name and method suffix
            sname, _, suffix = fname.partition('.')
            norm = normalization_method(suffix)

            # Reuse the entry of a sample that was already seen: one sample
            # has a separate file per normalization method, and replacing the
            # entry would discard the methods read from earlier files.
            sample = collected[dataset].setdefault(
                sname, {"Type": defaultdict(list)})

            with open(tsv_path) as samp:
                # skip the header row; the default keeps an empty file from
                # raising StopIteration
                next(samp, None)

                for line in samp:
                    # ignore blank lines (e.g. a trailing newline)
                    if not line.strip():
                        continue
                    fields = line.rstrip('\r\n').split('\t')
                    sample["Type"][norm].append(
                        {"ID": fields[0], "Expression": float(fields[4])})

    return collected


# build the nested structure for all configured datasets
results = collect_expression(datasets)

# Write mode rather than append: appending a second JSON document to an
# existing file would leave it unparseable, so each run replaces the output.
with open('mydict', 'w') as f:
    dump(results, f)

这将以JSON格式保存结果。