Question

我是Python / JSON的新手，所以请多多包涵。我可以在R中完成这个操作，但我们需要使用Python，以便将其迁移到Python / Spark / MongoDB。另外，我这里只发布了一个最小的子集——实际上我有更多的文件类型，所以如果有人能帮我解决这个问题，我就可以在此基础上集成更多的文件和文件类型：

回到我的问题：

我有两个tsv输入文件，我需要合并并转换为JSON。这两个文件都有基因和样本列以及一些其他列。但是，gene和sample可能会或可能不会像我所示的那样重叠 - f2.tsv包含f1.tsv中的所有基因，但也有一个额外的基因g3。同样，这两个文件在sample列中都有重叠和非重叠的值。

# f1.tsv – has gene, sample and additional column other1

$ cat f1.tsv 
gene    sample  other1
g1      s1      a1
g1      s2      b1
g1      s3a     c1
g2      s4      d1

# f2.tsv – has gene, sample and additional columns other21, other22

$ cat f2.tsv 
gene    sample  other21 other22
g1      s1      a21     a22
g1      s2      b21     b22
g1      s3b     c21     c22
g2      s4      d21     d22
g3      s5      f21     f22

基因形成顶层，每个基因有多个样本，形成第二个层次，附加列形成extras，这是第三个层次。附加内容分为两个因为一个文件有other1，第二个文件有other21和other22。我稍后将包含的其他文件将包含其他字段，如other31和other32等等，但它们仍会包含基因和样本列。

# expected output – JSON by combining both tsv files. 
$ cat output.json 
[{
  "gene":"g1",
  "samples":[
    {
      "sample":"s2",
      "extras":[
        {
          "other1":"b1"
        },
        {
          "other21":"b21",
          "other22":"b22"
        }
      ]
    },
    {
      "sample":"s1",
      "extras":[
        {
          "other1":"a1"
        },
        {
          "other21":"a21",
          "other22":"a22"
        }
      ]
    },
    {
      "sample":"s3b",
      "extras":[
        {
          "other21":"c21",
          "other22":"c22"
        }
      ]
    },
    {
      "sample":"s3a",
      "extras":[
        {
          "other1":"c1"
        }
      ]
    }
  ]
},{
  "gene":"g2",
  "samples":[
    {
      "sample":"s4",
      "extras":[
        {
          "other1":"d1"
        },
        {
          "other21":"d21",
          "other22":"d22"
        }
      ]
    }
  ]
},{
  "gene":"g3",
  "samples":[
    {
      "sample":"s5",
      "extras":[
        {
          "other21":"f21",
          "other22":"f22"
        }
      ]
    }
  ]
}]

如何根据两个常见列将两个csv文件转换为单个多级JSON？

我真的很感激我能得到的任何帮助。

谢谢！

Answer 1

这看起来正适合用pandas来解决！遗憾的是，pandas只能帮我们完成一部分工作，剩下的操作需要我们自己动手。这段代码既不快，也不是特别高效，但它能完成任务。

import pandas as pd
import json
from collections import defaultdict

# Load both TSV files as pandas DataFrames. sep=r'\s+' splits on any run of
# whitespace and replaces the deprecated delim_whitespace=True option.
f1 = pd.read_table('f1.tsv', sep=r'\s+')
f2 = pd.read_table('f2.tsv', sep=r'\s+')

# Outer-merge on the two shared key columns so gene/sample pairs that exist
# in only one of the files are kept.
newframe = f1.merge(f2, how='outer', on=['gene', 'sample'])

# Round-trip through JSON to get a list of plain dicts (one per merged row).
# Cells that are missing after the merge come back as None (JSON null).
pythonList = json.loads(newframe.to_json(orient='records'))


# Maps gene name -> {'gene': ..., 'samples': [...]}.
newDict = {}
for d in pythonList:
    gene = d['gene']
    sample = d['sample']
    sampleDict = {'sample': sample,
                  'extras': []}

    # Collects this row's "other*" columns into one dict per group.
    extrasdict = defaultdict(dict)

    if gene not in newDict:
        newDict[gene] = {'gene': gene, 'samples': []}

    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for key, value in d.items():
        # Skip the key columns and cells that were missing in the merge.
        if 'other' not in key or value is None:
            continue
        suffix = key.split('other')[-1]
        # A one-character suffix ("other1") goes to group '1'; otherwise the
        # group is the first character of the suffix ("other21" -> '2').
        if len(suffix) == 1:
            extrasdict['1'][key] = value
        else:
            extrasdict[suffix[0]][key] = value

    for value in extrasdict.values():
        sampleDict['extras'].append(value)

    newDict[gene]['samples'].append(sampleDict)

newList = list(newDict.values())

# Parenthesised form prints the same text on Python 2 and Python 3.
print(json.dumps(newList))

如果这看起来像是一个适合您的解决方案，我很乐意花一些时间来清理它，使其更具可读性和效率。

PS：如果你喜欢R，那么pandas是不二之选（它正是为了在Python中为数据提供类似R的接口而编写的）

Answer 2

这是另一种方案。考虑到您之后会添加更多文件，我尽量让它易于管理。您可以在命令行上运行它并传入参数，每个参数对应一个要添加的文件。基因/样本名称存储在字典中以提高效率。所需JSON对象的格式化由每个类的format()方法完成。希望这会有所帮助。

import csv, json, sys

class Sample(object):
    """A named sample together with its list of extras dicts."""

    def __init__(self, name, extras):
        self.name = name
        # Wrapped in a list so further extras can be added later.
        self.extras = [extras]

    def format(self):
        """Return this sample as a dict with 'sample' and 'extras' keys."""
        return {'sample': self.name, 'extras': self.extras}

    def add_extras(self, extras):
        """Append every item of the iterable *extras* to this sample's list."""
        self.extras.extend(extras)

class Gene(object):
    """A named gene whose samples are held in a dict keyed by sample name."""

    def __init__(self, name, samples):
        self.name = name
        self.samples = samples

    def format(self):
        """Return this gene as a dict; its samples are sorted by name."""
        formatted = [entry.format() for entry in self.samples.values()]
        formatted.sort(key=lambda item: item['sample'])
        return {'gene': self.name, 'samples': formatted}

    def create_or_add_samples(self, new_samples):
        """Merge the sample objects in the dict *new_samples* into this gene.

        A sample whose name is already stored contributes only its extras;
        a sample with a new name is stored as-is.
        """
        for incoming in new_samples.values():
            if incoming.name not in self.samples:
                self.samples[incoming.name] = incoming
                continue
            self.samples[incoming.name].add_extras(incoming.extras)

class Genes(object):
    """Collection of gene objects kept in a dict keyed by gene name."""

    def __init__(self):
        self.genes = {}

    def format(self):
        """Return every gene's formatted dict as a list sorted by gene name."""
        formatted = [entry.format() for entry in self.genes.values()]
        return sorted(formatted, key=lambda item: item['gene'])

    def create_or_add_gene(self, gene):
        """Store *gene* if its name is new, else merge its samples.

        When a gene of the same name is already stored, the incoming gene's
        samples are handed to the stored gene's create_or_add_samples().
        """
        if gene.name in self.genes:
            self.genes[gene.name].create_or_add_samples(gene.samples)
        else:
            self.genes[gene.name] = gene

def row_to_gene(headers, row):
    """Build a Gene holding a single Sample from one data row.

    headers -- column names, matched to the cells of *row* by position
    row     -- the cell values of one line

    The "gene" and "sample" columns supply the two names; every other
    column is collected into the sample's extras dict. A name stays ""
    when its column does not occur among the headers of the row's cells.
    """
    gene_name = ""
    sample_name = ""
    extras = {}
    for index, cell in enumerate(row):
        column = headers[index]
        if column == "gene":
            gene_name = cell
        elif column == "sample":
            sample_name = cell
        else:
            extras[column] = cell
    return Gene(gene_name, {sample_name: Sample(sample_name, extras)})

if __name__ == '__main__':
    # Every command-line argument is the path of one tab-separated file.
    delim = "\t"
    genes = Genes()
    files = sys.argv[1:]

    for path in files:
        print("Reading " + str(path))
        with open(path, 'r') as handle:
            headers = []
            for row in csv.reader(handle, delimiter=delim):
                if headers:
                    genes.create_or_add_gene(row_to_gene(headers, row))
                else:
                    # Until a non-empty row has been stored as the header,
                    # each row read is assigned to headers.
                    headers = row

    # Serialize once, then both print the JSON and write it to disk.
    result = json.dumps(genes.format(), indent=4)
    print(result)
    with open('json_output.txt', 'w') as output:
        output.write(result)

Answer 3

分步进行：

读取传入的tsv文件，并将来自不同基因的信息汇总到字典中。
处理上述字典，使其符合您想要的格式。
将结果写入JSON文件。

以下是代码：

import csv
import json
from collections import defaultdict

input_files = ['f1.tsv', 'f2.tsv']
output_file = 'genes.json'

# Step 1: group the remaining columns of every row by gene, then by sample.
gene_dict = defaultdict(lambda: defaultdict(list))
for path in input_files:
    with open(path, 'r') as tsv:
        for row in csv.DictReader(tsv, delimiter='\t'):
            # Remove the two key columns; what is left in the row is the
            # extras dict for this gene/sample pair.
            gene = row.pop('gene')
            sample = row.pop('sample')
            gene_dict[gene][sample].append(row)

# Step 2: reshape the nested dicts into the list-of-dicts output layout.
out = []
for gene, samples in gene_dict.items():
    sample_entries = []
    for sample, extras in samples.items():
        sample_entries.append({'sample': sample, 'extras': extras})
    out.append({'gene': gene, 'samples': sample_entries})

# Step 3: write the result as JSON.
with open(output_file, 'w') as f:
    json.dump(out, f)

Python：将两个CSV文件合并为多级JSON

3 个答案: