从在线压缩的CSV创建嵌套JSON

时间:2018-04-02 07:10:04

标签: python json python-2.7 csv itertools

我对Python很陌生,我需要在Python 2.7中仅使用标准库,从在线压缩的CSV文件创建嵌套JSON。我已经弄清楚了如何访问并解压缩该文件,但在解析时遇到了一些麻烦。基本上,我需要为每个主键创建一个包含三个顶层元素的JSON输出:

  1. 主键(由第0,2,3和4列组成)
  2. 字典,是该PK的观察值的时间序列(即:日期:观察值)
  3. 元数据字典(产品、流向、单位,以及理想情况下每个观察点质量的嵌套时间序列)。

    from StringIO import StringIO 
    from urllib import urlopen 
    from zipfile import ZipFile
    from datetime import datetime
    import itertools as it
    import csv 
    import sys
    
    url = urlopen("https://www.jodidata.org/_resources/files/downloads/gas-data/jodi_gas_csv_beta.zip")
    myzip = ZipFile(StringIO(url.read())) 
    with myzip.open('jodi_gas_beta.csv','r' ) as myCSV:         
        #Read the data  
        reader=csv.DictReader(myCSV)
        #Sort the data by PK + Time for timeseries
        reader=sorted(reader,key=lambda row: row['REF_AREA'],row['ENERGY_PRODUCT'],row['FLOW_BREAKDOWN'],row['UNIT_MEASURE'],row['TIME_PERIOD']))
    
    #initialize dictionaries for output
    myData=[] 
    keys=[] 
    groups=[] 
    
    #limiting to first 200 rows for testing ONLY
    for k, g in it.groupby(list(it.islice(reader,200)),key=lambda row: row['REF_AREA'],row['ENERGY_PRODUCT'],row['FLOW_BREAKDOWN'],row['UNIT_MEASURE'])):
        keys.append(k)
        groups.append(list(g))
        myData.append({'MyPK': ''.join(k),  #captures the PKs
            'TimeSeries' : dict((zip(e['TIME_PERIOD'],e['OBS_VALUE']))) for e in g], #Not working properly, want a time series dictionary here
            #TODO: Dictionary of metadata here (with nested time series, if possible)})
    
    #TODO: Output as a JSON string
    
所以,结果看起来应该是这样的:

    {
        "myPK": "AENATGASEXPLNGM3",
        "TimeSeries":[
          ["2015-01", 756],
          ["2015-02", 572],
          ["2015-03", 654]
        ],
        "Metadata":{
          "Country":"AE",
          "Product":"NATGAS",
          "Flow":"EXPLNG",
          "Unit":"M3",
          "Quality":[
              ["2015-01", 3],
              ["2015-02", 3],
              ["2015-03", 3]
            ]
        }
    }
    

1 个答案:

答案 0 :(得分:0)

虽然您似乎没有花费太多精力自己解决问题,但我认为这就是您想要的。它利用 operator.itemgetter() 函数来简化从各种容器(例如 `list` 和 `dict`)中检索一系列不同项目的过程。

我还修改了代码,使其更加贴近PEP 8 - Style Guide for Python Code

from contextlib import closing
import csv
import datetime
import itertools as it
import json
from operator import itemgetter
from pprint import pprint
from StringIO import StringIO
import sys
from urllib import urlopen
from zipfile import ZipFile

# Utility.
def typed_itemgetter(items, callables):
    """Build a row accessor that fetches several items and converts each one.

    Like operator.itemgetter(), but every retrieved value is passed through
    the callable found at the same position in `callables`. A falsy entry
    (conventionally None) leaves the corresponding value untouched.

    Args:
        items: Keys (or indices) to look up in each row.
        callables: Converters paired positionally with `items`; unpaired
            trailing entries on either side are ignored.

    Returns:
        A function that takes a row (anything supporting ``row[key]``) and
        returns a list holding one value per paired item.
    """
    # Pair keys with converters once, up front, so one-shot iterables are
    # consumed a single time rather than on every call of the accessor.
    pairs = list(zip(items, callables))

    def getter(row):
        # Index the row key by key instead of going through
        # itemgetter(*items): for a single key itemgetter returns the bare
        # value (not a 1-tuple), which would then be zipped element by
        # element -- character by character for a string.
        return [convert(row[key]) if convert else row[key]
                for key, convert in pairs]

    return getter

# Download the zipped CSV into memory. closing() guarantees the HTTP response
# is released even if reading fails (the object returned by Python 2's
# urllib.urlopen() is not itself a context manager).
with closing(urlopen("https://www.jodidata.org/_resources/files/downloads/gas-data/jodi_gas_csv_beta.zip")) as url:
    myzip = ZipFile(StringIO(url.read()))

# Load every row, sorted so that rows sharing a primary key are adjacent and
# ordered by time period within each key (groupby below relies on this).
with myzip.open('jodi_gas_beta.csv', 'r') as myCSV:
    primary_key = itemgetter('REF_AREA', 'ENERGY_PRODUCT', 'FLOW_BREAKDOWN', 'UNIT_MEASURE',
                             'TIME_PERIOD')
    rows = sorted(csv.DictReader(myCSV), key=primary_key)

# Limit to first 200 rows for TESTING.
# NOTE: the cut-off is applied after sorting, so the last group may be
# truncated part-way through its time series.
rows = list(it.islice(rows, 200))

# Row accessors are built once here rather than once per row.
keyfunc = itemgetter('REF_AREA', 'ENERGY_PRODUCT', 'FLOW_BREAKDOWN', 'UNIT_MEASURE')
get_observation = typed_itemgetter(('TIME_PERIOD', 'OBS_VALUE'),
                                   (None, lambda x: int(float(x))))
get_quality = typed_itemgetter(('TIME_PERIOD', 'ASSESSMENT_CODE'), (None, int))

# Group the data by designated keys (aka "primary key") and create the
# corresponding JSON-like Python data-structure, one entry per group.
myData = []
for key, group in it.groupby(rows, key=keyfunc):
    group = list(group)  # groupby yields a one-shot iterator; it is read twice
    metadata = dict(zip(("Country", "Product", "Flow", "Unit"), key))
    metadata['Quality'] = [get_quality(row) for row in group]
    myData.append({
        'myPK': ''.join(key),
        'TimeSeries': [get_observation(row) for row in group],
        'Metadata': metadata,
    })

# Display the data to be turned into JSON.
print('myData:')
pprint(myData)

# To create JSON format output, use something like:
with open('myData.json', 'w') as fp:
    json.dump(myData, fp, indent=2)

打印输出的开头部分:

myData:
[{'Metadata': {'Country': 'AE',
               'Flow': 'EXPLNG',
               'Product': 'NATGAS',
               'Quality': [['2015-01', 3],
                           ['2015-02', 3],
                           ['2015-03', 3],
                           ['2015-04', 3],
                           ['2015-05', 3],
                           ['2015-06', 3],
                           ['2015-07', 3],
                           ['2015-08', 3],
                           ['2015-09', 3],
                           ['2015-10', 3],
                           ['2015-11', 3],
                           ['2015-12', 3],
                           ['2016-01', 3],
                           ['2016-02', 3],
                           ['2016-04', 3],
                           ['2016-05', 3]],
               'Unit': 'M3'},
  'TimeSeries': [['2015-01', 756],
                 ['2015-02', 572],
                 ['2015-03', 654],
                 ['2015-04', 431],
                 ['2015-05', 681],
                 ['2015-06', 683],
                 ['2015-07', 751],
                 ['2015-08', 716],
                 ['2015-09', 830],
                 ['2015-10', 580],
                 ['2015-11', 659],
                 ['2015-12', 659],
                 ['2016-01', 742],
                 ['2016-02', 746],
                 ['2016-04', 0],
                 ['2016-05', 0]],
  'myPK': 'AENATGASEXPLNGM3'},
 {'Metadata': {'Country': 'AE',
               'Flow': 'EXPPIP',
               'Product': 'NATGAS',
               'Quality': [['2015-01', 3],
                           ['2015-02', 3],
                           ['2015-03', 3],
                           ['2015-04', 3],
                           ['2015-05', 3],
                           ['2015-06', 3],
                           ['2015-07', 3],
                           ['2015-08', 3],
                           ['2015-09', 3],
                           ['2015-10', 3],
                           ['2015-11', 3],
                           ['2015-12', 3],
                           ['2016-01', 3],
                           ['2016-02', 3],
                           ['2016-03', 3],
                           ['2016-04', 3],
    # etc, etc...
]