Question

我创建了一个课程来保存我的研究实验结果（我是一名EE博士生），如

class Trial: def __init__(self, subID, triID): self.filePath = '' # file path of the folder self.subID = -1 # int self.triID = -1 # int self.data_A = -1 # numpy array self.data_B = -1 # numpy array ......

它是许多bool，int和numpy数组的混合体。你明白了。我读到如果数据是hdf5格式，加载时会更快。我可以使用我的数据，这是我的Trial对象的python列表吗？

请注意，stackoverflow上有similar question。但它只有一个答案，但没有回答这个问题。相反，它将OP的自定义类分解为基本数据类型并将它们存储到单个数据集中。我不反对这样做，但我想知道它是否是唯一的方式，因为它违背了面向对象的哲学。

Answer 1

我尚未测试以下解决方案的速度和存储效率。 HDF5确实支持可与numpy“结构化数组”结合使用的“复合数据类型”，该数组支持混合变量类型，例如在类对象中遇到的类型。

"""
Created on Tue Dec 10 21:26:54 2019

@author: Christopher J. Burke
Give a worked example of saving a list of class objects with mixed
storage types to a HDF5 file and reading in file back to a list of class
objects.  The solution is inspired by this bug report
https://github.com/h5py/h5py/issues/735
and the numpy and hdf5 documentation
"""

import numpy as np
import h5py

class test_object:
    """ Define a storage class that keeps info that we want to record
      for every object
    """
    # explictly state the name, datatype and shape for every
    #  class variable
    #  The names MUST exactly match the class variable names in the __init__
    store_names = ['a', 'b', 'c', 'd', 'e']
    store_types = ['i8', 'i4', 'f8', 'S80', 'f8']
    store_shapes = [None, None, None, None, [4]]
    # Make the tuples that will define the numpy structured array
    # https://docs.scipy.org/doc/numpy/user/basics.rec.html
    sz = len(store_names)
    store_def_tuples = []
    for i in range(sz):
        if store_shapes[i] is not None:
            store_def_tuples.append((store_names[i], store_types[i], store_shapes[i]))
        else:
            store_def_tuples.append((store_names[i], store_types[i]))
    # Actually define the numpy structured/compound data type
    store_struct_numpy_dtype = np.dtype(store_def_tuples)

    def __init__(self):
        self.a = 0
        self.b = 0
        self.c = 0.0
        self.d = '0'
        self.e = [0.0, 0.0, 0.0, 0.0]

    def store_objlist_as_hd5f(self, objlist, fileName):
        """Function to save the class structure into hdf5
        objlist -  is a list of the test_objects
        fileName - is the h5 filename for output
        """        
        # First create the array of numpy structered arrays
        np_dset = np.ndarray(len(objlist), dtype=self.store_struct_numpy_dtype)
        # Convert the class variables into the numpy structured dtype
        for i, curobj in enumerate(objlist):
            for j in range(len(self.store_names)):
                np_dset[i][self.store_names[j]] = getattr(curobj, self.store_names[j])
        # Data set should be all loaded ready to write out
        fp = h5py.File(fileName, 'w')
        hf_dset = fp.create_dataset('dset', shape=(len(objlist),), dtype=self.store_struct_numpy_dtype)
        hf_dset[:] = np_dset
        fp.close()

    def fill_objlist_from_hd5f(self, fileName):
        """ Function to read in the hdf5 file created by store_objlist_as_hdf5
          and store the contents into a list of test_objects
          fileName - si the h5 filename for input
         """
        fp = h5py.File(fileName, 'r')
        np_dset = np.array(fp['dset'])
        # Start with empty list
        all_objs = []
        # iterate through the numpy structured array and save to objects
        for i in range(len(np_dset)):
            tmp = test_object()
            for j in range(len(self.store_names)):
                setattr(tmp, self.store_names[j], np_dset[i][self.store_names[j]])
            # Append object to list
            all_objs.append(tmp)
        return all_objs

if __name__ == '__main__':

    all_objs = []    
    for i in range(3):
        # instantiate tce_seed object
        tmp = test_object()
        # Put in some dummy data into object
        tmp.a = int(i)
        tmp.b = int(i)
        tmp.c = float(i)
        tmp.d = '{0} {0} {0} {0}'.format(i)
        tmp.e = np.full([4], i, dtype=np.float)
        all_objs.append(tmp)

    # Write out hd5 file
    tmp.store_objlist_as_hd5f(all_objs, 'test_write.h5')

    # Read in hd5 file
    all_objs = []
    all_objs = tmp.fill_objlist_from_hd5f('test_write.h5')

    # verify the output is as expected
    for i, curobj in enumerate(all_objs):
        print('Object {0:d}'.format(i))
        print('{0:d} {1:d} {2:f}'.format(curobj.a, curobj.b, curobj.c))
        print('{0} {1}'.format(curobj.d.decode('ASCII'), curobj.e))

Answer 2

这是我用来保存这类数据的小班级。您可以通过执行类似的操作来使用它。

dc = DataContainer()
dc.trials = <your list of trial objects here>
dc.save('mydata.pkl')

然后加载do ..

dc = DataContainer.load('mydata.pkl')

这是DataContainer文件：

import gzip
import cPickle as pickle

# Simple container with load and save methods.  Declare the container
# then add data to it.  Save will save any data added to the container.
# The class automatically gzips the file if it ends in .gz
#
# Notes on size and speed (using UbuntuDialog data)
#       pkl     pkl.gz
# Save  11.4s   83.7s
# Load   4.8s   45.0s
# Size  596M    205M
#
class DataContainer(object):
    @staticmethod
    def isGZIP(filename):
        if filename.split('.')[-1] == 'gz':
            return True
        return False

    # Using HIGHEST_PROTOCOL is almost 2X faster and creates a file that
    # is ~10% smaller.  Load times go down by a factor of about 3X.
    def save(self, filename='DataContainer.pkl'):
        if self.isGZIP(filename):
            f = gzip.open(filename, 'wb')
        else:
            f = open(filename, 'wb')
        pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)
        f.close()

    # Note that loading to a string with pickle.loads is about 10% faster
    # but probaly comsumes a lot more memory so we'll skip that for now.
    @classmethod
    def load(cls, filename='DataContainer.pkl'):
        if cls.isGZIP(filename):
            f = gzip.open(filename, 'rb')
        else:
            f = open(filename, 'rb')
        n = pickle.load(f)
        f.close()
        return n

根据您的使用情况，您可以使用顶部所描述的，作为基类，或者只是将pickle.dump行复制到您的代码中。

如果您确实拥有大量数据并且在每次运行测试程序时都没有使用所有数据，那么还有一些其他选项，例如数据库，但上面是关于假设您的最佳简单选项每次运行都需要大部分数据。

如何将我自己的类对象存储到hdf5中？

2 个答案: