我发现xarray将数据集保存到netCDF非常慢。我怀疑这是因为.to_netcdf()命令首先必须在保存之前加载数据。举个例子,我得到以下时间:
示例a)
ds.to_netcdf(file_path) # ~6 minutes
示例b)
ds.load() # ~6 minutes
ds.to_netcdf(file_path) # <1 second
看起来变慢的原因在于数据加载。有没有办法绕过这一步加载,或者加快这个过程?
我正在使用的(a)的明确例子如下:
import xarray as xr
import numpy as np
from Python import config, read
from Python import array_opperations as ao
class Model(read.Read):
    """Load an ocean-model dataset and compute kinetic-energy gradient terms.

    NOTE(review): relies on the project-local modules ``config``, ``read``
    and ``array_opperations`` (``ao``); their exact semantics are assumed
    from how they are used here — confirm against their definitions.
    """

    def __init__(self, case, mean=1):
        # Resolve the on-disk location for this model case.
        self.readPath = config.readPath(case)
        # Open the NetCDF inputs plus raw binary grid files; load_dataset
        # uses xarray.open_mfdataset, so the data stay as lazy, chunked
        # arrays until a computation (or .to_netcdf) forces them.
        self.load_dataset(['Vorticity.nc', 'ssh.nc', 'vels_snap.nc', 'hFac.nc'],
                          ['DXC', 'DYC', 'DXG', 'DYG'])
        # Restrict to a single model year.
        self.ds = self.ds.sel(TIME=slice('1980-01-08', '1981-01-07'))
        self.KE(mean=mean)

    def KE(self, mean=1):
        """Calculate the kinetic-energy term of the momentum budget
        (KEscheme=0) and write the result to ``offlineKE.nc``.

        Parameters
        ----------
        mean : int or bool, default 1
            If truthy, additionally split the velocities into time-mean
            and eddy (Reynolds) parts and output KE gradients for both.
        """
        u = self.ds['UVEL']
        v = self.ds['VVEL']

        def calc(u, v):
            # KE averaged from the four surrounding u^2 / v^2 face values
            # (assumes a C-grid layout — TODO confirm against ao.roll).
            u2 = u**2
            v2 = v**2
            KE = 0.25 * (u2 + ao.roll(u2, 'LONGITUDE', -1) +
                         v2 + ao.roll(v2, 'LATITUDE', -1))
            # Minus the KE gradient, normalised by the grid spacing.
            KEx = - ao.padded_diff(KE, 'LONGITUDE') / self.ds['DXC']
            KEy = - ao.padded_diff(KE, 'LATITUDE') / self.ds['DYC']
            return KEx, KEy

        if mean:
            u_mean = u.mean(dim='TIME', skipna=True)
            v_mean = v.mean(dim='TIME', skipna=True)
            # Eddy (Reynolds) components: deviation from the time mean.
            u_rey = u - u_mean
            v_rey = v - v_mean
            KEx, KEy = calc(u_mean, v_mean)
            KExRey, KEyRey = calc(u_rey, v_rey)
            KE_terms = [KEx.rename('u_gradKE'), KEy.rename('v_gradKE'),
                        KExRey.rename('u_gradKERey'), KEyRey.rename('v_gradKERey')]
        else:
            KEx, KEy = calc(u, v)
            KE_terms = [KEx.rename('u_gradKE'), KEy.rename('v_gradKE')]

        KE_nc = xr.merge(KE_terms)
        # unlimited_dims expects an iterable of dimension names; pass a
        # list so older xarray versions do not iterate a bare string
        # character by character ('T', 'I', 'M', 'E').
        KE_nc.to_netcdf(self.readPath + 'offlineKE.nc',
                        unlimited_dims=['TIME'])
得益于xarray的延迟计算机制,KE函数运行得很快,但一旦执行到最后一行(.to_netcdf),就会变得非常慢。
这是我的数据集加载方法:
def load_dataset(self, nc_files, data_files):
    """Assemble ``self.ds`` from NetCDF files and raw binary grid files.

    Parameters
    ----------
    nc_files : list of str
        NetCDF file names (relative to ``self.readPath``), opened lazily
        with ``xarray.open_mfdataset`` using 200x200 spatial chunks.
    data_files : list of str
        Base names of binary ``.data``/``.meta`` pairs, merged in as 2-D
        (LATITUDE, LONGITUDE) variables named after the file.
    """
    file_paths = [self.readPath + file for file in nc_files]
    # Chunked open keeps the variables as lazy dask arrays until computed.
    self.ds = xr.open_mfdataset(file_paths,
                                chunks={'LATITUDE': 200, 'LONGITUDE': 200})
    LAT = self.ds.coords['LATITUDE'].values
    LON = self.ds.coords['LONGITUDE'].values
    for file in data_files:
        # readMeta/readBin are project helpers inherited from read.Read —
        # presumably MITgcm-style meta/data readers; verify against read.py.
        _, x, y, z, _, prec = self.readMeta(self.readPath + file + '.meta')
        data = self.readBin(file + '.data', x=x, y=y, z=z, dtype=prec)
        self.ds = xr.merge([self.ds,
                            xr.DataArray(data,
                                         coords=[LAT, LON],
                                         dims=['LATITUDE', 'LONGITUDE']
                                         ).to_dataset(name=file)])
    # Constant DRF field of 5000.0 everywhere; np.full allocates the
    # constant array directly instead of np.ones(...) * 5000.0.
    self.ds = xr.merge([self.ds,
                        xr.DataArray(np.full(self.ds['DXC'].shape, 5000.0),
                                     coords=[LAT, LON],
                                     dims=['LATITUDE', 'LONGITUDE']
                                     ).to_dataset(name='DRF')])