我发现xarray将数据集保存到netCDF非常慢。我怀疑这是因为.to_netcdf()命令首先必须在保存之前加载数据。举个例子,我得到以下时间:
示例a)
ds.to_netcdf(file_path) # ~6 minutes
示例b)
ds.load() # ~6 minutes
ds.to_netcdf(file_path) # <1 second
看起来变慢的原因在于数据加载。有没有办法绕过这一步加载,或者加快这个过程?
我正在使用的(a)的明确例子如下:
import xarray as xr
import numpy as np
from Python import config, read
from Python import array_opperations as ao
class Model(read.Read):
    """Load an ocean-model dataset and compute kinetic-energy gradient terms.

    NOTE(review): relies on the project-local modules ``config``, ``read``
    and ``array_opperations`` (``ao``); their exact semantics are assumed
    from how they are used here — confirm against their definitions.
    """

    def __init__(self, case, mean=1):
        # Resolve the on-disk location for this model case.
        self.readPath = config.readPath(case)
        # Open the NetCDF inputs plus raw binary grid files; load_dataset
        # uses xarray.open_mfdataset, so the data stay as lazy, chunked
        # arrays until a computation (or .to_netcdf) forces them.
        self.load_dataset(['Vorticity.nc', 'ssh.nc', 'vels_snap.nc', 'hFac.nc'],
                          ['DXC', 'DYC', 'DXG', 'DYG'])
        # Restrict to a single model year.
        self.ds = self.ds.sel(TIME=slice('1980-01-08', '1981-01-07'))
        self.KE(mean=mean)

    def KE(self, mean=1):
        """Calculate the kinetic-energy term of the momentum budget
        (KEscheme=0) and write the result to ``offlineKE.nc``.

        Parameters
        ----------
        mean : int or bool, default 1
            If truthy, additionally split the velocities into time-mean
            and eddy (Reynolds) parts and output KE gradients for both.
        """
        u = self.ds['UVEL']
        v = self.ds['VVEL']

        def calc(u, v):
            # KE averaged from the four surrounding u^2 / v^2 face values
            # (assumes a C-grid layout — TODO confirm against ao.roll).
            u2 = u**2
            v2 = v**2
            KE = 0.25 * (u2 + ao.roll(u2, 'LONGITUDE', -1) +
                         v2 + ao.roll(v2, 'LATITUDE', -1))
            # Minus the KE gradient, normalised by the grid spacing.
            KEx = - ao.padded_diff(KE, 'LONGITUDE') / self.ds['DXC']
            KEy = - ao.padded_diff(KE, 'LATITUDE') / self.ds['DYC']
            return KEx, KEy

        if mean:
            u_mean = u.mean(dim='TIME', skipna=True)
            v_mean = v.mean(dim='TIME', skipna=True)
            # Eddy (Reynolds) components: deviation from the time mean.
            u_rey = u - u_mean
            v_rey = v - v_mean
            KEx, KEy = calc(u_mean, v_mean)
            KExRey, KEyRey = calc(u_rey, v_rey)
            KE_terms = [KEx.rename('u_gradKE'), KEy.rename('v_gradKE'),
                        KExRey.rename('u_gradKERey'), KEyRey.rename('v_gradKERey')]
        else:
            KEx, KEy = calc(u, v)
            KE_terms = [KEx.rename('u_gradKE'), KEy.rename('v_gradKE')]

        KE_nc = xr.merge(KE_terms)
        # unlimited_dims expects an iterable of dimension names; pass a
        # list so older xarray versions do not iterate a bare string
        # character by character ('T', 'I', 'M', 'E').
        KE_nc.to_netcdf(self.readPath + 'offlineKE.nc',
                        unlimited_dims=['TIME'])
得益于xarray的延迟计算机制,KE函数运行得很快,但一旦执行到最后一行(.to_netcdf),就会变得非常慢。
这是我的数据集加载方法:
def load_dataset(self, nc_files, data_files):
    """Assemble ``self.ds`` from NetCDF files and raw binary grid files.

    Parameters
    ----------
    nc_files : list of str
        NetCDF file names (relative to ``self.readPath``), opened lazily
        with ``xarray.open_mfdataset`` using 200x200 spatial chunks.
    data_files : list of str
        Base names of binary ``.data``/``.meta`` pairs, merged in as 2-D
        (LATITUDE, LONGITUDE) variables named after the file.
    """
    file_paths = [self.readPath + file for file in nc_files]
    # Chunked open keeps the variables as lazy dask arrays until computed.
    self.ds = xr.open_mfdataset(file_paths,
                                chunks={'LATITUDE': 200, 'LONGITUDE': 200})
    LAT = self.ds.coords['LATITUDE'].values
    LON = self.ds.coords['LONGITUDE'].values
    for file in data_files:
        # readMeta/readBin are project helpers inherited from read.Read —
        # presumably MITgcm-style meta/data readers; verify against read.py.
        _, x, y, z, _, prec = self.readMeta(self.readPath + file + '.meta')
        data = self.readBin(file + '.data', x=x, y=y, z=z, dtype=prec)
        self.ds = xr.merge([self.ds,
                            xr.DataArray(data,
                                         coords=[LAT, LON],
                                         dims=['LATITUDE', 'LONGITUDE']
                                         ).to_dataset(name=file)])
    # Constant DRF field of 5000.0 everywhere; np.full allocates the
    # constant array directly instead of np.ones(...) * 5000.0.
    self.ds = xr.merge([self.ds,
                        xr.DataArray(np.full(self.ds['DXC'].shape, 5000.0),
                                     coords=[LAT, LON],
                                     dims=['LATITUDE', 'LONGITUDE']
                                     ).to_dataset(name='DRF')])