首先,感谢您为dask提供了所有功能,这是高度赞赏的!
但是,使用dask.distributed
在栅格化数据集上运行SVD时,虽然大多数数据集确实包含正确的值,但是当仅单个块仅包含NaN值时,似乎好像失败了。
我使用xarray.open_mfdataset(chunks={...})
读取了一个数据集,并尝试设置块大小,以便eofs.xarray包中使用的SVD计算(dask.array.linalg
)利用了我们的集群提供的核心,使用dask.distributed
客户端。
<xarray.Dataset>
Dimensions: (time: 8760, x: 1000, y: 840)
Coordinates:
* x (x) float64 2.452e+06 2.458e+06 2.462e+06 ... 7.442e+06 7.448e+06
* y (y) float64 1.352e+06 1.358e+06 1.362e+06 ... 5.542e+06 5.548e+06
* time (time) datetime64[ns] 2005-01-01 ... 2005-12-31T23:00:00
Data variables:
capacity (y, x) float64 dask.array<shape=(840, 1000), chunksize=(840, 840)>
capfac (time, y, x) float32 dask.array<shape=(8760, 840, 1000), chunksize=(876, 840, 840)>
但是,当我运行计算时,它失败并显示以下错误消息。
ValueError: error encountered in SVD, check that missing values are in the same places at each time and that all the values are not missing
查看完整的错误消息:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~/.conda/envs/spagat_py36/lib/python3.6/site-packages/eofs/standard.py in __init__(self, dataset, weights, center, ddof)
164 # Use the parallel Dask algorithm
--> 165 dsvd = dask.array.linalg.svd(dataNoMissing)
166 A, Lh, E = (x.compute() for x in dsvd)
~/.conda/envs/spagat_py36/lib/python3.6/site-packages/dask/array/linalg.py in svd(a)
803 """
--> 804 return tsqr(a, compute_svd=True)
805
~/.conda/envs/spagat_py36/lib/python3.6/site-packages/dask/array/linalg.py in tsqr(data, compute_svd, _max_vchunk_size)
116 "Current shape: {},\nCurrent chunksize: {}".format(
--> 117 data.shape, data.chunksize
118 )
ValueError: Input must have the following properties:
1. Have two dimensions
2. Have only one column of blocks
Note: This function (tsqr) supports QR decomposition in the case of
tall-and-skinny matrices (single column chunk/block; see qr)Current shape: (8760, nan),
Current chunksize: (876, nan)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-17-f60250fedf8b> in <module>
----> 1 pca.analyze()
~/code/tsa_lib/tsa_lib/time_tools.py in f(*args, **kwargs)
8 def f(*args, **kwargs):
9 before = time.perf_counter() # maybe exchange with time.process_time()
---> 10 rv = func(*args, **kwargs)
11 after = time.perf_counter()
12 print('elapsed time for {.__name__}: {:.2f} minutes'.format(func, (after - before)/60))
~/code/playground/playground/PCA.py in analyze(self)
145 print('PCA completed. Weights used.')
146 else:
--> 147 self.eofs, self.pcs, self.solver = eof_analysis(self.data_variability, n_eofs=None, xarray=True)
148 print('PCA completed. No weights used.')
149
~/code/tsa_lib/tsa_lib/time_tools.py in f(*args, **kwargs)
8 def f(*args, **kwargs):
9 before = time.perf_counter() # maybe exchange with time.process_time()
---> 10 rv = func(*args, **kwargs)
11 after = time.perf_counter()
12 print('elapsed time for {.__name__}: {:.2f} minutes'.format(func, (after - before)/60))
~/code/playground/playground/PCA.py in eof_analysis(data, n_eofs, xarray, wgts, lats)
36 solver = xEof(data, weights=wgts)
37 else:
---> 38 solver = xEof(data)
39
40 eofs = solver.eofsAsCovariance(neofs=n_eofs)
~/.conda/envs/spagat_py36/lib/python3.6/site-packages/eofs/xarray.py in __init__(self, array, weights, center, ddof)
131 weights=wtarray,
132 center=center,
--> 133 ddof=ddof)
134 # Name of the input DataArray.
135 self._name = array.name
~/.conda/envs/spagat_py36/lib/python3.6/site-packages/eofs/standard.py in __init__(self, dataset, weights, center, ddof)
175
176 except (np.linalg.LinAlgError, ValueError):
--> 177 raise ValueError('error encountered in SVD, check that missing '
178 'values are in the same places at each time and '
179 'that all the values are not missing')
ValueError: error encountered in SVD, check that missing values are in the same places at each time and that all the values are not missing
将SVD应用于栅格化数据集时,将给出以下提到的错误。是否可能由于单个块可能仅包含NaN值而引发错误?
如果是这样,则可以将其视为dask.distributed的错误,因为在不分块地应用SVD时它可以正常工作。因此,SVD不应仅因为单个块仅包含NaN值而其他块包含有效值而失败吗?