I am trying to do the following in Python:
for n in range(1, 100):
    routedat = xr.open_dataset(route_files[n])
    lsmdat = xr.open_dataset(lsm_files[n])
    routedat = reformat_LIS_output(routedat)
    lsmdat = reformat_LIS_output(lsmdat)
    for i in range(1, len(stations)):
        start_date = stations[i]['Streamflow (cumecs)'].first_valid_index()
        lis_date = routedat['time'][0].values
        gauge_id = valid_stations[i]
        gauge_lat = meta_file.loc[gauge_id, 'Latitude']
        gauge_lon = meta_file.loc[gauge_id, 'Longitude']
        if start_date >= lis_date:
            route_sel = routedat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
            lsm_sel = lsmdat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
            stations[i].loc[lis_date, 'Precip'] = lsm_sel['TotalPrecip_tavg']
            stations[i].loc[lis_date, 'Evap'] = lsm_sel['Evap_tavg']
            stations[i].loc[lis_date, 'SoilMoist_L1'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=0)
            stations[i].loc[lis_date, 'SoilMoist_L2'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=1)
            stations[i].loc[lis_date, 'SoilMoist_L3'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=2)
            stations[i].loc[lis_date, 'SoilMoist_L4'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=3)
            stations[i].loc[lis_date, 'FloodFraction'] = route_sel['FloodedFrac_tavg']
            stations[i].loc[lis_date, 'RiverDepth'] = route_sel['RiverDepth_tavg']
            stations[i].loc[lis_date, 'SWS'] = route_sel['SWS_tavg']
            stations[i].loc[lis_date, 'Streamflow'] = route_sel['Streamflow_tavg']
I have to extract information from lsmdat and routedat using latitude and longitude for each of the 300 dataframes stored in the stations list. Currently, the code takes 8 minutes to write the information into the dataframes in stations.

Can someone suggest how I could use Dask to parallelize the second part of the code above? Thanks.
EDIT:
Following the comments, I tried to implement dask.delayed(). I wrote the following function:
def build_data(stations, routedat, lsmdat, valid_stations, meta_file):
    start_date = stations[i]['Streamflow (cumecs)'].first_valid_index()
    lis_date = routedat['time'][0].values
    gauge_id = valid_stations[i]
    gauge_lat = meta_file.loc[gauge_id, 'Latitude']
    gauge_lon = meta_file.loc[gauge_id, 'Longitude']
    route_sel = routedat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
    lsm_sel = lsmdat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
    stations[i].loc[lis_date, 'Precip'] = lsm_sel['TotalPrecip_tavg']
    stations[i].loc[lis_date, 'Evap'] = lsm_sel['Evap_tavg']
    stations[i].loc[lis_date, 'SoilMoist_L1'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=0)
    stations[i].loc[lis_date, 'SoilMoist_L2'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=1)
    stations[i].loc[lis_date, 'SoilMoist_L3'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=2)
    stations[i].loc[lis_date, 'SoilMoist_L4'] = lsm_sel['SoilMoist_tavg'].sel(SoilMoist_profiles=3)
    stations[i].loc[lis_date, 'FloodFraction'] = route_sel['FloodedFrac_tavg']
    stations[i].loc[lis_date, 'RiverDepth'] = route_sel['RiverDepth_tavg']
    stations[i].loc[lis_date, 'SWS'] = route_sel['SWS_tavg']
    stations[i].loc[lis_date, 'Streamflow'] = route_sel['Streamflow_tavg']
    return
If I run this function inside a loop that opens the xarray datasets, it works fine and the data is written correctly into the elements of stations:
for n in tqdm(range(4112, 4115)):
    routedat = xr.open_dataset(route_files[n])
    lsmdat = xr.open_dataset(lsm_files[n])
    routedat = reformat_LIS_output(routedat)
    lsmdat = reformat_LIS_output(lsmdat)
    for i in range(0, 10):
        build_data(stations, routedat, lsmdat, valid_stations, meta_file)
But when I try to parallelize it with dask.delayed(), nothing is written into the elements of stations and it takes even more time to finish:
for n in tqdm(range(4112, 4115)):
    routedat = xr.open_dataset(route_files[n])
    lsmdat = xr.open_dataset(lsm_files[n])
    routedat = reformat_LIS_output(routedat)
    lsmdat = reformat_LIS_output(lsmdat)
    build_delayed = []
    for i in range(0, 10):
        task = dask.delayed(build_data)(stations, routedat, lsmdat, valid_stations, meta_file)
        build_delayed.append(task)
    dask.compute(*build_delayed)
Also, I was able to parallelize a loop earlier in this script on the same local cluster.
Can anyone help me figure out where I am going wrong?
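For reference, this is the kind of return-based refactor I have been considering, based on my understanding that in-place updates made inside a delayed function may happen on a worker's copy of stations and never reach the client. build_row is just a hypothetical helper name, the column list is abbreviated, and I am assuming the nearest-point selection yields a single value per variable (as in the assignments above); this is an untested sketch, not working code:

import dask

def build_row(routedat, lsmdat, gauge_lat, gauge_lon):
    # Hypothetical helper: select the nearest grid cell and return plain
    # values instead of mutating the shared stations list in place.
    route_sel = routedat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
    lsm_sel = lsmdat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
    return {
        'Precip': float(lsm_sel['TotalPrecip_tavg']),       # assumes a scalar after selection
        'Evap': float(lsm_sel['Evap_tavg']),
        'Streamflow': float(route_sel['Streamflow_tavg']),
        # ... remaining columns as in build_data above
    }

build_delayed = []
for i in range(0, 10):
    gauge_id = valid_stations[i]
    gauge_lat = meta_file.loc[gauge_id, 'Latitude']
    gauge_lon = meta_file.loc[gauge_id, 'Longitude']
    build_delayed.append(dask.delayed(build_row)(routedat, lsmdat, gauge_lat, gauge_lon))

# Results come back to the client, which writes them into the dataframes.
results = dask.compute(*build_delayed)
lis_date = routedat['time'][0].values
for i, row in enumerate(results):
    for col, val in row.items():
        stations[i].loc[lis_date, col] = val

I am also unsure whether passing the full routedat and lsmdat datasets into every task is what makes the delayed version slower than the plain loop.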
EDIT2: I also tried assigning the selections to separate, per-station variables while each dask worker executes the function, but the output is still nil:
start_date = [None] * len(stations)
gauge_id = [None] * len(stations)
gauge_lon = [None] * len(stations)
gauge_lat = [None] * len(stations)
route_sel = [None] * len(stations)
lsm_sel = [None] * len(stations)

def build_data(stations, routedat, lsmdat, valid_stations, meta_file, i):
    start_date[i] = stations[i]['Streamflow (cumecs)'].first_valid_index()
    lis_date = routedat['time'][0].values
    gauge_id[i] = valid_stations[i]
    gauge_lat[i] = meta_file.loc[gauge_id[i], 'Latitude']
    gauge_lon[i] = meta_file.loc[gauge_id[i], 'Longitude']
    route_sel[i] = routedat.sel(lat=gauge_lat[i], lon=gauge_lon[i], method='nearest')
    lsm_sel[i] = lsmdat.sel(lat=gauge_lat[i], lon=gauge_lon[i], method='nearest')
    if stations[i].loc[lis_date, 'Streamflow (cumecs)'] != math.nan:
        stations[i].loc[lis_date, 'Precip'] = lsm_sel[i]['TotalPrecip_tavg']
        stations[i].loc[lis_date, 'Evap'] = lsm_sel[i]['Evap_tavg']
        stations[i].loc[lis_date, 'SoilMoist_L1'] = lsm_sel[i]['SoilMoist_tavg'].sel(SoilMoist_profiles=0)
        stations[i].loc[lis_date, 'SoilMoist_L2'] = lsm_sel[i]['SoilMoist_tavg'].sel(SoilMoist_profiles=1)
        stations[i].loc[lis_date, 'SoilMoist_L3'] = lsm_sel[i]['SoilMoist_tavg'].sel(SoilMoist_profiles=2)
        stations[i].loc[lis_date, 'SoilMoist_L4'] = lsm_sel[i]['SoilMoist_tavg'].sel(SoilMoist_profiles=3)
        stations[i].loc[lis_date, 'FloodFraction'] = route_sel[i]['FloodedFrac_tavg']
        stations[i].loc[lis_date, 'RiverDepth'] = route_sel[i]['RiverDepth_tavg']
        stations[i].loc[lis_date, 'SWS'] = route_sel[i]['SWS_tavg']
        stations[i].loc[lis_date, 'Streamflow'] = route_sel[i]['Streamflow_tavg']
    return
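For completeness, the call site for this EDIT2 version mirrors the delayed loop above, now passing i as an explicit argument (sketched here, since I did not paste that part of the script):

for n in tqdm(range(4112, 4115)):
    routedat = xr.open_dataset(route_files[n])
    lsmdat = xr.open_dataset(lsm_files[n])
    routedat = reformat_LIS_output(routedat)
    lsmdat = reformat_LIS_output(lsmdat)
    build_delayed = []
    for i in range(0, 10):
        # Same call as before, but with the station index passed explicitly.
        task = dask.delayed(build_data)(stations, routedat, lsmdat, valid_stations, meta_file, i)
        build_delayed.append(task)
    dask.compute(*build_delayed)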