我在使用 dask 处理一个较大数据集中相对较小的子集时收到以下错误消息。最初是在运行 dask 的 GridSearchCV 实现时出现的；即使我尝试先把数据帧写入磁盘来绕过它,错误仍然出现。下面先给出错误消息,然后是我尝试运行的代码。
Traceback (most recent call last):
文件“C:\ Program Files(x86)\ JetBrains \ PyCharm 2017.1 \ helpers \ pydev \ pydevd.py”,第1578行,
globals = debugger.run(setup ['file'],None,None,is_module)
文件“C:\ Program Files(x86)\ JetBrains \ PyCharm 2017.1 \ helpers \ pydev \ pydevd.py”,第1015行,在运行中
pydev_imports.execfile(文件,全局,本地)#执行脚本
文件“C:/ Users / Evan / Git / Data_Analysis_for_Exp / SEER Data / SEER_ML.py”,第73行,
data.to_csv( 'data_frame _ *。CSV')
文件“C:\ Anaconda2 \ lib \ site-packages \ dask \ dataframe \ core.py”,第957行,在to_csv中
返回to_csv(self,filename,** kwargs)
文件“C:\ Anaconda2 \ lib \ site-packages \ dask \ _ dataframe \ io \ csv.py”,第445行,在to_csv中
延迟(值).compute(GET = GET)
文件“C:\ Anaconda2 \ lib \ site-packages \ dask \ base.py”,第95行,计算中
(result,)= compute(self,traverse = False,** kwargs)
文件“C:\ Anaconda2 \ lib \ site-packages \ dask \ base.py”,第202行,计算中
results = get(dsk,keys,** kwargs)
文件“C:\ Anaconda2 \ lib \ site-packages \ dask \ threaded.py”,第76行,获取 ** kwargs)
get_async中的文件“C:\ Anaconda2 \ lib \ site-packages \ dask \ async.py”,第500行 raise(remote_exception(res,tb))
dask.async.MemoryError:
回溯
文件“C:\ Anaconda2 \ lib \ site-packages \ dask \ async.py”,第266行,在execute_task中
result = _execute_task(任务,数据)
文件“C:\ Anaconda2 \ lib \ site-packages \ dask \ async.py”,第247行,在_execute_task中
return func(* args2)
文件“C:\ Anaconda2 \ lib \ site-packages \ dask \ _ dataframe \ io \ csv.py”,第55行,pandas_read_text
df =读者(生物,** kwargs)
文件“C:\ Anaconda2 \ lib \ site-packages \ pandas \ io \ parsers.py”,第646行,在parser_f中
return _read(filepath_or_buffer,kwds)
文件“C:\ Anaconda2 \ lib \ site-packages \ pandas \ io \ parsers.py”,第401行,在_read
data = parser.read()
文件“C:\ Anaconda2 \ lib \ site-packages \ pandas \ io \ parsers.py”,第957行,阅读
df = DataFrame(col_dict,columns = columns,index = index)
文件“C:\ Anaconda2 \ lib \ site-packages \ pandas \ core \ frame.py”,第266行, init
mgr = self._init_dict(数据,索引,列,dtype = dtype)
文件“C:\ Anaconda2 \ lib \ site-packages \ pandas \ core \ frame.py”,第402行,在_init_dict
return _arrays_to_mgr(arrays,data_names,index,columns,dtype = dtype)
文件“C:\ Anaconda2 \ lib \ site-packages \ pandas \ core \ frame.py”,第5408行,在_arrays_to_mgr
返回create_block_manager_from_arrays(arrays,arr_names,axes)
文件“C:\ Anaconda2 \ lib \ site-packages \ pandas \ core \ internals.py”,第4262行,在create_block_manager_from_arrays中
blocks = form_blocks(数组,名称,轴)
文件“C:\ Anaconda2 \ lib \ site-packages \ pandas \ core \ internals.py”,第4331行,form_blocks
float_blocks = _multi_blockify(float_items)
文件“C:\ Anaconda2 \ lib \ site-packages \ pandas \ core \ internals.py”,第4408行,_multi_blockify
values,placement = _stack_arrays(list(tup_block),dtype)
文件“C:\ Anaconda2 \ lib \ site-packages \ pandas \ core \ internals.py”,第4451行,_stack_arrays
stacked = np.empty(shape,dtype = dtype)
# Lazily read the large CSV with dask, forcing known-numeric columns to float
# up front so partitions don't get mixed/object dtypes.
data = dataframe.read_csv("Data_Set_for_ML.csv",
                          dtype={'colx': float, 'coly': float, 'colz': float})
targets = data['target_col']
# Always drop the target itself plus identifier / leakage columns.
drop_these_too = ['target_col', "related_col1", "PID", "related_col2"]
drops = [key for key in list(list_of_features) if key not in
         list(selected_features) and key not in drop_these_too]
data = data.drop(drops, axis=1)
# BUG FIX: astype() and fillna() return NEW dataframes — the original code
# discarded their results, so the float casts and the NaN fill never took
# effect (object-dtype columns then blow up memory when pandas stacks the
# blocks, which matches the MemoryError above). Assign the results back;
# the per-column loop collapses to one whole-frame astype.
data = data.astype(float)
data = data.fillna(-9)
# dask writes one CSV per partition; '*' is replaced by the partition number.
data.to_csv('data_frame_*.csv')