我是dask的新手。我正在尝试合并两个数据帧(一个适合大熊猫数据帧大小的小文件,但为方便起见,我将它用作dask数据帧,另一个非常大)。我尝试将结果保存在csv文件中,因为我知道它可能不适合数据帧。
import pandas as pd
import dask.dataframe as dd
AF=dd.read_csv("../data/AuthorFieldOfStudy.csv")
AF.columns=['AID','FID']
#extract subset of Authors to reduce final merge size
AF = AF.loc[AF['FID'] == '0271BC14']
#This is a large file 9 MB
PAA=dd.read_csv("../data/PAA.csv")
PAA.columns=['PID','AID', 'AffID']
result = dd.merge(AF,PAA, on='AID')
result.to_csv("../data/CompSciPaperAuthorAffiliations.csv").compute()
我收到以下错误,并不太明白:
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-1-6b2f889f44ff> in <module>()
14 result = dd.merge(AF,PAA, on='AID')
15
---> 16 result.to_csv("../data/CompSciPaperAuthorAffiliations.csv").compute()
/usr/local/lib/python2.7/dist-packages/dask/dataframe/core.pyc in to_csv(self, filename, **kwargs)
936 """ See dd.to_csv docstring for more information """
937 from .io import to_csv
--> 938 return to_csv(self, filename, **kwargs)
939
940 def to_delayed(self):
/usr/local/lib/python2.7/dist-packages/dask/dataframe/io/csv.pyc in to_csv(df, filename, name_function, compression, compute, get, **kwargs)
411 if compute:
412 from dask import compute
--> 413 compute(*values, get=get)
414 else:
415 return values
/usr/local/lib/python2.7/dist-packages/dask/base.pyc in compute(*args, **kwargs)
177 dsk = merge(var.dask for var in variables)
178 keys = [var._keys() for var in variables]
--> 179 results = get(dsk, keys, **kwargs)
180
181 results_iter = iter(results)
/usr/local/lib/python2.7/dist-packages/dask/threaded.pyc in get(dsk, result, cache, num_workers, **kwargs)
74 results = get_async(pool.apply_async, len(pool._pool), dsk, result,
75 cache=cache, get_id=_thread_get_id,
---> 76 **kwargs)
77
78 # Cleanup pools associated to dead threads
/usr/local/lib/python2.7/dist-packages/dask/async.pyc in get_async(apply_async, num_workers, dsk, result, cache, get_id, raise_on_exception, rerun_exceptions_locally, callbacks, dumps, loads, **kwargs)
491 _execute_task(task, data) # Re-execute locally
492 else:
--> 493 raise(remote_exception(res, tb))
494 state['cache'][key] = res
495 finish_task(dsk, key, state, results, keyorder.get)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 14: ordinal not in range(128)
Traceback
---------
File "/usr/local/lib/python2.7/dist-packages/dask/async.py", line 268, in execute_task
result = _execute_task(task, data)
File "/usr/local/lib/python2.7/dist-packages/dask/async.py", line 249, in _execute_task
return func(*args2)
File "/usr/local/lib/python2.7/dist-packages/dask/dataframe/shuffle.py", line 329, in collect
res = p.get(part)
File "/usr/local/lib/python2.7/dist-packages/partd/core.py", line 73, in get
return self.get([keys], **kwargs)[0]
File "/usr/local/lib/python2.7/dist-packages/partd/core.py", line 79, in get
return self._get(keys, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/partd/encode.py", line 30, in _get
for chunk in raw]
File "/usr/local/lib/python2.7/dist-packages/partd/pandas.py", line 144, in deserialize
for block, dt, shape in zip(b_blocks, dtypes, shapes)]
File "/usr/local/lib/python2.7/dist-packages/partd/numpy.py", line 127, in deserialize
l = decode(l)
File "/usr/local/lib/python2.7/dist-packages/partd/numpy.py", line 114, in decode
return list(map(decode, o))
File "/usr/local/lib/python2.7/dist-packages/partd/numpy.py", line 110, in decode
return [item.decode() for item in o]