我尝试了以下代码(pd 是 pandas):
for i, chunk in pd.read_excel(os.path.join(INGEST_PATH,file), chunksize=5):
但是我收到了这个错误:
NotImplementedError: chunksize keyword of read_excel is not implemented
我已经搜索过其他方法,但大多数是针对 CSV 文件而不是 xlsx 的。另外,我使用的 pandas 版本是 0.20.1。
如有任何帮助,不胜感激。
答案 0 :(得分:0)
# Load the whole workbook in one go (read_excel rejects the chunksize keyword).
df = pd.read_excel(os.path.join(INGEST_PATH, file))
# Split the index labels into 5 groups; np.array_split accepts lengths that
# are not evenly divisible, so no rows are dropped.
idxes = np.array_split(df.index.values, 5)
# Select each group by label. .loc replaces the deprecated .ix indexer.
chunks = [df.loc[idx] for idx in idxes]
答案 1 :(得分:0)
上面的解决方案对我不适用:文件没有被正确拆分,最后几行被遗漏了。实际上它还报了一个错误,大意是无法进行均等划分。
所以我写了以下代码,它适用于任意大小的文件。
# Path of the CSV file that will be split.
url_1 = r'C:/Users/t3734uk/Downloads/ML-GooGLECRASH/amp_ub/df2.csv'
# NOTE(review): target_folder is assigned here but not referenced by the
# code shown in this file; presumably meant as the output location.
target_folder = r'C:\Users\t3734uk\Downloads\ML-GooGLECRASH\amp_ub'

# Load the complete data set and keep its dimensions as module-level values.
df = pd.read_csv(url_1)
rows = df.shape[0]
columns = df.shape[1]
def calcRowRanges(_no_of_files):
    """Divide the module-level row count into contiguous row ranges.

    Reads the module-level ``rows`` value and prints the computed interval
    size before building the ranges.

    Args:
        _no_of_files: Number of ranges to produce.

    Returns:
        A list with one ``(start, stop)`` tuple per requested range, where
        ``stop`` is exclusive. Both bounds are clamped to ``rows``, so any
        range that would start past the end comes back empty, as
        ``(rows, rows)``, instead of inverted.

    Raises:
        ZeroDivisionError: If ``_no_of_files`` is 0.
    """
    # Ceiling division done in integers: exact for any size and free of
    # float rounding.
    interval_size = -(-rows // _no_of_files)
    print('intrval size is ----> '+ '{}'.format(interval_size))
    row_ranges = []
    for n in range(_no_of_files):
        # Clamp the start as well as the stop; otherwise a trailing range
        # could come out with start > stop (e.g. (6, 5) for 5 rows).
        start = min(n * interval_size, rows)
        stop = min((n + 1) * interval_size, rows)
        row_ranges.append((start, stop))
    return row_ranges
def splitFile(_df_, _row_ranges):
    """Write each row range of a DataFrame to its own Excel workbook.

    Each workbook is written to the relative path ``FILE_<i>_.xlsx``, where
    ``<i>`` is the position of the range within ``_row_ranges``.

    Args:
        _df_: DataFrame whose rows are split.
        _row_ranges: Sequence of ``(start, stop)`` positional row bounds,
            with ``stop`` exclusive.
    """
    # enumerate gives every range its own number. list.index returned the
    # first match, so two equal ranges ended up with the same file name.
    for position, row_range in enumerate(_row_ranges):
        start, stop = row_range[0], row_range[1]
        file_name = 'FILE_' + str(position) + '_' + '.xlsx'
        # Handing the path to to_excel lets pandas create, save and close
        # the workbook. The ExcelWriter used before was never saved or
        # closed, so the contents were not reliably written to disk.
        _df_.iloc[start:stop].to_excel(file_name)
def dosplit(num_files):
    """Compute the row ranges for ``num_files`` files and write them out.

    Prints the list of ranges and then its length, and passes the
    module-level ``df`` together with the ranges to ``splitFile``.

    Args:
        num_files: Number of output files to split the data into.
    """
    ranges = calcRowRanges(num_files)
    # Show the ranges first, then how many there are.
    for item in (ranges, len(ranges)):
        print(item)
    splitFile(df, ranges)
# Runs the split. NOTE(review): enter_no_files_to_be_split_in is not defined
# in the code shown here; it looks like a placeholder to be replaced with the
# desired number of output files.
dosplit(enter_no_files_to_be_split_in)
再想了想,下面这个函数更直观:
def splitFile2(_df_, no_of_splits):
    """Split a DataFrame into ``no_of_splits`` Excel workbooks.

    The row ranges come from ``calcRowRanges``. Each one is written to the
    relative path ``FILE_<i>_.xlsx``, where ``<i>`` is the position of the
    range in the returned list.

    Args:
        _df_: DataFrame whose rows are split.
        no_of_splits: Number of workbooks to produce.
    """
    _row_ranges = calcRowRanges(no_of_splits)
    # enumerate gives every range its own number. list.index returned the
    # first match, so two equal ranges ended up with the same file name.
    for position, row_range in enumerate(_row_ranges):
        start, stop = row_range[0], row_range[1]
        file_name = 'FILE_' + str(position) + '_' + '.xlsx'
        # Handing the path to to_excel lets pandas create, save and close
        # the workbook. The ExcelWriter used before was never saved or
        # closed, so the contents were not reliably written to disk.
        _df_.iloc[start:stop].to_excel(file_name)