我的目录包含csv文件-> 'data'
我使用chunksize参数将每个csv加载为DataFrame块的迭代器-> inner_it
结果是迭代器列表-> ll
我想将每个inner_it的所有块加载到队列中。
如何以一种干净的方式做到这一点?
我目前正在这样做:
import os
import pandas as pd
def sample_gen(df):
    """Generator that pulls and yields a single chunk from the iterator *df*."""
    chunk = next(df)
    yield chunk
def get_next(df, qq):
    """Drain every chunk from the iterator *df*, appending each one to *qq*.

    Fixes two defects in the original: the ``finally`` clause appended only
    the last chunk (the intent is to collect them all), and on an empty
    iterator ``z`` was never bound, raising UnboundLocalError.

    Args:
        df: iterator of pandas DataFrame chunks (anything with ``.shape``).
        qq: list that collects the chunks; mutated in place.

    Returns:
        qq, so calls can be chained as in the original.
    """
    for z in df:
        print(z.shape)  # progress trace, as in the original
        qq.append(z)
    return qq
# One chunked reader per csv under data/; wrapped in iter() so the readers
# can be consumed one at a time with next().  Fixed the single-argument
# os.path.join(f'data/{x}') (a no-op) to the idiomatic two-argument form.
ll = iter([pd.read_csv(os.path.join('data', x), chunksize=10**6) for x in os.listdir('data')])
# Accumulator for the chunks pulled off the readers.
qq = []
def load_queue(ll, qq):
    """Drain every inner chunk-iterator in *ll* into *qq* via get_next.

    The original recursed inside ``except StopIteration``, but the recursive
    call immediately hit StopIteration again, recursing without bound.
    A plain for loop terminates cleanly when *ll* is exhausted.

    Args:
        ll: iterator of chunked csv readers.
        qq: list collecting the chunks.

    Returns:
        (qq, ll) — the same pair the original returned from ``finally``.
    """
    for inner_it in ll:
        qq = get_next(inner_it, qq)
    return qq, ll
我不知道如何正确地实现load_queue。
编辑: 我决定将迭代器列表放平,并改为使用生成器。这是我下面的最终解决方案:
import os
import threading
import concurrent.futures
import queue
import time
import pandas as pd
def producer(queue, event):
    """Stream every csv under data/ in chunks and push each chunk onto *queue*.

    Flattens the per-file chunk readers into a single chunk stream and puts
    each chunk with back-pressure (put blocks when the queue is full).
    The exit event is set in ``finally`` so the consumer can always shut
    down, even when reading a file raises.

    Args:
        queue: queue.Queue the chunks are pushed onto.
        event: threading.Event signalled when production is finished.
    """
    readers = (pd.read_csv(os.path.join('data', x), chunksize=10 ** 6)
               for x in os.listdir('data'))
    chunks = (chunk for reader in readers for chunk in reader)
    try:
        for message in chunks:
            # The original called queue.put(message, "P"), accidentally
            # passing "P" as put()'s `block` argument; the default
            # block=True is exactly what back-pressure needs.
            queue.put(message)
    except Exception as ex:  # best-effort: log reader failures, then stop
        print(ex)
    finally:
        event.set()
    print('producer got exit event')
def consumer(queue, event):
    """Drain chunks from *queue* until the producer is done AND it is empty.

    Fixes two races in the original: a blocking ``get()`` would hang
    forever if the producer set the event while the consumer was waiting
    on an empty queue, and items still queued when the event was noticed
    were silently dropped.  Polling with a timeout avoids both.

    Args:
        queue: queue.Queue of pandas DataFrame chunks.
        event: threading.Event set by the producer when it has finished.
    """
    from queue import Empty  # the `queue` parameter shadows the module

    while not event.is_set() or not queue.empty():
        try:
            message = queue.get(timeout=0.1)
        except Empty:
            continue  # nothing yet; re-check the exit condition
        print(message.shape, 'C')
    print('consumer got exit event')
if __name__ == '__main__':
    # Shared shutdown signal, set by the producer when all chunks are queued.
    done = threading.Event()
    # Bounded queue: the producer blocks once 10 chunks are waiting,
    # so memory stays capped while the consumer catches up.
    buffer = queue.Queue(maxsize=10)
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
        pool.submit(producer, buffer, done)
        pool.submit(consumer, buffer, done)