Question

我有一个目录 'data'，其中包含若干 csv 文件。我使用 chunksize 参数把每个 csv 读取为一个按块返回数据帧的迭代器（inner_it），这些迭代器组成一个迭代器列表（ll）。我想把每个 inner_it 的所有数据块都放入一个队列中。怎样才能以简洁的方式做到这一点？

我目前正在这样做：

import os

import pandas as pd


def sample_gen(df):
    """Yield only the first item of the iterator *df*.

    Args:
        df: An iterator (it is advanced with ``next``).

    Yields:
        The first item produced by *df*. Nothing is yielded when *df* is
        already exhausted.
    """
    try:
        first = next(df)
    except StopIteration:
        # A StopIteration escaping a generator body is converted into
        # RuntimeError (PEP 479), so end the generator explicitly instead.
        return
    yield first


def get_next(df, qq):
    """Append every chunk produced by the iterator *df* to *qq*.

    The shape of each chunk is printed as it is consumed.

    Args:
        df: Iterable of chunks; each chunk must expose a ``shape`` attribute.
        qq: List-like container with an ``append`` method; modified in place.

    Returns:
        The same *qq* object, now holding all chunks in iteration order.
        It is returned unchanged when *df* yields nothing.
    """
    for chunk in df:
        print(chunk.shape)
        # Store each chunk as it arrives rather than only the last one seen.
        qq.append(chunk)
    return qq


# Outer iterator over one chunked CSV reader per entry of the 'data' directory.
# The path components are passed to os.path.join separately so that it
# actually builds the path (a single pre-formatted argument is a no-op).
ll = iter([pd.read_csv(os.path.join('data', x), chunksize=10**6) for x in os.listdir('data')])
# Plain list used as the queue that receives the loaded chunks.
qq = []


def load_queue(ll, qq):
    """Load every chunk of every inner iterator in *ll* into *qq*.

    The shape of each chunk is printed as it is loaded.

    Args:
        ll: Iterable of inner iterables, each yielding chunks that expose a
            ``shape`` attribute.
        qq: List-like container with an ``append`` method; modified in place.

    Returns:
        A ``(qq, ll)`` tuple: the filled queue and the (now exhausted) outer
        iterable.
    """
    # Iterating with ``for`` ends cleanly when the outer iterator is
    # exhausted, so no recursion or StopIteration handling is needed.
    for inner_it in ll:
        for chunk in inner_it:
            print(chunk.shape)
            qq.append(chunk)
    return qq, ll

我不知道该如何实现 load_queue。

编辑：我决定把迭代器列表展平（flatten），并改用生成器。下面是我的最终解决方案：

import os
import threading
import concurrent.futures
import queue
import time
import pandas as pd


def producer(queue, event):
    """Read every CSV in the 'data' directory in chunks and enqueue the chunks.

    Args:
        queue: Queue-like object; each chunk is handed to its ``put`` method
            (which blocks while a bounded ``queue.Queue`` is full).
        event: Event-like object; it is always set when production stops,
            whether the input was exhausted or an error occurred.
    """
    try:
        # Built inside the ``try`` because ``os.listdir`` runs as soon as the
        # generator expression is created and may raise (e.g. missing folder).
        readers = (
            pd.read_csv(os.path.join('data', name), chunksize=10 ** 6)
            for name in os.listdir('data')
        )
        for reader in readers:
            for chunk in reader:
                queue.put(chunk)
    except Exception as ex:
        # Thread entry point: report the failure, then fall through so the
        # consumer is still told to stop.
        print(ex)
    finally:
        event.set()
    print('producer got exit event')


def consumer(queue, event):
    """Print the shape of every chunk in *queue* until production has ended.

    The loop stops only when *event* is set and the queue is empty, so chunks
    that are still queued when the producer finishes are processed too.

    Args:
        queue: ``queue.Queue``-like object supplying chunks that expose a
            ``shape`` attribute.
        event: Event-like object that the producer sets when it is done.
    """
    # Imported by name because the ``queue`` parameter shadows the module.
    from queue import Empty

    while not (event.is_set() and queue.empty()):
        try:
            # A timeout keeps the loop from blocking forever on an empty
            # queue after the producer has already finished.
            message = queue.get(timeout=0.1)
        except Empty:
            continue
        print(message.shape, 'C')
    print('consumer got exit event')


if __name__ == '__main__':
    # Bounded queue: the producer blocks once 10 chunks are waiting.
    pipeline = queue.Queue(maxsize=10)
    # Set by the producer to tell the consumer that no more chunks will come.
    event = threading.Event()

    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = [
            executor.submit(producer, pipeline, event),
            executor.submit(consumer, pipeline, event),
        ]

    # Leaving the ``with`` block has already waited for both workers;
    # ``result()`` re-raises any exception a worker died with, which would
    # otherwise be silently discarded.
    for future in futures:
        future.result()

从迭代器列表到内部迭代器的数据块队列的python迭代器

0 个答案: