Python多线程Manager.list(),如何正确访问数据

时间:2014-01-12 22:04:48

标签: python multithreading

我正在构建一个在其自己的进程中启动的类,并以批量大小将数据推送到我的数据库。该类使用Manager.list()来获取数据。我认为这将是一个常见的模式,数据库访问在一个单独的过程中,但我找不到合适的库,所以我想我会自己动手。

我在内部使用threading.Timer来唤醒我的数据库工作者并检查共享队列。然而,当它醒来时,队列中没有任何东西(尽管内容被放在那里)。我错误地使用了Manager.list()吗?

源代码:

import random
from threading import Timer
import threading

from sqlalchemy import *
from multiprocessing import Process, Manager
from util.config import get_connection


def __convert_to_key(connection, table):
    return "{}.{}".format(connection.name, table.name)


class ConnectionWorker():
    __batch_size = 1000
    __batch_insert_queue = None
    __manager = Manager()
    __wait_interval = 5.0
    __finish = False
    __connection = None
    __table = None
    __timer = None
    finished = False

    def __init__(self, connection, table):
        self.__lock = threading.RLock()
        self.__connection = connection
        self.__table = table
        p = Process(target=self.__insert_data)
        p.start()

    def get_batch_insert_queue(self):
        self.__lock.acquire()
        try:
            if self.__batch_insert_queue is None:
                self.__batch_insert_queue = self.__manager.list()
            return self.__batch_insert_queue
        finally:
            self.__lock.release()

    def __insert_data(self):
        print("__insert_data, the queue is {}".format(len(self.get_batch_insert_queue())))
        q = self.get_batch_insert_queue()

        #push everything now if we have been told to finish
        if self.__finish:
            print("__finish flag has been set")
            self.__connection.execute(self.__table.insert().values(q))
            self.finished = True
            return

        #if there is nothing to do then just sleep
        if len(q) == 0:
            print("The queue is empty, sleeping")
            self.__timer = Timer(self.__wait_interval, self.__insert_data)
            self.__timer.start()
            self.__timer.join()

        values_to_insert = []
        while len(q) > 0 and len(values_to_insert) < self.__batch_size:
            values_to_insert.append(q.pop)
        print("Inserting {} values".format(len(values_to_insert)))
        self.__connection.execute(self.__table.insert().values(values_to_insert))

        #don't sleep if the queue has more work to do
        if len(q) >= self.__batch_size:
            print("Not sleeping, there is more work to be done, {} items".format(len(q)))
            self.__insert_data()
        else:
            print("Sleeping")
            self.__timer = Timer(self.__wait_interval, self.__insert_data).start()
            self.__timer.start()
            self.__timer.join()

    def finish(self):
        print("Setting finish to true")
        self.__finish = True

#test data
if __name__ == "__main__":
    #create the db and get metadata
    conn = get_connection()
    query = "DROP TABLE IF EXISTS tmp_test"
    try:
        conn.execute(query)
    except:
        pass
    query = """CREATE TABLE tmp_test (
    value bigint DEFAULT NULL
    ) ENGINE=InnoDB;"""
    conn.execute(query)

    metadata = MetaData()
    metadata.reflect(bind=conn)
    tbl = metadata.tables["tmp_test"]

    c = ConnectionWorker(conn, tbl)
    q = c.get_batch_insert_queue()
    for item in random.sample(xrange(1, 1000000000), 100000):
        q.append(item)
    print("The queue is {}".format(len(q)))
    print("The batch queue is {}".format(len(c.get_batch_insert_queue())))
    import time
    time.sleep(10)
    c.finish()

    while not c.finished:
        time.sleep(1)

运行日志:

__insert_data, the queue is 0
The queue is empty, sleeping
The queue is 100000
The batch queue is 100000
__insert_data, the queue is 0
The queue is empty, sleeping
__insert_data, the queue is 0
The queue is empty, sleeping
Setting finish to true
__insert_data, the queue is 0
The queue is empty, sleeping

第一个队列是空的是有意义的(对象初始化),但接下来的两个看起来应该在它们中有项目。我还不清楚为什么当完成对象设置为True时,工作人员让它超过自我._ 完成检查(我认为它应该打印“ _finish flag已设置”)。

欢迎评论(以及指向默认情况下可能处理所有这些内容的库的指针)。

1 个答案:

答案 0 :(得分:0)

跨进程永远不会隐式共享数据。两个后果:

  1. 主程序中创建的Manager.list()与工作进程中创建的Manager.list()无关;和,

  2. 主程序中的self.__finish属性与工作进程中的self.__finish属性无关。

  3. 你应该真的退缩并尝试更简单的代码,直到这些东西对你更有意义。 “通常”这样做的方法是沿着这些方向(我已经抛弃了所有的类和方法,所以有可能看到这里重要的东西)。请注意,不需要额外的线程,或sleep等等:

    # shared data must be passed, even mp data structures
    def worker(q):
        values_to_insert = []
        while True:
            item = q.get() # no need to sleep - blocks until data is ready
            if item is None:
                break
            values_to_insert.append(item)
            if len(values_to_insert) >= 39: # whatever - your `__batch_size`
                print "processing", values_to_insert
                values_to_insert = []
        # deal with any leftovers
        if values_to_insert:
            print "processing", values_to_insert
    
    if __name__ == "__main__":
        import multiprocessing as mp
        import random
    
        q = mp.Queue(100)  # bounded queue
        proc = mp.Process(target=worker, args=(q,))
        proc.start()
        for item in random.sample(xrange(1, 1000000000), 100000):
            # will block if q has more than 100 items; blocking
            # waits for worker to catch up
            q.put(item)
        q.put(None)  # tell worker we're done
        proc.join()