我正在构建一个在其自己的进程中启动的类,并以批量大小将数据推送到我的数据库。该类使用Manager.list()
来获取数据。我认为这将是一个常见的模式,数据库访问在一个单独的过程中,但我找不到合适的库,所以我想我会自己动手。
我在内部使用threading.Timer
来唤醒我的数据库工作者并检查共享队列。然而,当它醒来时,队列中没有任何东西(尽管内容被放在那里)。我错误地使用了Manager.list()
吗?
源代码:
import random
from threading import Timer
import threading
from sqlalchemy import *
from multiprocessing import Process, Manager
from util.config import get_connection
def __convert_to_key(connection, table):
return "{}.{}".format(connection.name, table.name)
class ConnectionWorker():
__batch_size = 1000
__batch_insert_queue = None
__manager = Manager()
__wait_interval = 5.0
__finish = False
__connection = None
__table = None
__timer = None
finished = False
def __init__(self, connection, table):
self.__lock = threading.RLock()
self.__connection = connection
self.__table = table
p = Process(target=self.__insert_data)
p.start()
def get_batch_insert_queue(self):
self.__lock.acquire()
try:
if self.__batch_insert_queue is None:
self.__batch_insert_queue = self.__manager.list()
return self.__batch_insert_queue
finally:
self.__lock.release()
def __insert_data(self):
print("__insert_data, the queue is {}".format(len(self.get_batch_insert_queue())))
q = self.get_batch_insert_queue()
#push everything now if we have been told to finish
if self.__finish:
print("__finish flag has been set")
self.__connection.execute(self.__table.insert().values(q))
self.finished = True
return
#if there is nothing to do then just sleep
if len(q) == 0:
print("The queue is empty, sleeping")
self.__timer = Timer(self.__wait_interval, self.__insert_data)
self.__timer.start()
self.__timer.join()
values_to_insert = []
while len(q) > 0 and len(values_to_insert) < self.__batch_size:
values_to_insert.append(q.pop)
print("Inserting {} values".format(len(values_to_insert)))
self.__connection.execute(self.__table.insert().values(values_to_insert))
#don't sleep if the queue has more work to do
if len(q) >= self.__batch_size:
print("Not sleeping, there is more work to be done, {} items".format(len(q)))
self.__insert_data()
else:
print("Sleeping")
self.__timer = Timer(self.__wait_interval, self.__insert_data).start()
self.__timer.start()
self.__timer.join()
def finish(self):
print("Setting finish to true")
self.__finish = True
#test data
if __name__ == "__main__":
#create the db and get metadata
conn = get_connection()
query = "DROP TABLE IF EXISTS tmp_test"
try:
conn.execute(query)
except:
pass
query = """CREATE TABLE tmp_test (
value bigint DEFAULT NULL
) ENGINE=InnoDB;"""
conn.execute(query)
metadata = MetaData()
metadata.reflect(bind=conn)
tbl = metadata.tables["tmp_test"]
c = ConnectionWorker(conn, tbl)
q = c.get_batch_insert_queue()
for item in random.sample(xrange(1, 1000000000), 100000):
q.append(item)
print("The queue is {}".format(len(q)))
print("The batch queue is {}".format(len(c.get_batch_insert_queue())))
import time
time.sleep(10)
c.finish()
while not c.finished:
time.sleep(1)
运行日志:
__insert_data, the queue is 0
The queue is empty, sleeping
The queue is 100000
The batch queue is 100000
__insert_data, the queue is 0
The queue is empty, sleeping
__insert_data, the queue is 0
The queue is empty, sleeping
Setting finish to true
__insert_data, the queue is 0
The queue is empty, sleeping
第一个队列是空的是有意义的(对象初始化),但接下来的两个看起来应该在它们中有项目。我还不清楚为什么当完成对象设置为True时,工作人员让它超过自我._ 完成检查(我认为它应该打印“ _finish flag已设置”)。
欢迎评论(以及指向默认情况下可能处理所有这些内容的库的指针)。
答案 0 :(得分:0)
跨进程永远不会隐式共享数据。两个后果:
主程序中创建的Manager.list()
与工作进程中创建的Manager.list()
无关;和,
主程序中的self.__finish
属性与工作进程中的self.__finish
属性无关。
你应该真的退缩并尝试更简单的代码,直到这些东西对你更有意义。 “通常”这样做的方法是沿着这些方向(我已经抛弃了所有的类和方法,所以有可能看到这里重要的东西)。请注意,不需要额外的线程,或sleep
等等:
# shared data must be passed, even mp data structures
def worker(q):
values_to_insert = []
while True:
item = q.get() # no need to sleep - blocks until data is ready
if item is None:
break
values_to_insert.append(item)
if len(values_to_insert) >= 39: # whatever - your `__batch_size`
print "processing", values_to_insert
values_to_insert = []
# deal with any leftovers
if values_to_insert:
print "processing", values_to_insert
if __name__ == "__main__":
import multiprocessing as mp
import random
q = mp.Queue(100) # bounded queue
proc = mp.Process(target=worker, args=(q,))
proc.start()
for item in random.sample(xrange(1, 1000000000), 100000):
# will block if q has more than 100 items; blocking
# waits for worker to catch up
q.put(item)
q.put(None) # tell worker we're done
proc.join()