我有一个标准的 socketserver 服务,大致如下:
import time
import socketserver
import threading
import io
class Handler(socketserver.StreamRequestHandler):
    """Per-connection handler that delegates the buffered streams to the cache.

    Relies on a module-level ``cache`` object (a ``Cache`` instance) being
    created before the server starts accepting connections.
    """

    def handle(self):
        # Hand the buffered reader/writer pair over to the shared cache,
        # which implements the actual request/response protocol.
        reader, writer = self.rfile, self.wfile
        return cache.handle(reader, writer)
class Cache:
    """In-memory key/value store rebuilt periodically by a background Runner thread."""

    def __init__(self):
        # Kick off the background reload loop (default interval: one day).
        self._runner = Runner(self.reload)
        self._runner.start()
        # Served empty until the first reload completes.
        self.cache = {}

    def reload(self):
        # very long process that takes up to 90 minutes and 100 GB of RAM
        # involves calls to pyodbc and some python processing
        # here is a dummy process
        cache = {}
        for i in range(90):
            cache[str(i).encode()] = bytes(10**9)
            time.sleep(60)
        # Rebind in one step so readers see either the old dict or the new
        # one, never a half-built mapping.
        self.cache = cache

    @staticmethod
    def _send_bytes(wfile: io.BufferedIOBase, msg: bytes) -> None:
        # Length-prefixed framing: 4-byte big-endian length, then the payload.
        wfile.write(len(msg).to_bytes(4, "big"))
        wfile.write(msg)

    def handle(self, rfile: io.BufferedIOBase, wfile: io.BufferedIOBase) -> None:
        """Read one newline-terminated key from *rfile*, reply with its cached value.

        Fixes vs. original: the parameter was spelled ``rflie`` while the body
        read ``rfile`` (NameError on every request), and ``readline()``'s
        trailing newline was never stripped, so it could never match the
        newline-free keys stored by ``reload``.
        """
        request = rfile.readline().rstrip(b"\n")
        # Unknown keys get an empty payload rather than an error.
        response = self.cache.get(request, b'')
        self._send_bytes(wfile, response)
class Runner(threading.Thread):
    """Thread that invokes *proc* every *timer* seconds (1 day by default).

    The first call happens only after a full interval has elapsed; setting
    ``event`` stops the loop (including before the first call).
    """

    def __init__(self, proc, timer=24 * 60 * 60):
        super().__init__(target=self._target)
        self.proc = proc
        self.timer = timer
        self.event = threading.Event()

    def _target(self):
        # Event.wait(timeout) returns False when the timeout expires (time to
        # run proc again) and True once the event is set (time to stop).
        while True:
            if self.event.wait(self.timer):
                break
            self.proc()
if __name__ == '__main__':
    # Build the cache first (this also starts the daily background reload
    # thread), then serve length-prefixed lookups on all interfaces.
    cache = Cache()
    server_address = ("0.0.0.0", 48888)
    with socketserver.TCPServer(server_address, Handler) as server:
        # NOTE(review): a timeout on the listening socket interacts oddly
        # with serve_forever's own polling — presumably intended to bound
        # per-connection blocking; confirm it has the desired effect.
        server.socket.settimeout(30)
        server.serve_forever()
这就是问题所在:每当 reload 在它自己的线程中运行时,服务器对请求的响应就会变得异常缓慢(要等上几分钟)。实际上它会变得几乎没有响应(客户端会超时),整个系统进入一种状态:每个请求的响应都慢到积压的客户端请求永远得不到回答。服务器实际上停止了处理请求(但加载器仍然每 24 小时运行一次)。
我对 Python 的理解是,即使另一个加载线程很耗时,它也不应该像这样一直持有 GIL,但这种情况似乎正在发生。
编辑:
下面(大致)就是上面 reload 方法所调用的代码:
import collections
from typing import List, Any, Dict
import pyodbc
CONN_STR = "foo"
QUERY_STR = "bar"
def load_all() -> Dict[bytes, List[ValueObject]]:
    """Fetch all rows from the database and group built ValueObjects.

    Rows are deduplicated by ``ValueObject.key`` (later rows win, as in the
    original dict build), then bucketed by ``grouper_func(key)``.
    """
    # this step takes ~10 minutes
    # it connects to a sql server db using pyodbc
    conn: pyodbc.Connection
    with pyodbc.connect(CONN_STR) as conn:
        cursor: pyodbc.Cursor = conn.execute(QUERY_STR)
        rows: List[Any] = cursor.fetchall()
    # this step occurs entirely in pure python (a small amount gets delegated to pandas)
    # it is almost entirely building objects/dicts/lists using information from the rows
    # it makes no external calls or performs any IO (other than logging)
    # it takes ~ 40 minutes normally
    by_key = {obj.key: obj for obj in map(ValueObject.build, rows)}
    results = collections.defaultdict(list)
    for key, obj in by_key.items():
        results[grouper_func(key)].append(obj)
    return results