Edit: the answer was that the OS was killing off processes because I was consuming all of the memory.
I'm spawning just enough child processes to keep the load average 1:1 with the cores, but at some point within the hour (this script can run for days), 3 of the processes end up like this:
tipu 14804 0.0 0.0 328776 428 pts/1 Sl 00:20 0:00 python run.py
tipu 14808 64.4 24.1 2163796 1848156 pts/1 Rl 00:20 44:41 python run.py
tipu 14809 8.2 0.0 0 0 pts/1 Z 00:20 5:43 [python] <defunct>
tipu 14810 60.3 24.3 2180308 1864664 pts/1 Rl 00:20 41:49 python run.py
tipu 14811 20.2 0.0 0 0 pts/1 Z 00:20 14:04 [python] <defunct>
tipu 14812 22.0 0.0 0 0 pts/1 Z 00:20 15:18 [python] <defunct>
tipu 15358 0.0 0.0 103292 872 pts/1 S+ 01:30 0:00 grep python
I have no idea why this is happening. The master and slave scripts are attached below. I can attach the mysql/pg wrappers if needed. Any suggestions?

slave.py:
from boto.s3.key import Key
import multiprocessing
import gzip
import os
from mysql_wrapper import MySQLWrap
from pgsql_wrapper import PGSQLWrap
import boto
import re


class Slave:
    CHUNKS = 250000
    BUCKET_NAME = "bucket"
    AWS_ACCESS_KEY = ""
    AWS_ACCESS_SECRET = ""
    KEY = Key(boto.connect_s3(AWS_ACCESS_KEY, AWS_ACCESS_SECRET).get_bucket(BUCKET_NAME))
    S3_ROOT = "redshift_data_imports"
    COLUMN_CACHE = {}
    DEFAULT_COLUMN_VALUES = {}

    def __init__(self, job_queue):
        self.log_handler = open("logs/%s" % str(multiprocessing.current_process().name), "a")
        self.mysql = MySQLWrap(self.log_handler)
        self.pg = PGSQLWrap(self.log_handler)
        self.job_queue = job_queue

    def do_work(self):
        self.log(str(os.getpid()))

        while True:
            #sample job in the abstract: mysql_db.table_with_date-iteration
            job = self.job_queue.get()

            #queue is empty
            if job is None:
                self.log_handler.close()
                self.pg.close()
                self.mysql.close()
                print("good bye and good day from %d" % (os.getpid()))
                self.job_queue.task_done()
                break

            #curtail iteration
            table = job.split('-')[0]

            #strip redshift table from job name
            redshift_table = re.sub(r"(_[1-9].*)", "", table.split(".")[1])

            iteration = int(job.split("-")[1])
            offset = (iteration - 1) * self.CHUNKS

            #columns redshift is expecting
            #bad tables will slip through and error out, so we catch it
            try:
                colnames = self.COLUMN_CACHE[redshift_table]
            except KeyError:
                self.job_queue.task_done()
                continue

            #mysql fields to use in SELECT statement
            fields = self.get_fields(table)

            #list subtraction determining which columns redshift has that mysql does not
            delta = (list(set(colnames) - set(fields.keys())))

            #subtract columns that have a default value and so do not need padding
            if delta:
                delta = list(set(delta) - set(self.DEFAULT_COLUMN_VALUES[redshift_table]))

            #concatenate columns with padded \N
            select_fields = ",".join(fields.values()) + (",\\N" * len(delta))

            query = "SELECT %s FROM %s LIMIT %d, %d" % (select_fields, table,
                                                        offset, self.CHUNKS)
            rows = self.mysql.execute(query)

            self.log("%s: %s\n" % (table, len(rows)))

            if not rows:
                self.job_queue.task_done()
                continue

            #if there is more data potentially, add it to the queue
            if len(rows) == self.CHUNKS:
                self.log("putting %s-%s" % (table, (iteration+1)))
                self.job_queue.put("%s-%s" % (table, (iteration+1)))

            #various characters need escaping
            clean_rows = []
            redshift_escape_chars = set(["\\", "|", "\t", "\r", "\n"])
            in_chars = ""

            for row in rows:
                new_row = []
                for value in row:
                    if value is not None:
                        in_chars = str(value)
                    else:
                        in_chars = ""

                    #escape any naughty characters
                    new_row.append("".join(["\\" + c if c in redshift_escape_chars else c for c in in_chars]))
                new_row = "\t".join(new_row)
                clean_rows.append(new_row)

            rows = ",".join(fields.keys() + delta)
            rows += "\n" + "\n".join(clean_rows)

            offset = offset + self.CHUNKS

            filename = "%s-%s.gz" % (table, iteration)

            self.move_file_to_s3(filename, rows)

            self.begin_data_import(job, redshift_table, ",".join(fields.keys() +
                                                                 delta))

            self.job_queue.task_done()

    def move_file_to_s3(self, uri, contents):
        tmp_file = "/dev/shm/%s" % str(os.getpid())
        self.KEY.key = "%s/%s" % (self.S3_ROOT, uri)
        self.log("key is %s" % self.KEY.key)

        f = gzip.open(tmp_file, "wb")
        f.write(contents)
        f.close()

        #local saving allows for debugging when copy commands fail
        #text_file = open("tsv/%s" % uri, "w")
        #text_file.write(contents)
        #text_file.close()

        self.KEY.set_contents_from_filename(tmp_file, replace=True)

    def get_fields(self, table):
        """
        Returns a dict used as:
            {"column_name": "altered_column_name"}
        Currently only the debug column gets altered
        """
        exclude_fields = ["_qproc_id", "_mob_id", "_gw_id", "_batch_id", "Field"]
        query = "show columns from %s" % (table)
        fields = self.mysql.execute(query)

        #key raw field, value mysql formatted field
        new_fields = {}

        #for field in fields:
        for field in [val[0] for val in fields]:
            if field in exclude_fields:
                continue
            old_field = field

            if "debug_mode" == field.strip():
                field = "IFNULL(debug_mode, 0)"

            new_fields[old_field] = field

        return new_fields

    def log(self, text):
        self.log_handler.write("\n%s" % text)

    def begin_data_import(self, table, redshift_table, fields):
        query = "copy %s (%s) from 's3://bucket/redshift_data_imports/%s' \
                 credentials 'aws_access_key_id=%s;aws_secret_access_key=%s' delimiter '\\t' \
                 gzip NULL AS '' COMPUPDATE ON ESCAPE IGNOREHEADER 1;" \
                 % (redshift_table, fields, table, self.AWS_ACCESS_KEY, self.AWS_ACCESS_SECRET)
        self.pg.execute(query)
master.py:

from slave import Slave as Slave
import multiprocessing
from mysql_wrapper import MySQLWrap as MySQLWrap
from pgsql_wrapper import PGSQLWrap as PGSQLWrap


class Master:
    SLAVE_COUNT = 5

    def __init__(self):
        self.mysql = MySQLWrap()
        self.pg = PGSQLWrap()

    def do_work(table):
        pass

    def get_table_listings(self):
        """Gathers a list of MySQL log tables needed to be imported"""
        query = 'show databases'
        result = self.mysql.execute(query)

        #turns list[tuple] into a flat list
        databases = list(sum(result, ()))

        #overriding during development
        databases = ['db1', 'db2', 'db3']

        exclude = ('mysql', 'Database', 'information_schema')
        scannable_tables = []

        for database in databases:
            if database in exclude:
                continue

            query = "show tables from %s" % database
            result = self.mysql.execute(query)

            #turns list[tuple] into a flat list
            tables = list(sum(result, ()))

            for table in tables:
                exclude = ("Tables_in_%s" % database, "(", "201303", "detailed", "ltv")

                #exclude any of the unfavorables
                if any(s in table for s in exclude):
                    continue

                scannable_tables.append("%s.%s-1" % (database, table))

        return scannable_tables

    def init(self):
        #fetch redshift columns once and cache
        #get columns from redshift so we can pad the mysql column delta with nulls
        tables = ('table1', 'table2', 'table3')

        for table in tables:
            #cache columns
            query = "SELECT column_name FROM information_schema.columns WHERE \
                     table_name = '%s'" % (table)
            result = self.pg.execute(query, async=False, ret=True)
            Slave.COLUMN_CACHE[table] = list(sum(result, ()))

            #cache default values
            query = "SELECT column_name FROM information_schema.columns WHERE \
                     table_name = '%s' and column_default is not \
                     null" % (table)
            result = self.pg.execute(query, async=False, ret=True)

            #turns list[tuple] into a flat list
            result = list(sum(result, ()))

            Slave.DEFAULT_COLUMN_VALUES[table] = result

    def run(self):
        self.init()

        job_queue = multiprocessing.JoinableQueue()
        tables = self.get_table_listings()
        for table in tables:
            job_queue.put(table)

        processes = []
        for i in range(Master.SLAVE_COUNT):
            process = multiprocessing.Process(target=slave_runner, args=(job_queue,))
            process.daemon = True
            process.start()
            processes.append(process)

        #blocks this process until queue reaches 0
        job_queue.join()

        #signal each child process to GTFO
        for i in range(Master.SLAVE_COUNT):
            job_queue.put(None)

        #blocks this process until queue reaches 0
        job_queue.join()

        job_queue.close()

        #do not end this process until child processes close out
        for process in processes:
            process.join()

        #toodles !
        print("this is master saying goodbye")


def slave_runner(queue):
    slave = Slave(queue)
    slave.do_work()
Answer (score: 6):
There isn't enough information to be sure, but the problem is most likely that Slave.do_work is raising an unhandled exception. (There are many lines of your code that could do that under various different conditions.)

When that happens, the child process will simply exit.

On POSIX systems… well, the full details are a bit complicated, but in the simple case (what you have here), a child process that exits will stick around as a <defunct> process until it gets reaped (because the parent either waits on it, or exits). Since your parent code doesn't wait on the children until the queue is finished, that's exactly what ends up happening.
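A tiny standalone demo of that reaping behavior (my own illustration, not code from the question; the ps invocation assumes a Linux procps ps):

import multiprocessing
import os
import time

def crash():
    # an unhandled exception makes the child process exit immediately
    raise RuntimeError("boom")

if __name__ == '__main__':
    p = multiprocessing.Process(target=crash)
    p.start()
    time.sleep(1)
    # the dead child shows up with state Z / <defunct> until it is reaped
    os.system("ps -o pid,stat,cmd --ppid %d" % os.getpid())
    p.join()   # reaping it makes the <defunct> entry disappear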
So, there's a simple duct-tape fix:
def do_work(self):
    self.log(str(os.getpid()))
    while True:
        try:
            ...  # the rest of your loop body goes here
        except Exception as e:
            self.log("something appropriate {}".format(e))
            # you may also want to post a reply back to the parent
You probably also want to break that massive try up into separate ones, so you can distinguish between all the different stages where something could go wrong (especially if some of them mean you need to post a reply back, and some mean you don't); a sketch of that follows.
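Something along these lines, for instance (a sketch only; the helper names are hypothetical stand-ins for the MySQL read, the S3 upload, and the Redshift COPY, not functions from your code):

def fetch_rows(job):
    return ["row for %s" % job]            # stand-in for the MySQL SELECT

def upload(job, rows):
    pass                                   # stand-in for gzip + S3 upload

def import_to_redshift(job):
    raise RuntimeError("copy rejected")    # stand-in for the COPY statement

def do_one_job(job, log):
    try:
        rows = fetch_rows(job)
    except Exception as e:
        log("fetch failed for %r: %s" % (job, e))
        return
    try:
        upload(job, rows)
    except Exception as e:
        log("upload failed for %r: %s" % (job, e))
        return
    try:
        import_to_redshift(job)
    except Exception as e:
        log("import failed for %r: %s" % (job, e))
        return

do_one_job("db1.table-1", print)   # logs: import failed for 'db1.table-1': copy rejected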
However, it looks like what you're attempting to do is exactly replicate the behavior of multiprocessing.Pool, but you've missed it in a couple of places. Which raises the question: why not just use Pool in the first place? You could then simplify/optimize things even further by using one of the map family of methods. For example, your entire Master.run could be reduced to:
self.init()
pool = multiprocessing.Pool(Master.SLAVE_COUNT, initializer=slave_setup)
pool.map(slave_job, tables)
pool.close()
pool.join()
This will handle the exceptions for you, let you return values/exceptions later if you ever need them, let you use the built-in logging library instead of building your own, and so on. And it should only take a handful of minor code changes to Slave, and then you're done.
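A rough sketch of what those changes might amount to, i.e. what slave_setup and slave_job from the snippet above could look like (those two names come from that snippet; everything else here, such as a single-table process_one_table method, is a hypothetical refactor of your Slave class, not code you already have):

from slave import Slave

_worker = None

def slave_setup():
    # runs once in each worker process, so the log handle and the MySQL/PG
    # connections are created once per process rather than once per table
    global _worker
    _worker = Slave(job_queue=None)

def slave_job(table):
    # handles exactly one table; exceptions propagate back through pool.map
    return _worker.process_one_table(table)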
If you want to submit new jobs from within jobs, the easiest way is probably a Future-based API (which turns things around, making the future result the focus and the pool/executor the dumb thing that provides them, instead of making the pool the focus and the results the dumb things it hands back), but there are ways to do it with Pool as well. For example, right now you aren't returning anything from each job, so you can just return a list of tables to execute. Here's a simple example that shows how to do that:
import multiprocessing

def foo(x):
    print(x, x**2)
    return list(range(x))

if __name__ == '__main__':
    pool = multiprocessing.Pool(2)
    jobs = [5]
    while jobs:
        jobs, oldjobs = [], jobs
        for job in oldjobs:
            jobs.extend(pool.apply(foo, [job]))
    pool.close()
    pool.join()
Obviously you could condense this by, e.g., replacing the whole loop with a list comprehension fed into itertools.chain (see the snippet below), and you could make it a lot cleaner by passing a "submitter" object to each job and adding to that instead of returning a list of new jobs, and so on. But I wanted to make it as explicit as possible to show how little there is to it.
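For example, the while loop in the example above could shrink to something like this (a sketch; it reuses pool and foo from that example and swaps apply for map so each batch of jobs runs in parallel):

import itertools

jobs = [5]
while jobs:
    # run the current batch, then flatten the returned lists into the next batch
    jobs = list(itertools.chain.from_iterable(pool.map(foo, jobs)))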
Anyway, if you think the explicit queue is easier to understand and manage, go for it. Just look at the source for multiprocessing.worker and/or concurrent.futures.ProcessPoolExecutor to see what you need to do yourself. It's not that hard, but there are enough things you could get wrong (personally, I always forget at least one edge case when I try to do something like this myself) that it's worth looking at code that gets it right.
Alternatively, it seems like the only reason you can't use concurrent.futures.ProcessPoolExecutor here is that you need to initialize some per-process state (the boto.s3.key.Key, MySQLWrap, etc.), presumably for very good caching reasons. (If that involves a web-service query, a database connection, etc., you certainly don't want to do it once per task!) But there are a few different ways around that.
For example, you can subclass ProcessPoolExecutor and override the undocumented method _adjust_process_count (see the source to see how simple it is) to pass in your setup function, and… that's all you have to do.
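A very rough sketch of that idea (treat it as an illustration only: _adjust_process_count is undocumented, and the layout below assumes roughly the Python 3.3–3.6 version of concurrent.futures.process, so it will need adjusting for other interpreter versions):

import multiprocessing
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures.process import _process_worker

def _setup_then_work(setup, call_queue, result_queue):
    setup()                                     # per-process initialization
    _process_worker(call_queue, result_queue)   # hand off to the stock worker loop

class SetupProcessPoolExecutor(ProcessPoolExecutor):
    def __init__(self, max_workers=None, setup=lambda: None):
        super().__init__(max_workers)
        self._setup = setup

    def _adjust_process_count(self):
        # same shape as the stock implementation, but with the setup hook added
        for _ in range(len(self._processes), self._max_workers):
            p = multiprocessing.Process(
                target=_setup_then_work,
                args=(self._setup, self._call_queue, self._result_queue))
            p.start()
            self._processes[p.pid] = p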
Or you can mix and match: wrap a Future from concurrent.futures around an AsyncResult from multiprocessing.
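A minimal sketch of that mix-and-match idea (the submit helper is my own assumption, not an existing API): it hands back a concurrent.futures.Future whose result gets filled in by the callbacks of multiprocessing's apply_async.

import multiprocessing
from concurrent.futures import Future

def submit(pool, fn, *args):
    # wrap pool.apply_async's AsyncResult machinery in a Future
    future = Future()
    future.set_running_or_notify_cancel()
    pool.apply_async(fn, args,
                     callback=future.set_result,
                     error_callback=future.set_exception)
    return future

def square(x):
    return x * x

if __name__ == '__main__':
    pool = multiprocessing.Pool(2)
    f = submit(pool, square, 7)
    print(f.result())   # 49
    pool.close()
    pool.join()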