如何在 luigi 中自动实例化上游任务的输出?

时间:2016-06-03 07:11:51

标签: python luigi

在 luigi.Task.run 中,我们需要将对象序列化到文件、数据库等目标中:

class MyTask(luigi.Task):
    """Read AnotherTask's CSV output, process it, and write the result as CSV."""

    param = luigi.Parameter()

    def requires(self):
        """Return the upstream task whose output run() reads."""
        # The dependency has to be returned; without `return` this method
        # would yield None instead of the AnotherTask instance.
        return AnotherTask(self.param)

    def output(self):
        """Return the local file target this task writes to."""
        # The target has to be returned so self.output() in run() is usable.
        return luigi.LocalTarget('out_{}'.format(self.param))

    def run(self):
        """Deserialize the input CSV, process it, and serialize the result."""
        with self.input().open('r') as infile:
            # instantiate incoming data
            # (`...` is a placeholder for the date columns to parse)
            indata = pd.read_csv(infile, index_col=False, parse_dates=...)
        # my process (placeholder for the actual processing step)
        outdata = indata.someprocess()
        with self.output().open('w') as outfile:
            # serialize outgoing data (further to_csv options may be added)
            outdata.to_csv(outfile, index=False)

但为了方便起见,我想省去 pd.read_csv(...) 这段代码,因为每次复用某个任务的输出时,我都必须重复编写相同的实例化步骤。

luigi 中有没有类似下面这样的自动实例化方式?

class AnotherTask(luigi.Task):
    """Upstream task that can load its own output back into memory."""

    param = luigi.Parameter()

    def requires(self):
        ...

    def output(self):
        ...

    def _instantiate(self):
        """Read this task's output as CSV and return the parsed data."""
        with self.output().open('r') as outfile:
            # `...` is a placeholder for the date columns to parse
            outdata = pd.read_csv(outfile, index_col=False, parse_dates=...)
        return outdata

class MyTask(luigi.Task):
    """Wished-for variant: the upstream data arrives already instantiated."""

    param = luigi.Parameter()

    def requires(self):
        """Return the upstream task whose data run() consumes."""
        # The dependency has to be returned; without `return` this method
        # would yield None instead of the AnotherTask instance.
        return AnotherTask(self.param)

    def output(self):
        """Return the local file target this task writes to."""
        # The target has to be returned so self.output() in run() is usable.
        return luigi.LocalTarget('out_{}'.format(self.param))

    def run(self):
        """Process the already-loaded input and serialize the result as CSV."""
        # automatic instantiation via AnotherTask._instantiate()
        # (the behavior being asked for: self.input() hands back the data)
        indata = self.input()
        # my process
        outdata = indata.someprocess()
        with self.output().open('w') as outfile:
            # serialize outgoing data (further to_csv options may be added)
            outdata.to_csv(outfile, index=False)

1 个答案:

答案 0 :(得分:0)

自我回答:

def getinstances(struct):
    """Recursively replace every task in *struct* with its instantiated data.

    The shape of *struct* is mirrored: a task becomes the result of its
    ``instantiate()`` method, a dict keeps its keys with each value
    converted, and any other iterable becomes a list of converted items.

    Args:
        struct: A ``luigi.Task``, a dict, or an iterable of these (nested
            arbitrarily), e.g. the return value of ``requires()``.

    Returns:
        The instantiated object, or a dict / list of instantiated objects.

    Raises:
        Exception: If *struct* (or a nested element) is a string, or is
            neither a task, a dict, nor an iterable.
    """
    if isinstance(struct, luigi.Task):
        return struct.instantiate()
    if isinstance(struct, dict):
        return {k: getinstances(v) for k, v in six.iteritems(struct)}
    # Strings are iterable, but iterating a one-character string yields that
    # same string again, so recursing into them would never terminate.
    if isinstance(struct, six.string_types):
        raise Exception('Cannot map %s to Task/dict/list' % str(struct))
    # Remaining case: assume struct is iterable...
    try:
        items = list(struct)
    except TypeError:
        raise Exception('Cannot map %s to Task/dict/list' % str(struct))
    return [getinstances(item) for item in items]

class MyParentTask(luigi.Task):
    """Upstream task whose output can be loaded back via instantiate()."""

    def requires(self):
        ...

    def output(self):
        ...

    def run(self):
        ...

    def instantiate(self):
        """Parse this task's output as CSV and return all rows as a list."""
        with self.output().open() as csv_file:
            rows = list(csv.reader(csv_file))
        return rows

class MyChildTask(luigi.Task):
    """Downstream task that loads its dependency's data via getinstances()."""

    def requires(self):
        """Return the single upstream task this task depends on."""
        upstream = MyParentTask()
        return upstream

    def output(self):
        ...

    def run(self):
        """Turn the required task(s) into in-memory data, then process it."""
        required = self.requires()
        indata = getinstances(required)
        ...