python多处理池从不以apply_async开头

时间:2019-11-21 21:58:44

标签: python pandas multiprocessing gan

当我调试脚本时,池会循环遍历每个项目,然后直接关闭而不会继续执行我的函数。

这很奇怪,因为我在另一个运行正常的脚本中使用了几乎相同的结构。

我的第一个脚本运行正常:

# My dataset
# Columns: Absence (0-5), School (5 values), Week (11 values), Grade Level (9 values)
# Additional columns for creating IDs: Student ID, ID

import logging
import pandas as pd
import numpy as np
import random
import multiprocessing as mp

# Queens
Q_ATTENDANCE_RAW = r"D:\Fake Data\Full Q Absences This Year.csv"
Q_ATTENDANCE_OUTPUT = r"D:\Fake Data\Cleaned Q Attendance.csv"

# Brooklyn
K_ATTENDANCE_RAW = r"D:\Fake Data\Full K Absences This Year.csv"
K_ATTENDANCE_OUTPUT = r"D:\Fake Data\Cleaned K Attendance.csv"

# Manhattan
M_ATTENDANCE_RAW = r"D:\Fake Data\Full M Absences This Year.csv"
M_ATTENDANCE_OUTPUT = r"D:\Fake Data\Cleaned M Attendance.csv"

# Staten Island
R_ATTENDANCE_RAW = r"D:\Fake Data\Full R Absences This Year.csv"
R_ATTENDANCE_OUTPUT = r"D:\Fake Data\Cleaned R Attendance.csv"

BORO_DICT = {Q_ATTENDANCE_RAW : Q_ATTENDANCE_OUTPUT,
             K_ATTENDANCE_RAW : K_ATTENDANCE_OUTPUT,
             M_ATTENDANCE_RAW : M_ATTENDANCE_OUTPUT,
             R_ATTENDANCE_RAW : R_ATTENDANCE_OUTPUT}

desired_columns = ['absent', 'School_DBN', 'week', 'grade_LVL']
# [['student_id, ID']]

def process_frame(input_file, output_file):
    df = pd.read_csv(input_file,
                     usecols=desired_columns)

    desired_column_names = ['Count of Absences', 'DBN', 'Absence Week', 'Grade']

    df.columns = desired_column_names

    df.to_csv(output_file,
              index=False)

    logger.info(f'Done with {input_file}')

def test(i, o):
        f = open(o,'w')
        f.write(o)
        f.close()

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # we have 4 cores, so this is the max we can parallel process
    pool = mp.Pool(mp.cpu_count())
    # alternately pass in mp.cpu_count()

    for k, v in BORO_DICT.items():
        # pool.apply_async(test, args=(k, v), callback=collect_results)
        pool.apply_async(process_frame, args=(k, v))
    # pool.map(test, BORO_DICT.items())

    # close pool to let all processes complete
    pool.close()

    # postpone execution of next line until all queue processes are done
    pool.join()
    # logger.info(results)
    logger.info('Done')



我的第二个脚本不起作用(无错误消息,只是不提供输出):


import logging
import os

import pandas as pd
import torch
import multiprocessing as mp

from dpwgan import CategoricalDataset
from dpwgan.utils import create_categorical_gan
from examples.create_absence_data import BORO_DICT

OUTPUT = r"D:\Fake Data\Fake Q Absences.csv"

THIS_DIR = os.path.dirname(os.path.realpath(__file__))

NOISE_DIM = 20
HIDDEN_DIM = 20
SIGMA = 1


def process(file):
    torch.manual_seed(123)
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info('Preparing data set...')
    try:
        df = pd.read_csv(file, dtype=str)
    except FileNotFoundError:
        print('Error: Data files do not exist.\n'
              'Please run `create_absence_data.py` first.')
        return
    dataset = CategoricalDataset(df)
    data = dataset.to_onehot_flat()

    gan = create_categorical_gan(NOISE_DIM, HIDDEN_DIM, dataset.dimensions)

    logger.info('Training GAN...')
    gan.train(data=data,
              epochs=10, # should be 50 but I'm impatient
              n_critics=5,
              learning_rate=1e-4,
              weight_clip=1/HIDDEN_DIM,
              sigma=SIGMA)

    logger.info('Generating synthetic data...')
    flat_synthetic_data = gan.generate(len(df))
    synthetic_data = dataset.from_onehot_flat(flat_synthetic_data)

    if 'K' in file:
        OUTPUT = r"D:\Fake Data\Fake K Attendance.csv"
    elif 'Q' in file:
        OUTPUT = r"D:\Fake Data\Fake Q Attendance.csv"
    elif 'R' in file:
        OUTPUT = r"D:\Fake Data\Fake R Attendance.csv"
    elif 'M' in file:
        OUTPUT = r"D:\Fake Data\Fake M Attendance.csv"

    synthetic_data.to_csv(OUTPUT, index=False)

    logger.info('Synthetic data saved to {}'.format(OUTPUT))


if __name__ == '__main__':
    pool = mp.Pool(4)
    for k, v in BORO_DICT.items():
        pool.apply_async(process, v)
    pool.close()

顺便说一下,GAN部分来自github上的civis项目。

1 个答案:

答案 0 :(得分:0)

apply_async无效。我切换到map可以正常工作,并且根据评论中的建议还添加了上下文管理:

    with mp.Pool(processes=4) as pool:
        pool = mp.Pool(4)
        pool.map(process,BORO_DICT.values())