当我调试脚本时,池会循环遍历每个项目,然后直接关闭而不会继续执行我的函数。
这很奇怪,因为我在另一个运行正常的脚本中使用了几乎相同的结构。
我的第一个脚本运行正常:
# My dataset
# Columns: Absence (0-5), School (5 values), Week (11 values), Grade Level (9 values)
# Additional columns for creating IDs: Student ID, ID
import logging
import pandas as pd
import numpy as np
import random
import multiprocessing as mp
# Queens
Q_ATTENDANCE_RAW = r"D:\Fake Data\Full Q Absences This Year.csv"
Q_ATTENDANCE_OUTPUT = r"D:\Fake Data\Cleaned Q Attendance.csv"
# Brooklyn
K_ATTENDANCE_RAW = r"D:\Fake Data\Full K Absences This Year.csv"
K_ATTENDANCE_OUTPUT = r"D:\Fake Data\Cleaned K Attendance.csv"
# Manhattan
M_ATTENDANCE_RAW = r"D:\Fake Data\Full M Absences This Year.csv"
M_ATTENDANCE_OUTPUT = r"D:\Fake Data\Cleaned M Attendance.csv"
# Staten Island
R_ATTENDANCE_RAW = r"D:\Fake Data\Full R Absences This Year.csv"
R_ATTENDANCE_OUTPUT = r"D:\Fake Data\Cleaned R Attendance.csv"
BORO_DICT = {Q_ATTENDANCE_RAW : Q_ATTENDANCE_OUTPUT,
K_ATTENDANCE_RAW : K_ATTENDANCE_OUTPUT,
M_ATTENDANCE_RAW : M_ATTENDANCE_OUTPUT,
R_ATTENDANCE_RAW : R_ATTENDANCE_OUTPUT}
desired_columns = ['absent', 'School_DBN', 'week', 'grade_LVL']
# [['student_id, ID']]
def process_frame(input_file, output_file):
df = pd.read_csv(input_file,
usecols=desired_columns)
desired_column_names = ['Count of Absences', 'DBN', 'Absence Week', 'Grade']
df.columns = desired_column_names
df.to_csv(output_file,
index=False)
logger.info(f'Done with {input_file}')
def test(i, o):
f = open(o,'w')
f.write(o)
f.close()
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# we have 4 cores, so this is the max we can parallel process
pool = mp.Pool(mp.cpu_count())
# alternately pass in mp.cpu_count()
for k, v in BORO_DICT.items():
# pool.apply_async(test, args=(k, v), callback=collect_results)
pool.apply_async(process_frame, args=(k, v))
# pool.map(test, BORO_DICT.items())
# close pool to let all processes complete
pool.close()
# postpone execution of next line until all queue processes are done
pool.join()
# logger.info(results)
logger.info('Done')
我的第二个脚本不起作用(无错误消息,只是不提供输出):
import logging
import os
import pandas as pd
import torch
import multiprocessing as mp
from dpwgan import CategoricalDataset
from dpwgan.utils import create_categorical_gan
from examples.create_absence_data import BORO_DICT
OUTPUT = r"D:\Fake Data\Fake Q Absences.csv"
THIS_DIR = os.path.dirname(os.path.realpath(__file__))
NOISE_DIM = 20
HIDDEN_DIM = 20
SIGMA = 1
def process(file):
torch.manual_seed(123)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info('Preparing data set...')
try:
df = pd.read_csv(file, dtype=str)
except FileNotFoundError:
print('Error: Data files do not exist.\n'
'Please run `create_absence_data.py` first.')
return
dataset = CategoricalDataset(df)
data = dataset.to_onehot_flat()
gan = create_categorical_gan(NOISE_DIM, HIDDEN_DIM, dataset.dimensions)
logger.info('Training GAN...')
gan.train(data=data,
epochs=10, # should be 50 but I'm impatient
n_critics=5,
learning_rate=1e-4,
weight_clip=1/HIDDEN_DIM,
sigma=SIGMA)
logger.info('Generating synthetic data...')
flat_synthetic_data = gan.generate(len(df))
synthetic_data = dataset.from_onehot_flat(flat_synthetic_data)
if 'K' in file:
OUTPUT = r"D:\Fake Data\Fake K Attendance.csv"
elif 'Q' in file:
OUTPUT = r"D:\Fake Data\Fake Q Attendance.csv"
elif 'R' in file:
OUTPUT = r"D:\Fake Data\Fake R Attendance.csv"
elif 'M' in file:
OUTPUT = r"D:\Fake Data\Fake M Attendance.csv"
synthetic_data.to_csv(OUTPUT, index=False)
logger.info('Synthetic data saved to {}'.format(OUTPUT))
if __name__ == '__main__':
pool = mp.Pool(4)
for k, v in BORO_DICT.items():
pool.apply_async(process, v)
pool.close()
顺便说一下,GAN部分来自github上的civis项目。
答案 0 :(得分:0)
apply_async
无效。我切换到map
可以正常工作,并且根据评论中的建议还添加了上下文管理:
with mp.Pool(processes=4) as pool:
pool = mp.Pool(4)
pool.map(process,BORO_DICT.values())