Tensorflow pandas_input_fn slow, starving the CPU/GPU

Asked: 2017-09-27 21:26:25

Tags: python pandas numpy tensorflow tensorflow-gpu

I'm developing a wide and deep model based on the framework in the TensorFlow Wide and Deep tutorial (https://www.tensorflow.org/tutorials/wide_and_deep). Built the old way (load the entire dataset from pandas, convert it to tensors, feed them in through the input_fn), the model works fine and runs on the CPU. To run it on the GPU, however, the dataset is too large to fit in GPU memory, so it has to be batched.
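
For context, the old approach looked roughly like this (a minimal sketch, not my exact code): every feature becomes one big constant tensor built from the whole DataFrame, so nothing is ever batched and the entire dataset has to fit in memory at once.

import tensorflow as tf

def whole_dataset_input_fn(df, feature_cols, label_col):
    # Sketch of the unbatched approach: each column becomes a single
    # constant tensor, so the full dataset must fit in (GPU) memory.
    features = {k: tf.constant(df[k].values) for k in feature_cols}
    labels = tf.constant(df[label_col].values)
    return features, labels

# Used as: clf.train(input_fn=lambda: whole_dataset_input_fn(df, ALL_COLUMNS, LABEL_COLUMN))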

I tried using pandas_input_fn to batch the data to the video card, and noticed a spike of activity while the next batch is prepared, followed by a long lull. Oddly, this happens even when I run it on a CPU-only machine. The lulls are almost exactly the same length every time, so it's not simply that the video card is tearing through a simple model faster than the processor can feed it. It seems like it always waits to start loading the next batch until the last one has finished training. I increased the model's complexity to make sure it wasn't just too cheap to compute, and the same problem persists. I've tried increasing the number of threads allocated to pandas_input_fn, and I've tried raising the queue capacity to values far larger than is reasonable (10x the dataset size), which helped a little, but not much. I'm not sure whether the slowdown is on the enqueue or the dequeue side, and after a week of troubleshooting I haven't been able to solve it.
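
For reference, a tf.data-based pipeline (I'm assuming the tf.data API from TF 1.4 here; in 1.3 it lives under tf.contrib.data) with prefetch is supposed to overlap preparing the next batch with training on the current one, which sounds like exactly what pandas_input_fn is failing to do for me. A minimal sketch:

import tensorflow as tf

def dataset_input_fn(df, feature_cols, label_col, batch_size=100000):
    # Slice the in-memory arrays into a Dataset of (features, label) rows.
    features = {k: df[k].values for k in feature_cols}
    ds = tf.data.Dataset.from_tensor_slices((features, df[label_col].values))
    ds = ds.repeat().batch(batch_size)
    # prefetch(1) keeps one batch staged ahead of time, so the input
    # pipeline fills the next batch while the current one is training.
    ds = ds.prefetch(1)
    return ds.make_one_shot_iterator().get_next()

# Used as: clf.train(input_fn=lambda: dataset_input_fn(df, ALL_COLUMNS, LABEL_COLUMN), max_steps=20)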

The data I'm working with is 117 columns by 400k rows. I've put together a generic script that generates fake values to simulate the problem. The fake data has far fewer columns than the real thing, so the gaps between steps won't be as long, but they're still apparent. The code is below:

import tensorflow as tf
import pandas as pd
import numpy as np
import logging
import time
import datetime
import tempfile
# NOTE: most of the imports below (log2, sqlalchemy, sklearn, joblib,
# monitors) are leftovers from the full pipeline; they're unused in this repro.
from math import log2
from sqlalchemy import create_engine
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.externals import joblib
from tensorflow.contrib.learn.python.learn import monitors as monitor_lib


logging.basicConfig(format='%(asctime)s - %(levelname)s: %(message)s', level=logging.INFO, filename='Classifier.log', filemode='a')
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)
logging.debug('Main method started')
tf.logging.set_verbosity(tf.logging.INFO)
start_time = time.perf_counter()
logging.info(datetime.datetime.today())

#  sess = tf.Session()

model_dir = tempfile.mkdtemp()

LABEL_COLUMN = "t"

CATEGORICAL_COLUMNS = ["a", "b", "c", "d", "e"]

CONTINUOUS_COLUMNS = ["f", "g", "h", "i"]

ALL_COLUMNS = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "u", "v", "x", "y", "z"]

full_set = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "u", "v", "x", "y", "z", "t"]


model_type = ""

def input_fn(num_epochs=None, shuffle=False):
    # randint's upper bound is exclusive, so (0, 1) would yield all zeros;
    # use (0, 2) to actually get random 0/1 fake values.
    df = pd.DataFrame(np.random.randint(0, 2, size=(400000, 25)), columns=full_set)
    df.fillna(0, inplace=True)
    # round() and reindex() return new objects, so assign the results back
    # (previously their results were discarded, making these lines no-ops).
    df[CATEGORICAL_COLUMNS] = df[CATEGORICAL_COLUMNS].round(0)
    df.reset_index(inplace=True)
    df = df.reindex(range(len(df)))
    df.loc[:, CATEGORICAL_COLUMNS] = df.loc[:, CATEGORICAL_COLUMNS].astype(int)
    df.loc[:, CONTINUOUS_COLUMNS] = df.loc[:, CONTINUOUS_COLUMNS].astype(float)
    with tf.device('/CPU:0'):
        return tf.estimator.inputs.pandas_input_fn(
            x=df[ALL_COLUMNS],
            y=df[LABEL_COLUMN],
            batch_size=100000,
            num_epochs=num_epochs,
            shuffle=shuffle,
            num_threads=4,
            queue_capacity=400000,
            target_column=LABEL_COLUMN)

def evaluation_input_fn(num_epochs=None, shuffle=False):
    # Same fixes as input_fn: exclusive upper bound on randint, and assigning
    # the results of round()/reindex() back instead of discarding them.
    df = pd.DataFrame(np.random.randint(0, 2, size=(200000, 25)), columns=full_set)
    df.fillna(0, inplace=True)
    df[CATEGORICAL_COLUMNS] = df[CATEGORICAL_COLUMNS].round(0)
    df.reset_index(inplace=True)
    df = df.reindex(range(len(df)))
    df.loc[:, CATEGORICAL_COLUMNS] = df.loc[:, CATEGORICAL_COLUMNS].astype(int)
    df.loc[:, CONTINUOUS_COLUMNS] = df.loc[:, CONTINUOUS_COLUMNS].astype(float)

    return tf.estimator.inputs.pandas_input_fn(
            x=df.loc[:, ALL_COLUMNS],
            y=df.loc[:, LABEL_COLUMN],
            batch_size=200000,
            num_epochs=num_epochs,
            shuffle=shuffle,
            num_threads=1,
            target_column=LABEL_COLUMN)

def classifier(model_dir):
    with tf.device('/CPU:0'):
        logging.info('Parsing continuous columns into tensors')
        with tf.variable_scope("Continuous_Features") as scope:
            continuous_columns = [tf.feature_column.numeric_column(k) for k in CONTINUOUS_COLUMNS]

        logging.info('Parsing categorical columns into tensors')
        with tf.variable_scope("Categorical_Features") as scope:
            categorical_columns = [tf.feature_column.categorical_column_with_hash_bucket(k, hash_bucket_size=1000, dtype=tf.int32) for k in CATEGORICAL_COLUMNS]

        embedded_columns = []
        logging.info('Creating embedded columns')
        with tf.variable_scope("Embedded_Columns") as scope:
            for i in range(len(categorical_columns)):
                embedded_columns.append(tf.feature_column.embedding_column(categorical_columns[i], dimension=8))

        logging.info('Bucketizing age')
        categorical_columns.append(tf.feature_column.bucketized_column(continuous_columns[0], boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75]))

        logging.info('Creating crossed columns')
        crossed_columns = [tf.feature_column.crossed_column(["a", "b"],
                                                             hash_bucket_size=int(1e4)),
                           tf.feature_column.crossed_column(["a", "c"],
                                                             hash_bucket_size=int(1e4)),
                           tf.feature_column.crossed_column(["a", "g"],
                                                             hash_bucket_size=int(1e4)),
                           tf.feature_column.crossed_column(["b", "n"],
                                                             hash_bucket_size=int(1e4)),
                           tf.feature_column.crossed_column(["m", "v"],
                                                             hash_bucket_size=int(1e4)),
                           tf.feature_column.crossed_column(["a", "m", "v"],
                                                             hash_bucket_size=int(1e6))]

        wide_columns = categorical_columns + crossed_columns

        deep_columns = continuous_columns + embedded_columns

    logging.info('Creating Classifier')
    with tf.variable_scope('Wide_and_Deep') as scope:
        clf = tf.estimator.DNNLinearCombinedClassifier(
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=[400, 200, 300],
            n_classes=2,
            config=tf.contrib.learn.RunConfig(log_device_placement=True,
                                              save_summary_steps=100,
                                              save_checkpoints_steps=100,
                                              keep_checkpoint_max=5,
                                              model_dir=model_dir,
                                              num_cores=0,
                                              gpu_memory_fraction=1,
                                              tf_random_seed=3))

    return clf


def build_model(model_dir, train_steps):
    model_dir = tempfile.mkdtemp() if not model_dir else model_dir
    logging.info('Declaring and training classifier')
    clf = classifier(model_dir=model_dir)    
    clf.train(input_fn=input_fn(), max_steps=train_steps) #  , max_steps=2000, monitors=[validation_monitor], hooks=hook
    logging.info('Starting model evaluation')
    results = clf.evaluate(input_fn=evaluation_input_fn(), steps=1)
    logging.debug(results)
    for i in results:
        print(i, results[i])

    return clf


def main():
    # NOTE: this ConfigProto is built but never passed to the estimator or a
    # session, so these options have no effect as written.
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=True)
    config.gpu_options.allow_growth = True
    build_model(model_dir='C://TFLogs//DWDevGPU', train_steps=20)

if __name__ == '__main__':
    main()

What I'd really like help with is how to batch tabular (SQL) data to the CPU/GPU quickly enough to keep up with the computation. It doesn't have to be pandas or numpy; I'll take anything reasonable. I haven't tried serializing to TFRecord yet, because it seemed silly to think that reading from disk would be faster than reading from memory, but I'll try anything. Thanks in advance for your help.
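
In case someone does suggest the TFRecord route, here's roughly how I understand the serialization step would look (a minimal sketch assuming all features can be written as scalar int64/float features; I haven't actually run this against my data):

import tensorflow as tf

def dataframe_to_tfrecord(df, path, int_cols, float_cols):
    # Write each DataFrame row as a tf.train.Example so the file can later
    # be streamed and batched without holding the whole table in memory.
    with tf.python_io.TFRecordWriter(path) as writer:
        for _, row in df.iterrows():
            feature = {k: tf.train.Feature(int64_list=tf.train.Int64List(value=[int(row[k])]))
                       for k in int_cols}
            for k in float_cols:
                feature[k] = tf.train.Feature(float_list=tf.train.FloatList(value=[float(row[k])]))
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())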

0 Answers:

No answers yet.