Wide and deep learning big-data error: GraphDef cannot be larger than 2GB

Date: 2017-01-03 08:19:59

Tags: python tensorflow

Feeding 1MM+ rows into the wide and deep learning model throws ValueError: GraphDef cannot be larger than 2GB

Traceback (most recent call last):
  File "search_click.py", line 207, in <module>
    tf.app.run()
  File "/usr/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 30, in run
    sys.exit(main(sys.argv))
  File "search_click.py", line 204, in main
    train_and_eval()
  File "search_click.py", line 181, in train_and_eval
    m.fit(input_fn=lambda: input_fn(df_train), steps=FLAGS.train_steps)
  File "/usr/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 182, in fit
    monitors=monitors)
  File "/usr/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 458, in _train_model
    summary_writer=graph_actions.get_summary_writer(self._model_dir))
  File "/usr/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/graph_actions.py", line 76, in get_summary_writer
    graph=ops.get_default_graph())
  File "/usr/lib/python2.7/site-packages/tensorflow/python/training/summary_io.py", line 113, in __init__
    self.add_graph(graph=graph, graph_def=graph_def)
  File "/usr/lib/python2.7/site-packages/tensorflow/python/training/summary_io.py", line 204, in add_graph
    true_graph_def = graph.as_graph_def(add_shapes=True)
  File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2117, in as_graph_def
    raise ValueError("GraphDef cannot be larger than 2GB.")
ValueError: GraphDef cannot be larger than 2GB.

I defined input_fn the same way as in the example:

def input_fn(df):
  """Input builder function."""
  # Creates a dictionary mapping from each continuous feature column name (k) to
  # the values of that column stored in a constant Tensor.
  continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}
  # Creates a dictionary mapping from each categorical feature column name (k)
  # to the values of that column stored in a tf.SparseTensor.
  categorical_cols = {k: tf.SparseTensor(
      indices=[[i, 0] for i in range(df[k].size)],
      values=df[k].values,
      shape=[df[k].size, 1])
                      for k in CATEGORICAL_COLUMNS}
  # Merges the two dictionaries into one.
  feature_cols = dict(continuous_cols)
  feature_cols.update(categorical_cols)
  # Converts the label column into a constant Tensor.
  label = tf.constant(df[LABEL_COLUMN].values)
  # Returns the feature columns and the label.
  return feature_cols, label

Is there an alternative to tf.constant / tf.SparseTensor that allows feeding the data in batches and avoids this memory error?

3 Answers:

Answer 0 (score: 1)

The wide and deep example loads the entire dataset into memory. If you have a large dataset in CSV format, you may want tf.decode_csv for the input. If your input format is custom, you should create a custom data reader.
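For reference, a minimal sketch of such a queue-based pipeline with tf.decode_csv (not from the original answer; the file layout and the 'age'/'income'/'label' columns are made up for illustration). Because rows are read from disk in batches, the data is never embedded in the GraphDef as constants:

import tensorflow as tf

def csv_input_fn(filename, batch_size=128):
    # Queue of input files; the reader pulls one CSV line at a time.
    filename_queue = tf.train.string_input_producer([filename])
    reader = tf.TextLineReader()
    _, line = reader.read(filename_queue)
    # record_defaults set each column's dtype and the value used for empty fields.
    age, income, label = tf.decode_csv(line, record_defaults=[[0.0], [0.0], [0]])
    # Only the batching ops end up in the graph, not the data itself.
    age_b, income_b, label_b = tf.train.shuffle_batch(
        [age, income, label], batch_size=batch_size,
        capacity=10 * batch_size, min_after_dequeue=2 * batch_size)
    return {'age': age_b, 'income': income_b}, label_b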

Answer 1 (score: 1)

One solution, created by @ilblackdragon, is to use dataframe.queues. Unfortunately, the categorical variables have to be integer-encoded first, otherwise you get the error:

ValueError: Data types for extracting pandas data must be int, float, or bool. Found: 'sex' type='object', 'embarked' type='object'

Using DNNClassifier likewise does not work and raises a KeyError on the encoded categories (e.g. KeyError: 'Embarked_ids'), so in that post the author built their own classifier model.

Example code:

# -*- coding: utf-8 -*-
# flake8: noqa ignore=E501
import tempfile

import pandas as pd
import tensorflow as tf
import tensorflow.contrib.learn as tf_learn
import tensorflow.contrib.layers as tf_layers
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Define the column names for the data sets.
LABEL_COLUMN = 'Survived'
CONTINUOUS_COLUMNS = ['Age', 'SibSp', 'Parch', 'Fare']
CATEGORICAL_COLUMNS = ['Pclass', 'Sex', 'Embarked']
CATEGORICAL_ID_COLUMNS = [col + '_ids' for col in CATEGORICAL_COLUMNS]
FEATURE_COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_ID_COLUMNS

HIDDEN_UNITS = [10, 20, 10]
CATEGORICAL_EMBED_SIZE = 10

LABEL_ENCODERS = {}

def get_feature_cols():
    # Feature columns intended for DNNClassifier, which did not work (see train()).
    continuous_features = [tf_layers.real_valued_column(col) for col in
                           CONTINUOUS_COLUMNS]
    categorical_features = [
        tf_layers.embedding_column(
            tf_layers.sparse_column_with_integerized_feature(col + '_ids', len(
                LABEL_ENCODERS[col].classes_)),
            CATEGORICAL_EMBED_SIZE)
        for col in CATEGORICAL_COLUMNS
        ]
    return continuous_features + categorical_features


def pandas_input_fn(X, y=None, batch_size=128, num_epochs=None):
    def input_fn():
        if y is not None:
            X['target'] = y
        queue = tf_learn.dataframe.queues.feeding_functions.enqueue_data(
            X, 1000, shuffle=num_epochs is None, num_epochs=num_epochs)
        if num_epochs is None:
            features = queue.dequeue_many(batch_size)
        else:
            features = queue.dequeue_up_to(batch_size)

        features = dict(zip(['index'] + list(X.columns), features))

        if y is not None:
            target = features.pop('target')
            return features, target
        return features

    return input_fn


def encode_categorical(df):
    global LABEL_ENCODERS
    for col in CATEGORICAL_COLUMNS:
        if df[col].dtype == 'object':
            df[col] = df[col].astype(str)
        encoder = LabelEncoder().fit(df[col])
        df[col + '_ids'] = encoder.transform(df[col])
        df.pop(col)
        LABEL_ENCODERS[col] = encoder
    return df, LABEL_ENCODERS


def dnn_tanh(features, target, hidden_units=HIDDEN_UNITS):
    global LABEL_ENCODERS
    target = tf.one_hot(target, 2, 1.0, 0.0)

    # Organize continuous features.
    final_features = [tf.expand_dims(tf.cast(features[col], tf.float32), 1) for
                      col in CONTINUOUS_COLUMNS]

    # Embed categorical variables into distributed representation.
    for col in CATEGORICAL_COLUMNS:
        feature = tf_learn.ops.categorical_variable(
            features[col + '_ids'],
            len(LABEL_ENCODERS[col].classes_),
            embedding_size=CATEGORICAL_EMBED_SIZE,
            name=col)
        final_features.append(feature)

    # Concatenate all features into one vector.
    features = tf.concat(1, final_features)

    # Deep Neural Network
    logits = tf_layers.stack(features,
                             tf_layers.fully_connected,
                             stack_args=hidden_units,
                             activation_fn=tf.tanh)
    prediction, loss = tf_learn.models.logistic_regression(logits, target)
    train_op = tf_layers.optimize_loss(loss,
                                       tf.contrib.framework.get_global_step(),
                                       optimizer='SGD',
                                       learning_rate=0.05)
    return tf.argmax(prediction, dimension=1), loss, train_op


def process_input_df(df):
    df, label_encoders = encode_categorical(df)
    y = df.pop(LABEL_COLUMN)
    X = df[CATEGORICAL_ID_COLUMNS + CONTINUOUS_COLUMNS].fillna(0)
    return X, y


def train(X, y, steps=100):
    model_dir = tempfile.mkdtemp()
    print("model dir: ", model_dir)
    classifier = tf_learn.Estimator(model_fn=dnn_tanh, model_dir=model_dir)
    classifier.fit(input_fn=pandas_input_fn(X, y), steps=steps)

    '''
    # Using DNNClassifier gives a KeyError (e.g. on 'Embarked_ids')
    classifier = tf_learn.DNNClassifier(hidden_units=[10, 20, 10],
        n_classes=2,
        feature_columns=get_feature_cols(),
        optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.05))
    classifier.fit(X, y, batch_size=128, steps=500)
    '''

    return classifier


def predict(classifier, X):
    return list(classifier.predict(input_fn=pandas_input_fn(X, num_epochs=1),
                                   as_iterable=True))


def evaluate(classifier, X, y, steps=1):
    results = classifier.evaluate(input_fn=pandas_input_fn(X[FEATURE_COLUMNS], y),
                                  steps=steps)
    for key in sorted(results):
        print("%s: %s" % (key, results[key]))


if __name__ == '__main__':
    # DOWNLOAD TITANIC TRAIN DATA
    data = pd.read_csv('~/titanic_train.csv')  # LOAD DATA
    X, y = process_input_df(data)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    classifier = train(X_train, y_train, steps=100)
    print("accuracy_score", accuracy_score(y_test, predict(classifier, X_test)))
    evaluate(classifier, X_test, y_test, steps=1)

Answer 2 (score: 0)

I ended up solving this with the following code, inspired by the other answers. Hope it helps someone else; thanks everyone for the comments above.

def input_fn(batch_size, filename):
    # Stream batches of CSV rows from disk so the data never ends up embedded
    # in the graph as constants (which is what exceeded the 2GB GraphDef limit).
    examples_op = tf.contrib.learn.read_batch_examples(
        filename,
        batch_size=batch_size,
        reader=tf.TextLineReader,
        num_epochs=1,
        parse_fn=lambda x: tf.decode_csv(x, [tf.constant([''], dtype=tf.string)] * len(HEADERS)))

    examples_dict = {}
    for i, header in enumerate(HEADERS):
        examples_dict[header] = examples_op[:, i]

    feature_cols = {k: tf.string_to_number(examples_dict[k], out_type=tf.float32)
                    for k in CONTINUOUS_COLUMNS}

    feature_cols.update({k: dense_to_sparse(examples_dict[k])
                         for k in CATEGORICAL_COLUMNS})

    label = tf.string_to_number(examples_dict[LABEL_COLUMN], out_type=tf.int32)

    return feature_cols, label

def input_fn_pre(batch_size, filename):
    # Same as input_fn, but without the label column (used for prediction).
    examples_op = tf.contrib.learn.read_batch_examples(
        filename,
        batch_size=batch_size,
        reader=tf.TextLineReader,
        num_epochs=1,
        parse_fn=lambda x: tf.decode_csv(x, [tf.constant([''], dtype=tf.string)] * len(HEADERS)))

    examples_dict = {}
    for i, header in enumerate(HEADERS):
        examples_dict[header] = examples_op[:, i]

    feature_cols = {k: tf.string_to_number(examples_dict[k], out_type=tf.float32)
                    for k in CONTINUOUS_COLUMNS}

    feature_cols.update({k: dense_to_sparse(examples_dict[k])
                         for k in CATEGORICAL_COLUMNS})
    return feature_cols

def dense_to_sparse(dense_tensor):
    # Convert a dense 1-D tensor of shape [N] into a SparseTensor of shape
    # [N, 1], the layout the sparse (categorical) feature columns expect.
    indices = tf.to_int64(tf.transpose([tf.range(tf.shape(dense_tensor)[0]), tf.zeros_like(dense_tensor, dtype=tf.int32)]))
    values = dense_tensor
    shape = tf.to_int64([tf.shape(dense_tensor)[0], tf.constant(1)])

    return tf.SparseTensor(
        indices=indices,
        values=values,
        shape=shape
    )

def train_and_eval():
    """Train and evaluate the model."""
    data = pd.read_csv('spark_traindata_forrun_no_nanindex.csv',
                       skipinitialspace=True, engine="python")
    value_range = {}
    for column in CATEGORICAL_COLUMNS:
        data[column] = data[column].astype(str)
        value_range[column] = list(set(data[column]))
    model_dir = './model6'
    print("model directory = %s" % model_dir)
    test = pd.read_csv('test.csv', names=HEADERS)
    m = build_estimator(model_dir, value_range)
    m.fit(input_fn=lambda: input_fn(128, 'train.csv'), steps=FLAGS.train_steps)
    results = m.evaluate(input_fn=lambda: input_fn(5000, 'test.csv'), steps=1)
    for key in sorted(results):
        print("%s: %s" % (key, results[key]))