Feeding 1MM+ rows into the wide and deep learning model throws ValueError: GraphDef cannot be larger than 2GB:
Traceback (most recent call last):
  File "search_click.py", line 207, in <module>
    tf.app.run()
  File "/usr/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 30, in run
    sys.exit(main(sys.argv))
  File "search_click.py", line 204, in main
    train_and_eval()
  File "search_click.py", line 181, in train_and_eval
    m.fit(input_fn=lambda: input_fn(df_train), steps=FLAGS.train_steps)
  File "/usr/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 182, in fit
    monitors=monitors)
  File "/usr/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 458, in _train_model
    summary_writer=graph_actions.get_summary_writer(self._model_dir))
  File "/usr/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/graph_actions.py", line 76, in get_summary_writer
    graph=ops.get_default_graph())
  File "/usr/lib/python2.7/site-packages/tensorflow/python/training/summary_io.py", line 113, in __init__
    self.add_graph(graph=graph, graph_def=graph_def)
  File "/usr/lib/python2.7/site-packages/tensorflow/python/training/summary_io.py", line 204, in add_graph
    true_graph_def = graph.as_graph_def(add_shapes=True)
  File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2117, in as_graph_def
    raise ValueError("GraphDef cannot be larger than 2GB.")
ValueError: GraphDef cannot be larger than 2GB.
I defined the same input_fn as in the example:
def input_fn(df):
    """Input builder function."""
    # Creates a dictionary mapping from each continuous feature column name (k) to
    # the values of that column stored in a constant Tensor.
    continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}
    # Creates a dictionary mapping from each categorical feature column name (k)
    # to the values of that column stored in a tf.SparseTensor.
    categorical_cols = {k: tf.SparseTensor(
        indices=[[i, 0] for i in range(df[k].size)],
        values=df[k].values,
        shape=[df[k].size, 1])
        for k in CATEGORICAL_COLUMNS}
    # Merges the two dictionaries into one.
    feature_cols = dict(continuous_cols)
    feature_cols.update(categorical_cols)
    # Converts the label column into a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)
    # Returns the feature columns and the label.
    return feature_cols, label
Is there an alternative to tf.constant and tf.SparseTensor that feeds the data in batches and avoids this memory error?
Answer 0 (score: 1)
The wide and deep example expects the whole dataset to be loaded into memory. If you have a large dataset, CSV-formatted input will likely require tf.decode_csv; if your input is in a custom format, you should create a custom data reader.
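For illustration only, here is a minimal sketch (not part of the original answer) of what a queue-based CSV input_fn could look like with that era's API; the file layout, column names, and record defaults below are assumptions, and the asker's actual read_batch_examples-based solution appears in answer 2:

import tensorflow as tf

def csv_input_fn(filename, batch_size=128):
    # Queue of input files; the reader pulls one CSV line at a time.
    filename_queue = tf.train.string_input_producer([filename])
    reader = tf.TextLineReader()
    _, row = reader.read(filename_queue)
    # One record_default per column; its dtype determines the parsed dtype.
    # Assumed columns: age, fare (floats), sex (string), label (int).
    age, fare, sex, label = tf.decode_csv(
        row, record_defaults=[[0.0], [0.0], [''], [0]])
    # Batch the parsed columns instead of materializing constants in the graph.
    batched = tf.train.batch(
        {'age': age, 'fare': fare, 'sex': sex, 'label': label},
        batch_size=batch_size)
    label_batch = batched.pop('label')
    return batched, label_batch

Because only the reader ops end up in the GraphDef (not the data itself), the graph stays small no matter how many rows the file contains.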
Answer 1 (score: 1)
One solution, created by @ilblackdragon, is to use dataframe.queues. Unfortunately, categorical variables must be integer-encoded first, otherwise you get the error:

ValueError: Data types for extracting pandas data must be int, float, or bool. Found: 'sex' type='object', 'embarked' type='object'

Using DNNClassifier also does not work; it raises a KeyError on the encoded category columns (e.g. KeyError: 'Embarked_ids'). So in that post the author built his own classifier model.
Example code:
# -*- coding: utf-8 -*-
# flake8: noqa ignore=E501
import tempfile

import pandas as pd
import tensorflow as tf
import tensorflow.contrib.learn as tf_learn
import tensorflow.contrib.layers as tf_layers
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Define the column names for the data sets.
LABEL_COLUMN = 'Survived'
CONTINUOUS_COLUMNS = ['Age', 'SibSp', 'Parch', 'Fare']
CATEGORICAL_COLUMNS = ['Pclass', 'Sex', 'Embarked']
CATEGORICAL_ID_COLUMNS = [col + '_ids' for col in CATEGORICAL_COLUMNS]
FEATURE_COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_ID_COLUMNS
HIDDEN_UNITS = [10, 20, 10]
CATEGORICAL_EMBED_SIZE = 10
LABEL_ENCODERS = {}


def get_feature_cols():
    # used in DNNClassifier which doesn't work
    continuous_features = [tf_layers.real_valued_column(col) for col in
                           CONTINUOUS_COLUMNS]
    categorical_features = [
        tf_layers.embedding_column(
            tf_layers.sparse_column_with_integerized_feature(col + '_ids', len(
                LABEL_ENCODERS[col].classes_)),
            CATEGORICAL_EMBED_SIZE)
        for col in CATEGORICAL_COLUMNS
    ]
    return continuous_features + categorical_features


def pandas_input_fn(X, y=None, batch_size=128, num_epochs=None):
    def input_fn():
        if y is not None:
            X['target'] = y
        queue = tf_learn.dataframe.queues.feeding_functions.enqueue_data(
            X, 1000, shuffle=num_epochs is None, num_epochs=num_epochs)
        if num_epochs is None:
            features = queue.dequeue_many(batch_size)
        else:
            features = queue.dequeue_up_to(batch_size)
        features = dict(zip(['index'] + list(X.columns), features))
        if y is not None:
            target = features.pop('target')
            return features, target
        return features
    return input_fn


def encode_categorical(df):
    global LABEL_ENCODERS
    for col in CATEGORICAL_COLUMNS:
        if df[col].dtype == 'object':
            df[col] = df[col].astype(str)
        encoder = LabelEncoder().fit(df[col])
        df[col + '_ids'] = encoder.transform(df[col])
        df.pop(col)
        LABEL_ENCODERS[col] = encoder
    return df, LABEL_ENCODERS


def dnn_tanh(features, target, hidden_units=HIDDEN_UNITS):
    global LABEL_ENCODERS
    target = tf.one_hot(target, 2, 1.0, 0.0)
    # Organize continuous features.
    final_features = [tf.expand_dims(tf.cast(features[col], tf.float32), 1) for
                      col in CONTINUOUS_COLUMNS]
    # Embed categorical variables into distributed representation.
    for col in CATEGORICAL_COLUMNS:
        feature = tf_learn.ops.categorical_variable(
            features[col + '_ids'],
            len(LABEL_ENCODERS[col].classes_),
            embedding_size=CATEGORICAL_EMBED_SIZE,
            name=col)
        final_features.append(feature)
    # Concatenate all features into one vector.
    features = tf.concat(1, final_features)
    # Deep Neural Network
    logits = tf_layers.stack(features,
                             tf_layers.fully_connected,
                             stack_args=hidden_units,
                             activation_fn=tf.tanh)
    prediction, loss = tf_learn.models.logistic_regression(logits, target)
    train_op = tf_layers.optimize_loss(loss,
                                       tf.contrib.framework.get_global_step(),
                                       optimizer='SGD',
                                       learning_rate=0.05)
    return tf.argmax(prediction, dimension=1), loss, train_op


def process_input_df(df):
    df, label_encoders = encode_categorical(df)
    y = df.pop(LABEL_COLUMN)
    X = df[CATEGORICAL_ID_COLUMNS + CONTINUOUS_COLUMNS].fillna(0)
    return X, y


def train(X, y, steps=100):
    model_dir = tempfile.mkdtemp()
    print("model dir: ", model_dir)
    classifier = tf_learn.Estimator(model_fn=dnn_tanh, model_dir=model_dir)
    classifier.fit(input_fn=pandas_input_fn(X, y), steps=steps)
    '''
    # Using DNNClassifier gives KeyError (e.g on EmbedIds)
    classifier = learn.DNNClassifier(hidden_units=[10, 20, 10],
                                     n_classes=2,
                                     feature_columns=get_feature_cols(),
                                     optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.05))
    classifier.fit(X, y, batch_size=128, steps=500)
    '''
    return classifier


def predict(classifier, X):
    return list(classifier.predict(input_fn=pandas_input_fn(X, num_epochs=1),
                                   as_iterable=True))


def evaluate(classifier, X, y, steps=1):
    results = classifier.evaluate(input_fn=pandas_input_fn(X[FEATURE_COLUMNS], y),
                                  steps=steps)
    for key in sorted(results):
        print("%s: %s" % (key, results[key]))


if __name__ == '__main__':
    # DOWNLOAD TITANIC TRAIN DATA
    data = pd.read_csv('~/titanic_train.csv')  # LOAD DATA
    X, y = process_input_df(data)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    classifier = train(X_train, y_train, steps=100)
    print("accuracy_score", accuracy_score(y_test, predict(classifier, X_test)))
    evaluate(classifier, X_test, y_test, steps=1)
Answer 2 (score: 0)
I finally solved the problem with the following code, inspired by the other answers. Hopefully it helps someone else. Thanks everyone for the comments above.
def input_fn(batch_size, filename):
    # Read the CSV file in batches; every column is parsed as a string first.
    examples_op = tf.contrib.learn.read_batch_examples(
        filename,
        batch_size=batch_size,
        reader=tf.TextLineReader,
        num_epochs=1,
        parse_fn=lambda x: tf.decode_csv(x, [tf.constant([''], dtype=tf.string)] * len(HEADERS)))
    examples_dict = {}
    for i, header in enumerate(HEADERS):
        examples_dict[header] = examples_op[:, i]
    # Continuous columns are converted to floats, categorical columns to SparseTensors.
    feature_cols = {k: tf.string_to_number(examples_dict[k], out_type=tf.float32)
                    for k in CONTINUOUS_COLUMNS}
    feature_cols.update({k: dense_to_sparse(examples_dict[k])
                         for k in CATEGORICAL_COLUMNS})
    label = tf.string_to_number(examples_dict[LABEL_COLUMN], out_type=tf.int32)
    return feature_cols, label


def input_fn_pre(batch_size, filename):
    # Same as input_fn, but without the label (for prediction).
    examples_op = tf.contrib.learn.read_batch_examples(
        filename,
        batch_size=batch_size,
        reader=tf.TextLineReader,
        num_epochs=1,
        parse_fn=lambda x: tf.decode_csv(x, [tf.constant([''], dtype=tf.string)] * len(HEADERS)))
    examples_dict = {}
    for i, header in enumerate(HEADERS):
        examples_dict[header] = examples_op[:, i]
    feature_cols = {k: tf.string_to_number(examples_dict[k], out_type=tf.float32)
                    for k in CONTINUOUS_COLUMNS}
    feature_cols.update({k: dense_to_sparse(examples_dict[k])
                         for k in CATEGORICAL_COLUMNS})
    return feature_cols


def dense_to_sparse(dense_tensor):
    # Convert a dense 1-D tensor into the [N, 1] SparseTensor layout expected
    # by the sparse feature columns.
    indices = tf.to_int64(tf.transpose([tf.range(tf.shape(dense_tensor)[0]),
                                        tf.zeros_like(dense_tensor, dtype=tf.int32)]))
    values = dense_tensor
    shape = tf.to_int64([tf.shape(dense_tensor)[0], tf.constant(1)])
    return tf.SparseTensor(
        indices=indices,
        values=values,
        shape=shape)


def train_and_eval():
    """Train and evaluate the model."""
    data = pd.read_csv('spark_traindata_forrun_no_nanindex.csv', skipinitialspace=True,
                       engine="python")
    value_range = {}
    for column in CATEGORICAL_COLUMNS:
        data[column] = data[column].astype(str)
        value_range[column] = list(set(data[column]))
    model_dir = './model6'
    print("model directory = %s" % model_dir)
    test = pd.read_csv('test.csv', names=HEADERS)
    m = build_estimator(model_dir, value_range)
    m.fit(input_fn=lambda: input_fn(128, 'train.csv'), steps=FLAGS.train_steps)
    results = m.evaluate(input_fn=lambda: input_fn(5000, 'test.csv'), steps=1)
    for key in sorted(results):
        print("%s: %s" % (key, results[key]))