我有一个大数据集,该数据集有两个字段:第一个字段是数据,第二个字段是标签。数据集大小约为6GB,当我运行以下代码时:
# data_from_dataset / label_from_dataset are loaded from a multi-GB dataset.
# Both are numpy ndarrays of shape (num_examples, 15) — TODO confirm dtype is
# int-compatible, since they are cast to tf.int32 below.
data_from_dataset, label_from_dataset = load_train_data()
# Number of full batches per epoch (integer division drops the remainder).
num_batch = len(data_from_dataset) // hp.batch_size
# Convert the whole arrays to constant tensors.
# NOTE(review): this embeds the entire dataset in the graph definition, and
# TensorFlow rejects any single graph constant larger than 2GB — which is the
# "cannot create a tensor larger than 2GB" error reported below.
X = tf.convert_to_tensor(data_from_dataset, tf.int32)
Y = tf.convert_to_tensor(label_from_dataset, tf.int32)
# Queue that yields one (data, label) slice at a time.
input_queues = tf.train.slice_input_producer([X, Y])
# Shuffled batching on top of the slice queue.
x, y = tf.train.shuffle_batch(input_queues,
num_threads=20,
batch_size=hp.batch_size,
capacity=hp.batch_size*64,
min_after_dequeue=hp.batch_size*32,
allow_smaller_final_batch=False)
等待很长时间后它运行非常缓慢,控制台提示错误如下:
Error:cannot create a tensor larger than 2GB
在这些代码行中似乎有问题:
# Convert to tensor
X = tf.convert_to_tensor(data_from_dataset, tf.int32)
Y = tf.convert_to_tensor(label_from_dataset, tf.int32)
我将数据转换为 TFRecord 的代码如下:
def _int64_feature(value):
    """Wrap a scalar int or an iterable of ints as an int64 tf.train.Feature.

    BUG FIX: the original always wrapped ``value`` in a one-element list
    (``value=[value]``); when ``value`` is itself an array row, protobuf tries
    to coerce that array to a single scalar and raises "only length-1 arrays
    can be converted to python scalars". Int64List needs a flat list of ints.
    """
    try:
        values = [int(v) for v in value]  # list / tuple / 1-D ndarray row
    except TypeError:
        values = [int(value)]  # plain scalar
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))
def save_tfrecords(data_from_dataset, label_from_dataset, desfile):
    """Serialize paired (data, label) rows into a TFRecord file.

    Args:
        data_from_dataset: 2-D array-like; row i holds example i's features.
        label_from_dataset: 2-D array-like; row i holds example i's labels.
        desfile: path of the TFRecord file to write.
    """
    with tf.python_io.TFRecordWriter(desfile) as writer:
        for i in range(len(data_from_dataset)):
            # BUG FIX: the original indexed undefined names `data[i]` and
            # `label[i]` instead of the function parameters (NameError).
            features = tf.train.Features(
                feature={
                    "data": _int64_feature(data_from_dataset[i]),
                    "label": _int64_feature(label_from_dataset[i]),
                }
            )
            example = tf.train.Example(features=features)
            writer.write(example.SerializeToString())
def read_and_decode(filename_queue):
    """Read one serialized Example from the queue and decode its fields.

    Args:
        filename_queue: a queue of TFRecord filenames (e.g. from
            tf.train.string_input_producer).

    Returns:
        A (data, label) pair of 1-D int32 tensors.
    """
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_example,
        features={
            'data': tf.FixedLenFeature([], tf.string),
            'label': tf.FixedLenFeature([], tf.string),
        })
    # BUG FIX: the original decoded the label into `sent_decode` (overwriting
    # the decoded data) and then returned the *raw string* tensors, so callers
    # never received the decoded int32 values.
    sent_decode = tf.decode_raw(features['data'], tf.int32)
    tag_decode = tf.decode_raw(features['label'], tf.int32)
    return sent_decode, tag_decode
# Write the dataset to disk once, then stream it back through a filename queue
# so the raw arrays never have to live inside the TF graph.
fname_out = "out.tfrecord"
save_tfrecords(data_from_dataset, label_from_dataset, fname_out)
# BUG FIX: string_input_producer expects a *list* of filenames; passing a bare
# string makes it enqueue the string's individual characters as "filenames".
filename_queue = tf.train.string_input_producer([fname_out], shuffle=True)
# BUG FIX: read_and_decode takes exactly one argument (the original call
# passed a spurious second argument `2`, a TypeError).
example, label = read_and_decode(filename_queue)
x, y = tf.train.shuffle_batch([example, label],
                              num_threads=20,
                              batch_size=hp.batch_size,
                              capacity=hp.batch_size*64,
                              min_after_dequeue=hp.batch_size*32,
                              allow_smaller_final_batch=False)
它在代码行上提示错误,如下所示:
def _int64_feature(value):
return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
Error:only length-1 arrays can be converted to python scalars
如何将numpy转换为tfrecord?还有其他方法吗?
答案 0(得分:0):
函数tf.train.Int64List
不适用于数组。
您需要改用tf.train.BytesList
def _bytes_feature(value):
    """Wrap raw bytes as a tf.train.Feature (BytesList stores arbitrary blobs)."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

data = np.random.rand(15,)
# BUG FIX: `_bytes_feature` was used here but never defined in the snippet;
# also avoid shadowing the builtin `str`.
raw = data.tostring()  # serialize the float64 array to its raw bytes
writer = tf.python_io.TFRecordWriter('file.tfrecords')
example = tf.train.Example(features=tf.train.Features(feature={'1': _bytes_feature(raw)}))
writer.write(example.SerializeToString())
writer.close()
然后您可以使用tf.decode_raw
对其进行解码,其中您可以使用以下命令检查tfrecord文件
# Inspect the TFRecord file eagerly (no session required).
for str_rec in tf.python_io.tf_record_iterator('file.tfrecords'):
    example = tf.train.Example()
    example.ParseFromString(str_rec)
    raw = example.features.feature['1'].bytes_list.value[0]
    # BUG FIXES: the original shadowed the builtin `str` and passed an
    # undefined name `dtype`. np.random.rand produces float64, and
    # np.frombuffer replaces the deprecated np.fromstring.
    your_data = np.frombuffer(raw, dtype=np.float64)