我正在学习以下链接中的 Google Colab 练习:multi-class_classification_of_handwriting_digits(手写数字多分类)。
然后,我尝试用自己的方式重写代码,来输入数据并训练DNN。由于csv文件没有标题行,我无法创建特征列,因此无法训练模型。
能否请您帮我弄清楚链接中是如何做到的,或者我的代码需要怎样修改?预先感谢。
import pandas as pd
import seaborn as sns
import tensorflow as tf
# Load the CSV without a header row, so the columns get integer labels.
mnist_df = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/mnist_train_small.csv",header=None)
mnist_df.columns
# Column 0 is taken as the target; the remaining columns are the inputs.
hand_df = mnist_df[0]
hand_df.head()
matrix_df = mnist_df.drop([0],axis=1)
matrix_df.head()
# NOTE(review): hand_df and matrix_df were derived above, so truncating
# mnist_df here does not reduce the rows that are split and trained on.
mnist_df = mnist_df.head(10000)
from sklearn.model_selection import train_test_split
# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(matrix_df, hand_df, test_size=0.3, random_state=101)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# NOTE(review): the split above was taken from the unscaled matrix_df, so the
# rescaled frame built here is not what X_train / X_test contain.
matrix_df = pd.DataFrame(data=scaler.fit_transform(matrix_df),
columns=matrix_df.columns,
index=matrix_df.index)
# Input function that feeds the training split to the estimator in batches of 10.
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train,y=y_train,
batch_size=10,
num_epochs=1000,
shuffle=True)
# Adagrad optimizer wrapped with gradient-norm clipping at 5.0.
my_optimizer = tf.train.AdagradOptimizer(learning_rate=0.03)
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
# NOTE(review): feat_cols is never defined anywhere in this snippet; this is
# the missing piece the question is asking about.
model = tf.estimator.LinearClassifier(feature_columns=feat_cols,
n_classes=10,
optimizer=my_optimizer,
config=tf.estimator.RunConfig(keep_checkpoint_max=1))
model.train(input_fn=input_func,steps=1000)
答案 0 :(得分:0)
示例代码已经将数据集分为训练集和验证集。
我认为这与CSV中的标头没有任何关系。
# Rows 0-7499 are used for training and rows 7500-9999 for validation.
# parse_labels_and_features is not defined in this snippet (presumably it comes
# from the exercise notebook); each call is unpacked as (targets, examples).
training_targets, training_examples = parse_labels_and_features(mnist_dataframe[:7500])
validation_targets, validation_examples = parse_labels_and_features(mnist_dataframe[7500:10000])
因此,训练部分的代码单独列在下面。
import pandas as pd
import tensorflow as tf
from tensorflow.python.data import Dataset
import numpy as np
# Read the header-less MNIST CSV and keep only the first 10,000 rows.
mnist_df = pd.read_csv(
    "https://download.mlcc.google.com/mledu-datasets/mnist_train_small.csv",
    sep=",",
    header=None,
).head(10000)

# The first 7,500 rows form the training slice.
dataset = mnist_df.iloc[:7500]

# Column 0 is used as the label.
labels = dataset.loc[:, 0]
print(labels.shape)

# Columns 1 through 784 are the inputs; .loc ranges include both endpoints.
features = dataset.loc[:, 1:784]
print(features.shape)

# Dividing by the maximum value, 255, scales the data into [0, 1].
features = features.div(255)
def create_training_input_fn(feature, label, batch_size, num_epochs=None, shuffle=True):
    """Build a custom input_fn that feeds MNIST data to an estimator for training.

    Args:
      feature: pandas DataFrame of training features, one row per example.
      label: The training labels, indexable by the values of ``feature.index``.
      batch_size: Batch size to use during training.
      num_epochs: Value passed to ``Dataset.repeat``; ``None`` repeats the
        data indefinitely.
      shuffle: Whether to shuffle the batched dataset.

    Returns:
      A function that returns batches of training features and labels during
      training.
    """
    # Bind the factory arguments as the inner defaults. Previously the inner
    # function declared its own ``None`` / ``True`` defaults, which shadowed
    # (and therefore ignored) whatever the caller passed to the factory.
    def _input_fn(num_epochs=num_epochs, shuffle=shuffle):
        # Input pipelines are reset with each call to .train(). To ensure model
        # gets a good sampling of data, even when number of steps is small, we
        # shuffle all the data before creating the Dataset object
        idx = np.random.permutation(feature.index)
        raw_features = {"pixels": feature.reindex(idx)}
        raw_targets = np.array(label[idx])

        ds = Dataset.from_tensor_slices((raw_features, raw_targets))  # warning: 2GB limit
        ds = ds.batch(batch_size).repeat(num_epochs)
        if shuffle:
            ds = ds.shuffle(10000)

        # Return the next batch of data.
        feature_batch, label_batch = ds.make_one_shot_iterator().get_next()
        return feature_batch, label_batch

    return _input_fn
# Adagrad optimizer wrapped with gradient-norm clipping at 5.0.
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(
    tf.train.AdagradOptimizer(learning_rate=0.03),
    5.0,
)

# A single 784-wide numeric column named 'pixels'.
pixel_columns = {tf.feature_column.numeric_column('pixels', shape=784)}
run_config = tf.estimator.RunConfig(keep_checkpoint_max=1)

# Ten-class linear classifier that keeps only the latest checkpoint.
model = tf.estimator.LinearClassifier(
    feature_columns=pixel_columns,
    n_classes=10,
    optimizer=my_optimizer,
    config=run_config,
)

training_input_fn = create_training_input_fn(features, labels, batch_size=10)
model.train(input_fn=training_input_fn, steps=1000)
类似地,您可以编写一个函数来准备用于验证的验证集。您可以按原样使用此模式。
但是,如果您使用train_test_split
拆分数据帧,则可以尝试这样做。
# Split the whole frame (label column 0 still included) with 20% of the rows
# held out as X_test.
X_train, X_test = train_test_split(mnist_df, test_size=0.2)
您还必须对X_test
重复以下过程,以获取验证特征和标签。
# Column 0 of the training split is used as the label.
X_train_labels = X_train.loc[:, 0]
print(X_train_labels.shape)

# Columns 1 through 784 are the inputs; .loc ranges include both endpoints.
X_train_features = X_train.loc[:, 1:784]
print(X_train_features.shape)

# Dividing by the maximum value, 255, scales the data into [0, 1].
X_train_features = X_train_features.div(255)
答案 1 :(得分:0)
我没有去寻找直接使用无列名数据的方法,而是想到了一个办法 :) 我给所有列命了名,并把这些名字追加到cols=[]
中,这样就可以很方便地通过feature_columns = cols
来指定并使用特征列。
这是我自己的问题的完整工作代码。
谢谢。
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn import metrics
from tensorflow.python.data import Dataset
# Read the CSV without a header row, so the columns get integer labels.
mnist_df = pd.read_csv(
    "https://download.mlcc.google.com/mledu-datasets/mnist_train_small.csv",
    header=None,
)
mnist_df.describe()
mnist_df.columns

# Column 0 becomes the target; every other column stays in the input matrix.
hand_df = mnist_df.loc[:, 0]
matrix_df = mnist_df.drop(columns=[0])
matrix_df.head()
hand_df.head()
# Build the column names a1 .. a784, one per input column.
cols = ['a{}'.format(i) for i in range(1, 785)]
# Give the input columns string names so they can be matched to feature columns.
matrix_df.columns = cols

# NOTE: hand_df and matrix_df were derived before this truncation, so it does
# not limit the rows that are split and trained on below.
mnist_df = mnist_df.head(10000)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    matrix_df, hand_df, test_size=0.3, random_state=101)

# Fit the scaler on the training split only, then apply it to every frame.
# Previously the scaler was fitted on matrix_df *after* the split, so X_train
# and X_test reached the estimator unscaled and the scaled frame was unused.
scaler = MinMaxScaler().fit(X_train)
X_train = pd.DataFrame(data=scaler.transform(X_train),
                       columns=X_train.columns,
                       index=X_train.index)
X_test = pd.DataFrame(data=scaler.transform(X_test),
                      columns=X_test.columns,
                      index=X_test.index)
# Keep matrix_df rebound to its scaled form, as the original script did.
matrix_df = pd.DataFrame(data=scaler.transform(matrix_df),
                         columns=matrix_df.columns,
                         index=matrix_df.index)
# Replace the list of column names with one numeric feature column per name
# (a1 .. aN) so that `cols` can be passed directly as feature_columns.
cols = [
    tf.feature_column.numeric_column('a{}'.format(position))
    for position in range(1, len(cols) + 1)
]
matrix_df.head()
# Training input: shuffled batches of 10 drawn from the training split.
input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    batch_size=10,
    num_epochs=1000,
    shuffle=True,
)

# Adagrad optimizer wrapped with gradient-norm clipping at 5.0.
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(
    tf.train.AdagradOptimizer(learning_rate=0.03),
    5.0,
)

# Ten-class DNN with hidden layers of 32 and 64 units; only the latest
# checkpoint is kept.
run_config = tf.estimator.RunConfig(keep_checkpoint_max=1)
model = tf.estimator.DNNClassifier(
    feature_columns=cols,
    hidden_units=[32, 64],
    n_classes=10,
    optimizer=my_optimizer,
    config=run_config,
)
model.train(input_fn=input_func, steps=1000)
# Prediction input: a single unshuffled pass over the test split, 50 rows per batch.
predict_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    batch_size=50,
    num_epochs=1,
    shuffle=False,
)

# model.predict yields results lazily; collect them all into a list.
pred_gen = model.predict(input_fn=predict_input_func)
predictions = list(pred_gen)
predictions[0]