我参加了广泛和深度张量流教程,并在我自己的数据集中使用它。我基本上只改变了列名和张量。我想预测浮动值“valeur”。但我一直保持0准确度。有人在乎解释原因吗?
COLUMNS = ["idPCE", "typeObj", "heure", "typeG", "pas",
"qualite", "valeur"]
LABEL_COLUMN = "label"
CATEGORICAL_COLUMNS = ["idPCE", "typeG", "pas", "qualite"]
CONTINUOUS_COLUMNS = ["heure"]
def maybe_download(train_data, test_data):
"""Maybe downloads training data and returns train and test file names."""
if train_data:
train_file_name = train_data
else:
train_file = tempfile.NamedTemporaryFile(delete=False)
urllib.request.urlretrieve("http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data", train_file.name) # pylint: disable=line-too-long
train_file_name = train_file.name
train_file.close()
print("Training data is downloaded to %s" % train_file_name)
if test_data:
test_file_name = test_data
else:
test_file = tempfile.NamedTemporaryFile(delete=False)
urllib.request.urlretrieve("http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test", test_file.name) # pylint: disable=line-too-long
test_file_name = test_file.name
test_file.close()
print("Test data is downloaded to %s" % test_file_name)
return train_file_name, test_file_name
def build_estimator(model_dir, model_type):
"""Build an estimator."""
# Sparse base columns.
idPCE = tf.contrib.layers.sparse_column_with_hash_bucket("idPCE", hash_bucket_size=1000)
typeG = tf.contrib.layers.sparse_column_with_keys(column_name="typeG",
keys=["DENMOY","ENETER","ETHMOY","METMOY","PCSMOY","PREMOY","TEMMOY","VOLBAL","VOLBCP","VOLBCR","VOLCAL","VOLCCU","VOLTER"])
pas = tf.contrib.layers.sparse_column_with_keys(column_name="pas",
keys=["H","J"])
qualite = tf.contrib.layers.sparse_column_with_keys(column_name="qualite",
keys=["A","AA","AD","AF","CS","M"])
# Continuous base columns.
heure = tf.contrib.layers.real_valued_column("heure")
# Transformations.
heure_buckets = tf.contrib.layers.bucketized_column(heure,
boundaries=[
6, 12, 18
])
# Wide columns and deep columns.
wide_columns = [idPCE, typeG, pas,
qualite,heure_buckets,
tf.contrib.layers.crossed_column([typeG, qualite],
hash_bucket_size=int(1e4)),
tf.contrib.layers.crossed_column(
[heure_buckets, idPCE, pas],
hash_bucket_size=int(1e6)),
tf.contrib.layers.crossed_column([heure_buckets, qualite],
hash_bucket_size=int(1e4))]
deep_columns = [
tf.contrib.layers.embedding_column(qualite, dimension=3),
tf.contrib.layers.embedding_column(pas, dimension=1),
tf.contrib.layers.embedding_column(typeG, dimension=4),
tf.contrib.layers.embedding_column(idPCE,
dimension=3),
heure
]
if model_type == "wide":
m = tf.contrib.learn.LinearClassifier(model_dir=model_dir,
feature_columns=wide_columns)
elif model_type == "deep":
m = tf.contrib.learn.DNNClassifier(model_dir=model_dir,
feature_columns=deep_columns,
hidden_units=[100, 50])
else:
m = tf.contrib.learn.DNNLinearCombinedClassifier(
model_dir=model_dir,
linear_feature_columns=wide_columns,
dnn_feature_columns=deep_columns,
dnn_hidden_units=[100, 50],
fix_global_step_increment_bug=True)
return m
def input_fn(df):
"""Input builder function."""
# Creates a dictionary mapping from each continuous feature column name (k) to
# the values of that column stored in a constant Tensor.
continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}
# Creates a dictionary mapping from each categorical feature column name (k)
# to the values of that column stored in a tf.SparseTensor.
categorical_cols = {
k: tf.SparseTensor(
indices=[[i, 0] for i in range(df[k].size)],
values=df[k].values,
dense_shape=[df[k].size, 1])
for k in CATEGORICAL_COLUMNS}
# Merges the two dictionaries into one.
feature_cols = dict(continuous_cols)
feature_cols.update(categorical_cols)
# Converts the label column into a constant Tensor.
label = tf.constant(df[LABEL_COLUMN].values)
# Returns the feature columns and the label.
return feature_cols, label
def train_and_eval(model_dir, model_type, train_steps, train_data, test_data):
"""Train and evaluate the model."""
train_file_name, test_file_name = maybe_download(train_data, test_data)
df_train = pd.read_csv(
tf.gfile.Open(train_file_name),
sep=';',
names=COLUMNS,
skipinitialspace=True,
engine="python")
df_test = pd.read_csv(
tf.gfile.Open(test_file_name),
sep=';',
names=COLUMNS,
skipinitialspace=True,
engine="python")
# remove NaN elements
df_train = df_train.dropna(how='any', axis=0)
df_test = df_test.dropna(how='any', axis=0)
df_train[LABEL_COLUMN] = pd.to_numeric(df_train["valeur"])
df_test[LABEL_COLUMN] = pd.to_numeric(df_test["valeur"])
model_dir = tempfile.mkdtemp() if not model_dir else model_dir
print("model directory = %s" % model_dir)
m = build_estimator(model_dir, model_type)
m.fit(input_fn=lambda: input_fn(df_train), steps=train_steps)
results = m.evaluate(input_fn=lambda: input_fn(df_test), steps=1)
for key in sorted(results):
print("%s: %s" % (key, results[key]))
FLAGS = None
def main(_):
train_and_eval(FLAGS.model_dir, FLAGS.model_type, FLAGS.train_steps,
FLAGS.train_data, FLAGS.test_data)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument(
"--model_dir",
type=str,
default="",
help="Base directory for output models."
)
parser.add_argument(
"--model_type",
type=str,
default="wide_n_deep",
help="Valid model types: {'wide', 'deep', 'wide_n_deep'}."
)
parser.add_argument(
"--train_steps",
type=int,
default=200,
help="Number of training steps."
)
parser.add_argument(
"--train_data",
type=str,
default="",
help="Path to the training data."
)
parser.add_argument(
"--test_data",
type=str,
default="",
help="Path to the test data."
)
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
这是我的csv数据集上的一行(火车47000,测试10000):
NSVCUCOG; PCE; 05; DENMOY; H; AF; 0.619
输出:
accuracy: 0.0
accuracy/baseline_label_mean: 46.8802
accuracy/threshold_0.500000_mean: 0.0
auc: 0.5
global_step: 200
labels/actual_label_mean: 46.8802
labels/prediction_mean: 1.0
loss: -5.34063e+07
precision/positive_threshold_0.500000_mean: 0.998047
recall/positive_threshold_0.500000_mean: 1.0
答案 0 :(得分:0)
您的数据标签必须是范围,例如。 [0,1,2 ...]。
tf.contrib.learn.DNNLinearCombinedClassifier( n_classes = 5,
因为每个浮点数都被视为标签,默认类号为2。
这就是我用自己的经验解释的全部内容!