我在Kaggle上使用Titanic数据集的二进制分类。这是代码:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
ROOT = "data"
TEST = "test.csv"
TRAIN = "train.csv"
rate = 0.01
epochs = 100
errors = []
def load_data():
train = pd.read_csv(os.path.join(ROOT, TRAIN))
test = pd.read_csv(os.path.join(ROOT, TEST))
return [train, test]
train, test = load_data()
y_train = train['Survived'].values.reshape((891, 1))
x_train = train.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin', 'Embarked', 'Sex'], 1)
x_train['temp'] = y_train
x_train = x_train.dropna(axis=0, how='any')
y_train = x_train['temp'].values.reshape((714, 1))
x_train = x_train.drop(['temp'], 1)
x_test = test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked', 'Sex'], 1)
x_test = x_train.dropna(axis=0, how='any')
theta = tf.Variable(tf.zeros([len(x_train.columns), 1]))
X = tf.placeholder(tf.float32, shape=[None, len(x_train.columns)])
y = tf.placeholder(tf.float32, shape=[None, 1])
b = tf.Variable(0.0)
model = tf.sigmoid(tf.matmul(X, theta) + b)
cost = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=y,logits=model))
optimizer = tf.train.GradientDescentOptimizer(rate).minimize(cost)
init = [tf.global_variables_initializer(), tf.local_variables_initializer()]
with tf.Session() as sess:
sess.run(init)
for e in range(epochs):
sess.run(optimizer, feed_dict={X:x_train, y:y_train})
loss = sess.run(cost, feed_dict={X:x_train, y:y_train})
print("cost at step", e, loss)
errors.append(loss)
theta = np.array(sess.run(theta))
b = np.array(sess.run(b))
我开始清理数据(我知道我可以使用数字表示,但我试图让自己变得容易)并将其分成测试和训练。我使用了sigmoid成本函数,但由于某种原因,成本在100个时代之后永远不会改变。我不知道问题是我的成本函数还是别的。