我收到此错误:
ValueError: Items of feature_columns must be a _FeatureColumn. Given (type <class 'pandas.core.indexes.base.Index'>): Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited'], dtype='object').
我正在使用 TensorFlow 库。我想获得预测结果,但无法运行 m.train(input_fn=get_input_fn, steps=5000) 这行代码。
无论我怎么做,总是得到同样的错误。我尝试过下面这两个输入函数,但结果没有任何变化。
def input_fn_train():
    """Build constant tensors for the training features and labels.

    Reads the module-level ``df_train`` frame and the ``LABEL`` column name.

    Returns:
        A ``(features, labels)`` pair.

    NOTE(review): ``features`` is a one-element tuple wrapping the tensor
    (the original statement ended with a trailing comma), and it is built
    from the whole frame, label column included -- confirm this is intended.
    """
    features = (tf.constant(df_train.astype(np.float64)),)
    labels = tf.constant(df_train[LABEL].astype(np.float64))
    return features, labels
和
def get_input_fn(data_set, num_epochs=None, shuffle=False):
    """Return a pandas-backed input function built from ``data_set``.

    Args:
        data_set: DataFrame whose columns are all passed as features.
        num_epochs: Forwarded to ``pandas_input_fn``.
        shuffle: Forwarded to ``pandas_input_fn``.

    Returns:
        The callable created by ``tf.estimator.inputs.pandas_input_fn``;
        labels are taken from the ``LABEL`` column of ``data_set``.
    """
    features = pd.DataFrame(
        {name: data_set[name].values for name in data_set.columns})
    labels = pd.Series(data_set[LABEL].values)
    return tf.estimator.inputs.pandas_input_fn(
        x=features,
        y=labels,
        num_epochs=num_epochs,
        shuffle=shuffle)
我不明白应该怎么做。这个错误是什么意思?我一直在谷歌上搜索,但没有找到有用的信息。我该如何处理这个错误?代码如下。谢谢!
import pandas as pd
import tensorflow as tf
import numpy as np
import tempfile
# Full list of column names for the data set.
COLS = [
    "RowNumber", "CustomerId", "Surname", "CreditScore", "Geography",
    "Gender", "Age", "Tenure", "Balance", "NumOfProducts", "HasCrCard",
    "IsActiveMember", "EstimatedSalary", "Exited",
]

# Columns used as model inputs.
FEATURES = [
    "CreditScore", "Age", "Tenure", "Balance", "NumOfProducts",
    "HasCrCard", "IsActiveMember", "EstimatedSalary",
]

# Name of the target column.
LABEL = "Exited"
# The training and test frames are both read from the same CSV file.
df_train = pd.read_csv("Churn_Modelling.csv", skipinitialspace=True, header=0)
df_test = pd.read_csv("Churn_Modelling.csv", skipinitialspace=True, header=0)

# Keep a float copy of the test labels.
test_label = df_test[LABEL].astype(float)

# Test frame: remove the identifier columns (Geography and Gender stay).
df_test.drop(["Surname", "RowNumber", "CustomerId"], axis=1, inplace=True)

# Training frame: remove the identifier columns plus Geography and Gender.
df_train.drop(
    ["CustomerId", "Surname", "RowNumber", "Geography", "Gender"],
    axis=1,
    inplace=True)
def get_input_fn():
    """Return the training data as a dict of two constant float32 tensors.

    ``'x'`` is built from the ``FEATURES`` columns of ``df_train`` and ``'y'``
    from its ``LABEL`` column; both are created with ``df_train.shape`` as the
    requested shape.

    NOTE(review): an earlier ``get_input_fn`` taking
    ``(data_set, num_epochs, shuffle)`` is shown in the question, and later
    calls in this script pass those arguments -- confirm which definition is
    meant to be in effect.
    """
    frame_shape = df_train.shape
    feature_values = df_train[FEATURES].as_matrix()
    label_values = df_train[LABEL].as_matrix()
    return {
        'x': tf.constant(feature_values, tf.float32, frame_shape),
        'y': tf.constant(label_values, tf.float32, frame_shape),
    }
# Keep only the columns of the training frame whose dtype is not object.
df = df_train.select_dtypes(exclude=['object'])
numeric_cols = df.columns

# NOTE(review): feature_columns receives a list holding the pandas column
# Index itself; this is the call the reported ValueError refers to.
# NOTE(review): model_dir is not assigned anywhere in the code shown.
m = tf.estimator.LinearClassifier(
    model_dir=model_dir,
    feature_columns=[numeric_cols])
m.train(input_fn=get_input_fn, steps=5000)

# Evaluate on the test frame with a single pass and no shuffling.
results = m.evaluate(
    input_fn=get_input_fn(df_test, num_epochs=1, shuffle=False),
    steps=None)

# Materialize the predictions for the test frame.
y = m.predict(input_fn=get_input_fn(df_test, num_epochs=1, shuffle=False))
pred = list(y)

# NOTE(review): ``i`` is an element of ``pred`` yet is used as an index
# into ``pred`` -- confirm whether ``str(i)`` was intended.
rowNumber = 0
for i in pred:
    print(str(rowNumber)+': '+str(pred[i]))
    rowNumber += 1
答案 0 :(得分:5)
你的第一个错误在于创建 tf.estimator.LinearClassifier 的方式。你把数据框的列索引 df.columns 传给了 feature_columns,但这里应该传入一个由 TensorFlow 特征列(feature column)组成的列表。每个特征列都要声明它是数值型还是类别型;如果是类别型,还要指定编码方式。
其次,既然你读取的是 pandas 数据帧,输入函数可以大大简化:直接使用 tf.estimator.inputs.pandas_input_fn 即可。
你的 .csv 文件很可能和我的不一样——我自己造了一个带有若干数值的示例文件。下面是读取输入并正确拟合模型的一种方法:
import pandas as pd
import tensorflow as tf
# Column names assigned to the CSV when it is read; "Exited" holds the labels.
FEATURES = [
    "CreditScore", "Age", "Tenure", "Balance", "NumOfProducts",
    "HasCrCard", "IsActiveMember", "EstimatedSalary", "Exited",
]

# Numeric inputs.
credit_score = tf.feature_column.numeric_column("CreditScore")
age = tf.feature_column.numeric_column("Age")
tenure = tf.feature_column.numeric_column("Tenure")
balance = tf.feature_column.numeric_column("Balance")
num_of_products = tf.feature_column.numeric_column("NumOfProducts")
estimated_salary = tf.feature_column.numeric_column("EstimatedSalary")

# Categorical inputs: these columns are read as strings and mapped through a
# fixed vocabulary.
# NOTE(review): the vocabulary has to match the strings that actually occur
# in the CSV -- confirm against the real file.
has_card = tf.feature_column.categorical_column_with_vocabulary_list(
    "HasCrCard", ["True", "False"])
is_active_member = tf.feature_column.categorical_column_with_vocabulary_list(
    "IsActiveMember", ["True", "False"])

# Everything handed to the estimator (the label column is not included).
feature_columns = [
    credit_score, age, tenure, balance, num_of_products,
    has_card, is_active_member, estimated_salary,
]
def input_fn(num_epochs=None, shuffle=True, batch_size=100):
    """Read ``Churn_Modelling.csv`` and wrap it in a pandas input function.

    Args:
        num_epochs: Forwarded to ``pandas_input_fn``.
        shuffle: Forwarded to ``pandas_input_fn``.
        batch_size: Forwarded to ``pandas_input_fn``.

    Returns:
        The callable created by ``tf.estimator.inputs.pandas_input_fn``. The
        whole frame is passed as features and its ``"Exited"`` column as
        labels.
    """
    frame = pd.read_csv(
        'Churn_Modelling.csv',
        names=FEATURES,
        dtype={'HasCrCard': str, 'IsActiveMember': str},
        skipinitialspace=True,
        header=0)
    # Discard every row that contains a missing value.
    frame = frame.dropna(how='any', axis=0)
    return tf.estimator.inputs.pandas_input_fn(
        x=frame,
        y=frame["Exited"],
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=5)
# Build the linear classifier over the declared feature columns and train it
# for 100 steps using the default input function settings.
model = tf.estimator.LinearClassifier(
    model_dir=None,
    feature_columns=feature_columns)
model.train(input_fn=input_fn(), steps=100)
答案 1 :(得分:1)
下面的代码可以正常运行。
import pandas as pd
import tensorflow as tf
import tempfile
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
def split_data(data, rate, label):
    """Split ``data`` into train/test feature frames and label series.

    Args:
        data: DataFrame containing the feature columns and the ``label``
            column.
        rate: Passed to ``train_test_split`` as ``test_size``.
        label: Name of the label column.

    Returns:
        ``(train_data, train_label, test_data, test_label)``; the label
        column is removed from both feature frames.
    """
    # Rows with any missing value are discarded before splitting.
    data = data.dropna()
    train_data, test_data = train_test_split(data, test_size=rate)
    train_label = train_data[label]
    test_label = test_data[label]
    # ``axis`` is passed by keyword: pandas 2.0 no longer accepts it as a
    # positional argument of ``DataFrame.drop``.
    train_data = train_data.drop(label, axis=1)
    test_data = test_data.drop(label, axis=1)
    return train_data, train_label, test_data, test_label
# Name of the target column.
LABEL = "Exited"

data = pd.read_csv("Churn_Modelling.csv", skipinitialspace=True, header=0)

# Drop the identifier columns together with Geography and Gender.
data.drop(
    ["Surname", "RowNumber", "CustomerId", "Geography", "Gender"],
    axis=1,
    inplace=True)

# 0.20 is forwarded as the test-set fraction.
x_train, y_train, x_test, y_test = split_data(data, 0.20, LABEL)
def get_input_fn_train():
    """Return an unshuffled pandas input function over the training split.

    Uses the module-level ``x_train`` (features) and ``y_train`` (labels).
    """
    return tf.estimator.inputs.pandas_input_fn(
        x=x_train, y=y_train, shuffle=False)
def get_input_fn_test():
    """Return an unshuffled pandas input function over the test split.

    Uses the module-level ``x_test`` (features) and ``y_test`` (labels).
    """
    return tf.estimator.inputs.pandas_input_fn(
        x=x_test, y=y_test, shuffle=False)
# Infer the real-valued feature columns from the training input function.
# The opening parenthesis must sit on the same line as the function name:
# with the argument list on a line of its own, ``feature_columns`` would be
# bound to the function object and the call would never happen.
feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input_fn(
    get_input_fn_train())

# Checkpoints go into a freshly created temporary directory.
model_dir = tempfile.mkdtemp()
m = tf.estimator.LinearClassifier(
    model_dir=model_dir,
    feature_columns=feature_columns)
# Train on the training split.
m.train(input_fn=get_input_fn_train(), steps=5000)

# Evaluate on the test split. The result is keyed by metric name, e.g.
# accuracy, accuracy_baseline, auc, auc_precision_recall, average_loss,
# global_step, label/mean, loss, prediction/mean.
results = m.evaluate(input_fn=get_input_fn_test(), steps=None)

print("model directory = %s" % model_dir)
# Print the metrics in alphabetical order of their names.
for key in sorted(results.keys()):
    print("%s: %s" % (key, results[key]))
# Run prediction on the test split and materialize the results.
y = m.predict(input_fn=get_input_fn_test())
predictions = list(y)

# Tabulate the predictions and isolate the 'class_ids' column.
pred1 = pd.DataFrame(data=predictions)
prediction = pd.DataFrame(data=pred1['class_ids'])

# Each 'class_ids' entry is indexable; keep its first element.
pred = [row[0] for row in prediction["class_ids"]]

# Print one "<row number>: <predicted class>" line per test row.
rowNumber = 0
for i in pred:
    print(str(rowNumber) + ': ' + str(i))
    rowNumber += 1
def calculate(prediction, LABEL):
    """Compute classification metrics for predictions against true labels.

    Args:
        prediction: Predicted class labels.
        LABEL: Ground-truth class labels.

    Returns:
        A dict with the keys ``"accuracy"``, ``"report"``,
        ``"Confusion_Matrix"``, ``"F1 score"``, ``"Recall Score"`` and
        ``"cohen_kappa"``.
    """
    # scikit-learn metrics take (y_true, y_pred) in that order. Recall, the
    # classification report and the confusion matrix are not symmetric, so
    # passing the predictions first would swap recall with precision and
    # transpose the confusion matrix.
    y_true, y_pred = LABEL, prediction
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "report": classification_report(y_true, y_pred),
        "Confusion_Matrix": confusion_matrix(y_true, y_pred),
        "F1 score": f1_score(y_true, y_pred),
        "Recall Score": recall_score(y_true, y_pred),
        "cohen_kappa": cohen_kappa_score(y_true, y_pred),
    }
# Wrap the predicted class ids in a frame and print the metrics computed
# against the test labels.
pred2 = pd.DataFrame(data=pred)
rounded_predictions = pred2.round()
print(calculate(rounded_predictions, y_test))
答案 2 :(得分:0)
我将对 @Maxim 的答案做一些小改动(顺便说一句,谢谢),并发布一个使用随机 numpy 数据的最小可运行示例。它在我的 Windows 计算机上运行良好。请注意,由于我使用的是特定的硬件,我通过 TF_CPP_MIN_LOG_LEVEL 屏蔽了警告信息。
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import pandas as pd
import numpy as np
import tensorflow as tf
# Column names of the generated frame; "Exited" is used as the label column.
FEATURES = [
    "CreditScore", "Age", "Tenure", "Balance", "NumOfProducts",
    "EstimatedSalary", "Exited",
]

# One numeric feature column per model input.
credit_score = tf.feature_column.numeric_column("CreditScore")
age = tf.feature_column.numeric_column("Age")
tenure = tf.feature_column.numeric_column("Tenure")
balance = tf.feature_column.numeric_column("Balance")
num_of_products = tf.feature_column.numeric_column("NumOfProducts")
estimated_salary = tf.feature_column.numeric_column("EstimatedSalary")

# Everything handed to the estimator (the label column is not included).
feature_columns = [
    credit_score, age, tenure, balance, num_of_products, estimated_salary,
]
def input_fn(num_epochs=None, shuffle=True, batch_size=100):
    """Build a pandas input function over randomly generated data.

    A frame of 5000 rows filled by ``np.random.rand`` is created with one
    column per name in ``FEATURES``; its ``"Exited"`` column is used as the
    labels.

    Args:
        num_epochs: Forwarded to ``pandas_input_fn``.
        shuffle: Forwarded to ``pandas_input_fn``.
        batch_size: Forwarded to ``pandas_input_fn``.

    Returns:
        The callable created by ``tf.estimator.inputs.pandas_input_fn``.

    NOTE(review): the labels are continuous random values rather than 0/1
    classes -- presumably fine for a smoke test; confirm before reusing.
    """
    n_features = len(FEATURES)
    print(n_features)
    n_examples = 5000
    values = np.random.rand(n_examples, n_features)
    # The unused separate label array and the placeholder column names that
    # were immediately overwritten have been removed.
    df = pd.DataFrame(data=values, columns=FEATURES)
    labels = df["Exited"]
    return tf.estimator.inputs.pandas_input_fn(
        x=df,
        y=labels,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=5)
# Build the linear classifier (checkpoints go to the 'model_dir' directory)
# and train it for 100 steps on the generated data.
model = tf.estimator.LinearClassifier(
    model_dir='model_dir',
    feature_columns=feature_columns)
model.train(input_fn=input_fn(), steps=100)