我正在使用 Kaggle Rossmann 数据集训练一个 Wide & Deep(宽深)模型。代码与 TensorFlow Wide & Deep 教程中给出的代码非常相似,我只是更换了用于建模的数据。
我正在使用的代码如下:
"""Example code for TensorFlow Wide & Deep Tutorial using TF.Learn API."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import shutil
import sys
import tempfile
import pandas as pd
from six.moves import urllib
import tensorflow as tf
# Column names, in file order, assigned to the CSV when it is loaded with
# pd.read_csv(names=CSV_COLUMNS).
CSV_COLUMNS = [
    'Store', 'DayOfWeek', 'Sales', 'Customers', 'Open', 'Promo',
    'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
    'CompetitionDistance', 'trend', 'Max_TemperatureC', 'Mean_TemperatureC',
    'Min_TemperatureC', 'Max_Humidity', 'Mean_Humidity', 'Min_Humidity',
]

# Categorical features with an explicit vocabulary.
StateHoliday = tf.feature_column.categorical_column_with_vocabulary_list(
    "StateHoliday", ["True", "False"])
StoreType = tf.feature_column.categorical_column_with_vocabulary_list(
    "StoreType", ['c', 'a', 'd', 'b'])
Assortment = tf.feature_column.categorical_column_with_vocabulary_list(
    "Assortment", ['c', 'a', 'b'])

# Categorical features hashed into 1000 buckets each.
CompetitionDistance = tf.feature_column.categorical_column_with_hash_bucket(
    "CompetitionDistance", hash_bucket_size=1000)
Customers = tf.feature_column.categorical_column_with_hash_bucket(
    "Customers", hash_bucket_size=1000)
Store = tf.feature_column.categorical_column_with_hash_bucket(
    "Store", hash_bucket_size=1000)

# Continuous features.
trend = tf.feature_column.numeric_column("trend")
Max_TemperatureC = tf.feature_column.numeric_column("Max_TemperatureC")
Mean_TemperatureC = tf.feature_column.numeric_column("Mean_TemperatureC")
Min_TemperatureC = tf.feature_column.numeric_column("Min_TemperatureC")
Max_Humidity = tf.feature_column.numeric_column("Max_Humidity")
Mean_Humidity = tf.feature_column.numeric_column("Mean_Humidity")
Min_Humidity = tf.feature_column.numeric_column("Min_Humidity")

# Columns for the linear ("wide") part: a single Assortment x StoreType cross.
crossed_columns = [
    tf.feature_column.crossed_column(
        ["Assortment", "StoreType"], hash_bucket_size=1000),
]

# Columns for the DNN ("deep") part. The indicator/embedding wrappers below
# are given feature names (strings) as their argument.
deep_columns = [
    tf.feature_column.indicator_column("DayOfWeek"),
    tf.feature_column.indicator_column("Open"),
    tf.feature_column.indicator_column("Promo"),
    tf.feature_column.indicator_column("StateHoliday"),
    tf.feature_column.indicator_column("SchoolHoliday"),
    tf.feature_column.indicator_column("StoreType"),
    tf.feature_column.indicator_column("Assortment"),
    # Embedding examples, 8 dimensions each.
    tf.feature_column.embedding_column("CompetitionDistance", dimension=8),
    tf.feature_column.embedding_column("Customers", dimension=8),
    tf.feature_column.embedding_column("Store", dimension=8),
    trend,
    Max_TemperatureC,
    Mean_TemperatureC,
    Min_TemperatureC,
    Max_Humidity,
    Mean_Humidity,
    Min_Humidity,
]
def build_estimator(model_dir):
    """Build the combined linear + DNN classifier.

    Args:
        model_dir: Directory passed to the estimator as its ``model_dir``.

    Returns:
        A ``tf.estimator.DNNLinearCombinedClassifier`` whose linear part uses
        ``crossed_columns`` and whose DNN part uses ``deep_columns`` with two
        hidden layers of 100 and 50 units.
    """
    return tf.estimator.DNNLinearCombinedClassifier(
        model_dir=model_dir,
        linear_feature_columns=crossed_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=[100, 50])
def input_fn(data_file, num_epochs, shuffle):
    """Create an estimator input function from a Rossmann CSV file.

    Args:
        data_file: Base name of the CSV file (without the ``.csv`` suffix)
            located under ``D:/Rossmann/Rossmann_Data/``.
        num_epochs: Forwarded to ``pandas_input_fn`` as ``num_epochs``.
        shuffle: Forwarded to ``pandas_input_fn`` as ``shuffle``.

    Returns:
        The callable produced by ``tf.estimator.inputs.pandas_input_fn``,
        yielding batches of 100 rows with a binary label that is 1 when
        ``Sales >= 20000`` and 0 otherwise.
    """
    df_data = pd.read_csv(
        "D:/Rossmann/Rossmann_Data/" + data_file + ".csv",
        names=CSV_COLUMNS,
        skipinitialspace=True,
        engine="python",
        skiprows=1)  # the first row is skipped; names come from CSV_COLUMNS
    # Remove rows containing any NaN element.
    df_data = df_data.dropna(how="any", axis=0)
    print(df_data.dtypes)
    # NOTE(review): DataFrame.sort is deprecated and missing from recent
    # pandas releases; DataFrame.sort_values is the replacement.
    df_data = df_data.sort(['Sales'], ascending=[True])
    labels = df_data["Sales"].apply(lambda x: 1 if x >= 20000 else 0)
    return tf.estimator.inputs.pandas_input_fn(
        x=df_data,
        y=labels,
        batch_size=100,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=5)
# Build the estimator (model_dir is passed through as its model directory)
# and train it for 2000 steps on the "df1" data, shuffled, with no epoch limit.
model_dir = "D:/Rossmann/Rossmann_Data"
m = build_estimator(model_dir)
m.train(
input_fn=input_fn("df1", num_epochs=None, shuffle=True),
steps=2000)
但不幸的是我收到了以下错误。
Traceback (most recent call last):
File "timeSeriesPredictionUsingEmbedding2.py", line 121, in <module>
steps=2000)
File "C:\Program Files\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 241, in train
loss = self._train_model(input_fn=input_fn, hooks=hooks)
File "C:\Program Files\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 630, in _train_model
model_fn_lib.ModeKeys.TRAIN)
File "C:\Program Files\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 615, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "C:\Program Files\Anaconda3\lib\site-packages\tensorflow\python\estimator\canned\dnn_linear_combined.py", line 395, in _model_fn
config=config)
File "C:\Program Files\Anaconda3\lib\site-packages\tensorflow\python\estimator\canned\dnn_linear_combined.py", line 156, in _dnn_linear_combined_model_fn
feature_columns=dnn_feature_columns)
File "C:\Program Files\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py", line 207, in input_layer
_check_feature_columns(feature_columns)
File "C:\Program Files\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py", line 1662, in _check_feature_columns
if column.name in name_to_column:
File "C:\Program Files\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py", line 2453, in name
return '{}_indicator'.format(self.categorical_column.name)
AttributeError: 'str' object has no attribute 'name'
你能帮我指出这个错误出在哪里吗?当我运行你的代码时,它运行得很好。
谢谢!
答案 0 :(得分:1)
原因是 `tf.feature_column.indicator_column` 接受的是分类列(categorical column)实例,而不是列名(“DayOfWeek”,“Open”等)。这个 API 在之前的 TF 版本中可能有所不同,我不确定;但现在你必须先创建一个 `categorical_column_*`,然后再用 `indicator_column` 把它包装起来(`embedding_column` 同理)。
顺便说一句,我发现你使用的是 `DataFrame.sort`——此方法已被弃用,在最新的 pandas 中已不再可用。请改用 `sort_values`。
**更新**
我之前没有注意到这段代码改编自教程示例(tutorial example),这就是它刻意使用了所有可能的特征类型(哈希、嵌入、交叉列)的原因。通常不必一次把这些全部用上,Rossmann 数据尤其不需要。如果你注意到数据中的特征之间存在相关性,之后可以再添加(例如)交叉列;但作为起步,这些数据大部分用 `numeric_column`、少数用 `categorical_column_with_vocabulary_list` 即可。
以下是此代码的完整版本:
"""Example code for TensorFlow Wide & Deep Tutorial using TF.Learn API."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pandas as pd
import tensorflow as tf
# Column names, in file order, assigned to the CSV when it is loaded with
# pd.read_csv(names=CSV_COLUMNS).
CSV_COLUMNS = [
    'Store', 'DayOfWeek', 'Sales', 'Customers', 'Open', 'Promo',
    'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
    'CompetitionDistance', 'trend', 'Max_TemperatureC', 'Mean_TemperatureC',
    'Min_TemperatureC', 'Max_Humidity', 'Mean_Humidity', 'Min_Humidity',
]

# Store / calendar features; all numeric except the three vocabulary-based
# categorical columns (StateHoliday, StoreType, Assortment).
Store = tf.feature_column.numeric_column("Store")
DayOfWeek = tf.feature_column.numeric_column("DayOfWeek")
Customers = tf.feature_column.numeric_column("Customers")
Open = tf.feature_column.numeric_column("Open")
Promo = tf.feature_column.numeric_column("Promo")
StateHoliday = tf.feature_column.categorical_column_with_vocabulary_list(
    "StateHoliday", ["True", "False"])
SchoolHoliday = tf.feature_column.numeric_column("SchoolHoliday")
StoreType = tf.feature_column.categorical_column_with_vocabulary_list(
    "StoreType", ['a', 'b', 'c', 'd'])
Assortment = tf.feature_column.categorical_column_with_vocabulary_list(
    "Assortment", ['a', 'b', 'c'])
CompetitionDistance = tf.feature_column.numeric_column("CompetitionDistance")

# Trend and weather features, all numeric.
trend = tf.feature_column.numeric_column("trend")
Max_TemperatureC = tf.feature_column.numeric_column("Max_TemperatureC")
Mean_TemperatureC = tf.feature_column.numeric_column("Mean_TemperatureC")
Min_TemperatureC = tf.feature_column.numeric_column("Min_TemperatureC")
Max_Humidity = tf.feature_column.numeric_column("Max_Humidity")
Mean_Humidity = tf.feature_column.numeric_column("Mean_Humidity")
Min_Humidity = tf.feature_column.numeric_column("Min_Humidity")

# Columns for the DNN part. Categorical columns are wrapped in
# indicator_column, which takes the column object rather than its name.
deep_columns = [
    Store,
    DayOfWeek,
    Customers,
    Open,
    Promo,
    tf.feature_column.indicator_column(StateHoliday),
    SchoolHoliday,
    tf.feature_column.indicator_column(StoreType),
    tf.feature_column.indicator_column(Assortment),
    CompetitionDistance,
    trend,
    Max_TemperatureC,
    Mean_TemperatureC,
    Min_TemperatureC,
    Max_Humidity,
    Mean_Humidity,
    Min_Humidity,
]
def build_estimator(model_dir):
    """Build the classifier used for training.

    Args:
        model_dir: Directory passed to the estimator as its ``model_dir``.

    Returns:
        A ``tf.estimator.DNNLinearCombinedClassifier`` configured with
        ``deep_columns`` for the DNN part (no linear columns) and two hidden
        layers of 100 and 50 units.
    """
    return tf.estimator.DNNLinearCombinedClassifier(
        model_dir=model_dir,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=[100, 50])
def input_fn(data_file, num_epochs, shuffle):
    """Create an estimator input function from a Rossmann CSV file.

    Args:
        data_file: Path of the CSV file without the ``.csv`` suffix.
        num_epochs: Forwarded to ``pandas_input_fn`` as ``num_epochs``.
        shuffle: Forwarded to ``pandas_input_fn`` as ``shuffle``.

    Returns:
        The callable produced by ``tf.estimator.inputs.pandas_input_fn``,
        yielding batches of 100 rows with a binary label that is 1 when
        ``Sales >= 20000`` and 0 otherwise.
    """
    df_data = pd.read_csv(
        data_file + ".csv",
        names=CSV_COLUMNS,
        # Read StateHoliday as text so it can match the string vocabulary
        # ("True" / "False") of its feature column.
        dtype={"StateHoliday": str},
        skipinitialspace=True,
        engine="python",
        skiprows=1)  # the first row is skipped; names come from CSV_COLUMNS
    # Remove rows containing any NaN element.
    df_data = df_data.dropna(how="any", axis=0)
    df_data = df_data.sort_values(['Sales'], ascending=[True])
    # Vectorized threshold instead of a per-row Python lambda.
    labels = (df_data["Sales"] >= 20000).astype("int64")
    return tf.estimator.inputs.pandas_input_fn(
        x=df_data,
        y=labels,
        batch_size=100,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=5)
# Build the estimator with "./model" as its model directory, then train it
# for 2000 steps on shuffled "df1" data with no epoch limit.
m = build_estimator(model_dir="./model")
train_input_fn = input_fn("df1", num_epochs=None, shuffle=True)
m.train(input_fn=train_input_fn, steps=2000)