I have a Flask web application that needs to load a scikit-learn model. The model is created by a function called build_model and saved to a pickle file called classifier1.pkl by a function called save_model. Both functions are in a Python file called train_classifier.py. Here is the content of this Python file:
# import .....

def load_data(database_filepath):
    '''
    Load the cleaned dataset.
    ......
    '''

def tokenize(text):
    '''
    Convert given text into tokens.
    '''
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens
def build_model():
    '''
    Construct a scikit-learn pipeline and use GridSearchCV to
    tune the pipeline's hyperparameters.
    return:
        model: The scikit-learn pipeline model.
    '''
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])

    parameters = {
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (None, 5000),
        'tfidf__use_idf': (True, False),
        'clf__estimator__n_estimators': [10, 20],
        'clf__estimator__min_samples_split': [2, 3]
    }

    model = GridSearchCV(pipeline, param_grid=parameters,
                         verbose=2, return_train_score=False, n_jobs=5)

    return model
def evaluate_model(model, X_test, Y_test, category_names):
    '''
    Use the model to perform predictions.
    input:
        model: Model used to perform predictions.
        X_test: Test messages.
        Y_test: True values of the categories for the corresponding messages.
        category_names: The name of each category.
    '''
    Y_pred = model.predict(X_test)
    print(classification_report(Y_test, Y_pred, target_names=category_names))
def save_model(model, model_filepath):
    '''
    Save the model in a pickle file.
    input:
        model: Model to be saved.
        model_filepath: The file path of the saved model.
    '''
    with open(model_filepath, 'wb') as f:
        pickle.dump(model, f)
def main():
    if len(sys.argv) == 3:
        database_filepath, model_filepath = sys.argv[1:]
        print('Loading data...\n DATABASE: {}'.format(database_filepath))
        X, Y, category_names = load_data(database_filepath)
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2)

        print('Building model...')
        model = build_model()

        print('Training model...')
        model.fit(X_train, Y_train)

        print('Evaluating model...')
        evaluate_model(model, X_test, Y_test, category_names)

        print('Saving model...\n MODEL: {}'.format(model_filepath))
        save_model(model, model_filepath)

        print('Trained model saved!')
    else:
        print('Please provide the filepath of the disaster messages database '
              'as the first argument and the filepath of the pickle file to '
              'save the model to as the second argument. \n\nExample: python '
              'train_classifier.py ../data/DisasterResponse.db classifier.pkl')


if __name__ == '__main__':
    main()
The model constructed by the build_model() function is a scikit-learn Pipeline object whose CountVectorizer uses the tokenize(text) function as its tokenizer. The model is then trained, evaluated, and finally saved to a pickle file.
The problem is that when my application script loads the classifier1.pkl file on Heroku, it raises an AttributeError. I tested the code in my local console and it ran without errors.
Below is the log from the Heroku platform.
Traceback (most recent call last):
2018-12-29T12:14:09.796947+00:00 app[web.1]: File "/app/.heroku/python/lib/python3.6/site-packages/gunicorn/arbiter.py", line 583, in spawn_worker
2018-12-29T12:14:09.796949+00:00 app[web.1]: worker.init_process()
2018-12-29T12:14:09.796951+00:00 app[web.1]: File "/app/.heroku/python/lib/python3.6/site-packages/gunicorn/workers/base.py", line 129, in init_process
2018-12-29T12:14:09.796952+00:00 app[web.1]: self.load_wsgi()
2018-12-29T12:14:09.796954+00:00 app[web.1]: File "/app/.heroku/python/lib/python3.6/site-packages/gunicorn/workers/base.py", line 138, in load_wsgi
2018-12-29T12:14:09.796955+00:00 app[web.1]: self.wsgi = self.app.wsgi()
2018-12-29T12:14:09.796958+00:00 app[web.1]: File "/app/.heroku/python/lib/python3.6/site-packages/gunicorn/app/base.py", line 67, in wsgi
2018-12-29T12:14:09.796960+00:00 app[web.1]: self.callable = self.load()
2018-12-29T12:14:09.796961+00:00 app[web.1]: File "/app/.heroku/python/lib/python3.6/site-packages/gunicorn/app/wsgiapp.py", line 52, in load
2018-12-29T12:14:09.796963+00:00 app[web.1]: return self.load_wsgiapp()
2018-12-29T12:14:09.796965+00:00 app[web.1]: File "/app/.heroku/python/lib/python3.6/site-packages/gunicorn/app/wsgiapp.py", line 41, in load_wsgiapp
2018-12-29T12:14:09.796967+00:00 app[web.1]: return util.import_app(self.app_uri)
2018-12-29T12:14:09.796969+00:00 app[web.1]: File "/app/.heroku/python/lib/python3.6/site-packages/gunicorn/util.py", line 350, in import_app
2018-12-29T12:14:09.796970+00:00 app[web.1]: __import__(module)
2018-12-29T12:14:09.796972+00:00 app[web.1]: File "/app/app/application.py", line 37, in <module>
2018-12-29T12:14:09.796974+00:00 app[web.1]: model = joblib.load("models/classifier1.pkl")
2018-12-29T12:14:09.796976+00:00 app[web.1]: File "/app/.heroku/python/lib/python3.6/site-packages/sklearn/externals/joblib/numpy_pickle.py", line 598, in load
2018-12-29T12:14:09.796977+00:00 app[web.1]: obj = _unpickle(fobj, filename, mmap_mode)
2018-12-29T12:14:09.796979+00:00 app[web.1]: File "/app/.heroku/python/lib/python3.6/site-packages/sklearn/externals/joblib/numpy_pickle.py", line 526, in _unpickle
2018-12-29T12:14:09.796980+00:00 app[web.1]: obj = unpickler.load()
2018-12-29T12:14:09.796982+00:00 app[web.1]: File "/app/.heroku/python/lib/python3.6/pickle.py", line 1050, in load
2018-12-29T12:14:09.796984+00:00 app[web.1]: dispatch[key[0]](self)
2018-12-29T12:14:09.796986+00:00 app[web.1]: File "/app/.heroku/python/lib/python3.6/pickle.py", line 1338, in load_global
2018-12-29T12:14:09.796987+00:00 app[web.1]: klass = self.find_class(module, name)
2018-12-29T12:14:09.796989+00:00 app[web.1]: File "/app/.heroku/python/lib/python3.6/pickle.py", line 1392, in find_class
2018-12-29T12:14:09.796990+00:00 app[web.1]: return getattr(sys.modules[module], name)
2018-12-29T12:14:09.797163+00:00 app[web.1]: AttributeError: module '__main__' has no attribute 'tokenize'
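My understanding of the error (I may be wrong here) is that pickle stores a function by reference, i.e. as a module name plus an attribute name. Since tokenize is defined at the top level of train_classifier.py, which is run directly as a script, the reference gets recorded as __main__.tokenize, and whatever process later unpickles the file needs a tokenize attribute on its own __main__ module. A minimal standalone sketch of that behaviour (none of this is part of my project; the function body is just a placeholder):

# sketch.py -- run directly, so this file is the __main__ module
import pickle
import pickletools

def tokenize(text):
    # placeholder tokenizer defined at the top level of the script,
    # so its recorded module is '__main__'
    return text.split()

payload = pickle.dumps(tokenize)
pickletools.dis(payload)
# The disassembly contains a global reference naming '__main__' and 'tokenize'
# (a GLOBAL or STACK_GLOBAL opcode, depending on the pickle protocol), so any
# process that unpickles this payload needs sys.modules['__main__'].tokenize
# to exist -- which a gunicorn worker does not have.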
The content of my application script is:
#import ...

app = Flask(__name__)

def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)
    return clean_tokens

# load data
engine = create_engine('sqlite:///data/DisasterResponse.db')
df = pd.read_sql_table('ResponseCategory', engine)

# load model
with open("models/classifier1.pkl", 'rb') as f:
    model = pickle.load(f)

@app.route('/')
@app.route('/index')
def index():
    # extract data needed for visuals
    genre_counts = df.groupby('genre').count()['message']
    genre_names = list(genre_counts.index)

    cate_counts = df[df.columns[-36:]].sum()
    cate_names = list(df.columns[-36:])

    # create visuals
    graphs = [
        {
            'data': [
                Bar(
                    x=genre_names,
                    y=genre_counts
                )
            ],
            'layout': {
                'title': 'Distribution of Message Genres',
                'yaxis': {
                    'title': "Count"
                },
                'xaxis': {
                    'title': "Genre"
                }
            }
        }
    ]

    # encode plotly graphs in JSON
    ids = ["graph-{}".format(i) for i, _ in enumerate(graphs)]
    graphJSON = json.dumps(graphs, cls=plotly.utils.PlotlyJSONEncoder)

    # render web page with plotly graphs
    return render_template('master.html', ids=ids, graphJSON=graphJSON)

# web page that handles user query and displays model results
@app.route('/go')
def go():
    # save user input in query
    query = request.args.get('query', '')

    # use model to predict classification for query
    classification_labels = model.predict([query])[0]
    classification_results = dict(zip(df.columns[4:], classification_labels))

    # This will render the go.html Please see that file.
    return render_template(
        'go.html',
        query=query,
        classification_result=classification_results
    )

if __name__ == '__main__':
    app.run()
The structure of my project folder:
My operating system is Ubuntu 18.04 and my Python version is 3.6.7.
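For reference, a small diagnostic I could run locally (just a sketch; the path is taken from the app code above): load the pickle from a fresh script that does not define tokenize itself, to see whether the same AttributeError also shows up outside of Heroku.

# check_pickle.py -- hypothetical standalone script, not part of the project;
# run from the project root so the relative path below resolves.
import pickle

with open("models/classifier1.pkl", "rb") as f:
    model = pickle.load(f)
# If the pickle recorded the tokenizer as __main__.tokenize, this is expected
# to raise: AttributeError: module '__main__' has no attribute 'tokenize'
print(type(model))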