我一直在尝试学习隐马尔可夫模型(HMM),因此选了一个有趣的问题来动手研究。本质上,我有一些带时间戳的地理坐标,希望根据先前观察到的行为和当前位置,预测最可能的下一个位置。我正在用 Python 的 hmmlearn 库来实现。下面是到目前为止的代码;它是我参照教程调试了好几天拼出来的,所以有点粗糙:
from common import load_file_forcaster
import numpy as np
from sklearn.model_selection import train_test_split
from hmmlearn.hmm import GaussianHMM
class Forecaster:
    """Fit a Gaussian HMM to time-ordered location points and guess the next one.

    The points are loaded with ``load_file_forcaster``, optionally sorted by
    time, split into a training part and a test part, and the training part is
    used to fit a ``GaussianHMM``.  ``find_single_likly`` then scores every
    known point as a possible continuation of a short window of test points.
    """

    def __init__(self, file_name, n_hidden_states=4, test_size_mult=0.5,
                 shuf=False, preprocess=True, latency_days=5):
        """Load the data, split it and create the (unfitted) model.

        Args:
            file_name: Path handed to ``load_file_forcaster``.
            n_hidden_states: Number of hidden states of the ``GaussianHMM``.
            test_size_mult: ``test_size`` passed to ``train_test_split``.
            shuf: ``shuffle`` flag passed to ``train_test_split``.
            preprocess: When true, sort the points by time via ``preprocess``.
            latency_days: Size parameter of the history window used by
                ``find_single_likly``.
        """
        loaded = load_file_forcaster(file_name)
        if preprocess:
            times, points = self.preprocess(loaded)
        else:
            # preprocess() reads loaded[0] as the points and loaded[1] as the
            # times and returns them as (times, points).  Use the same order
            # here so that _X / _Y mean the same thing on both paths.
            times, points = loaded[1], np.asarray(loaded[0])
        # Times, in sorted order of occurrence when preprocess is enabled.
        self._X = times
        # Location points (x, y, z columns per the original author's note).
        self._Y = points
        self._train_data, self._test_data = train_test_split(
            self._Y, test_size=test_size_mult, shuffle=shuf)
        self._hmm = GaussianHMM(n_components=n_hidden_states)
        self.n_latency_days = latency_days

    def test_data(self):
        """Return the held-out part of the points."""
        return self._test_data

    def fit_data(self):
        """Fit the HMM on the first three columns of the training points."""
        # Fancy indexing copies the three columns into a fresh 2-D array and
        # still raises IndexError if a column is missing.
        feature_vector = self._train_data[:, [0, 1, 2]]
        self._hmm.fit(feature_vector)

    def preprocess(self, tupe):
        """Sort the points by their timestamps.

        Args:
            tupe: Pair whose first item is the 2-D collection of points and
                whose second item is the matching sequence of times.

        Returns:
            ``(ordered_times, pnts_sorted)`` - note the order is swapped
            relative to the input pair.
        """
        order = np.argsort(tupe[1])
        pnts_sorted = np.array(tupe[0])[order, :]
        ordered_times = np.sort(tupe[1])
        print('Shape ' + str(pnts_sorted.shape))
        print('Ordered_times ' + str(list(zip(ordered_times, pnts_sorted))))
        return (ordered_times, pnts_sorted)

    def find_single_likly(self, day_index=10):
        """Pick the known point that best continues a window of test points.

        Every row of ``self._Y`` is appended in turn to the history window and
        the resulting sequence is scored by the fitted HMM; the candidate with
        the highest log-likelihood wins.

        Args:
            day_index: Position in the test data that anchors the window.

        Returns:
            ``(previous_data, final_score, most_probable_outcome)`` where
            ``final_score`` is the likelihood (``exp`` of the log-likelihood)
            of the window followed by the winning point.

        Raises:
            ValueError: If there are no candidate points to score.
        """
        window_start = max(0, day_index - self.n_latency_days)
        # NOTE(review): the slice end is exclusive, so the row at
        # day_index - 1 is not part of the window and the window holds at most
        # n_latency_days - 1 rows - confirm this is intended.
        window_end = max(0, day_index - 1)
        previous_data = self._test_data[window_start:window_end]

        # score() returns a log-likelihood.  Rank candidates on that scale
        # directly: exponentiating only the negative scores (as before) mixes
        # two scales and can make a worse candidate win.
        log_likelihoods = [
            self._hmm.score(np.vstack((previous_data, candidate)))
            for candidate in self._Y
        ]
        best_index = int(np.argmax(log_likelihoods))
        most_probable_outcome = self._Y[best_index]
        # Always convert, so the reported value is on one consistent scale.
        final_score = np.exp(log_likelihoods[best_index])
        return (previous_data, final_score, most_probable_outcome)
def main():
    """Fit a forecaster on the sample file and print one prediction per test index.

    For every test index from 8 onwards the predicted point is printed next to
    the test point at ``i + 1`` together with a single True/False match flag.
    """
    print('Starting')
    f = Forecaster('geo_small_csv.txt')
    f.fit_data()
    test_points = f.test_data()
    n_test = len(test_points)
    for i in range(8, n_test):
        previous, probability, predicted = f.find_single_likly(i)
        print('Predicted for ' + str(previous) + ' is probability ' + str(probability) + ' with answer ' + str(predicted))
        if i + 1 < n_test:
            # NOTE(review): the prediction is compared against the row at
            # i + 1 - confirm this lines up with the window used by
            # find_single_likly.
            actual_next = test_points[i + 1]
            # array_equal gives one verdict; `==` on arrays compares
            # element by element and would print an array of booleans.
            print('Next outcome is ' + str(actual_next) + ' match is ' + str(np.array_equal(actual_next, predicted)))
    print('Done')
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
此外,此存储库包含我一直在使用的代码和数据: https://github.com/joshu0991/geo
我观察到的现象是:评估模型的正确性时,得到的概率非常小,而且对下一个位置的预测几乎从来不对。我的问题是:我的方法看起来正确吗?具体来说,我用三维向量作为数据,因为我用下面的公式把纬度和经度转换到了三维空间:
x = cos(lat) * cos(long)
y = sin(lat) * sin(long)
z = sin(lat)
如果这种转换没有严重错误,那么我是否还有别的明显做错的地方?根据我读过的几篇论文,我认为 HMM 是对人类移动行为建模的合适选择,尤其是《Next Place Prediction using Mobility Markov Chains》(它提出的方法与 HMM 不同,但相差不远)以及《Predicting Future Locations with Hidden Markov Models》。任何建议或方向上的指点,我都将不胜感激。