我正在尝试通过 Pandas 的 select 和 Term 从 HDFStore 中检索已存储的数据。不带任何 Term 的简单 select() 会返回全部数据。但是，当我尝试根据 DatetimeIndex 过滤数据时，返回的结果包含除最后一行之外的所有内容，唯独缺少最后一行。
我怀疑问题出在时间戳的内部存储方式及其精度上，但我不明白为什么它不起作用，也不知道该如何解决。我对此还很陌生，任何提示都会有所帮助。
我已经创建了一个小的“单元测试”来调查......
import os
import tempfile
import uuid
import pandas as pd
import numpy as np
import time
import unittest
import sys
class PandasTestCase(unittest.TestCase):
def setUp(self):
print "Pandas version: {0}".format(pd.version.version)
print "Python version: {0}".format(sys.version)
self._filename = os.path.join(tempfile.gettempdir(), '{0}.{1}'.format(str(uuid.uuid4()), 'h5'))
self._store = pd.HDFStore(self._filename)
def tearDown(self):
self._store.close()
if os.path.isfile(self._filename):
os.remove(self._filename)
def test_filtering(self):
t_start = time.time() * 1e+9
t_end = t_start + 1e+9 # 1 second later, i.e. 10^9 ns
sample_count = 1000
timestamps = np.linspace(t_start, t_end, num=sample_count).tolist()
data = {'channel_a': range(sample_count)}
time_index = pd.to_datetime(timestamps, utc=True, unit='ns')
df = pd.DataFrame(data, index=time_index, dtype=long)
key = 'test'
self._store.append(key, df)
retrieved_df = self._store.select(key)
retrieved_timestamps = np.array(retrieved_df.index.values, dtype=np.uint64).tolist()
print "Retrieved {0} timestamps, w/o filter.".format(len(retrieved_timestamps))
self.assertItemsEqual(retrieved_timestamps, timestamps)
stored_time_index = self._store[key].index
# Create a filter based on first and last values of index, i.e. from <= index <= to.
from_filter = pd.Term('index>={0}'.format(pd.to_datetime(stored_time_index[0], utc=True, unit='ns')))
to_filter = pd.Term('index<={0}'.format(pd.to_datetime(stored_time_index[-1], utc=True, unit='ns')))
retrieved_df_interval = self._store.select(key, [from_filter, to_filter])
retrieved_timestamps_interval = np.array(retrieved_df_interval.index.values, dtype=np.uint64).tolist()
print "Retrieved {0} timestamps, using filter".format(len(retrieved_timestamps_interval))
self.assertItemsEqual(retrieved_timestamps_interval, timestamps)
# Run the test case only when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
...输出以下内容:
Pandas version: 0.12.0
Python version: 2.7.3 (default, Apr 10 2013, 06:20:15)
[GCC 4.6.3]
Retrieved 1000 timestamps, w/o filter.
Retrieved 999 timestamps, using filter
F
======================================================================
FAIL: test_filtering (__main__.PandasTestCase)
----------------------------------------------------------------------
Traceback (most recent call last):
File "pandastest.py", line 53, in test_filtering
self.assertItemsEqual(retrieved_timestamps_interval, timestamps)
AssertionError: Element counts were not equal:
First has 1, Second has 0: 1.377701660170978e+18
----------------------------------------------------------------------
Ran 1 test in 0.039s
FAILED (failures=1)
Process finished with exit code 1
更新：将 Term 的创建方式改为使用另一种构造函数形式后，一切正常。像这样：
# Create a filter based on first and last values of index, i.e. from <= index <= to.
# The index values are passed to Term as objects rather than being formatted
# into the expression string; the string-formatted variants are kept commented
# out below for comparison.
#from_filter = pd.Term('index>={0}'.format(pd.to_datetime(stored_time_index[0], utc=True, unit='ns')))
from_filter = pd.Term('index','>=', stored_time_index[0])
#to_filter = pd.Term('index<={0}'.format(pd.to_datetime(stored_time_index[-1], utc=True, unit='ns')))
to_filter = pd.Term('index','<=', stored_time_index[-1])
答案 0 :(得分:2)
时间戳的字符串格式化默认只保留小数点后 6 位（即微秒精度；您在构造 Term 时使用的 format 正是这样做的），
而纳秒（ns）精度需要 9 位，因此请改用 Term 构造函数的另一种形式：
Term("index","<=",stamp)
这是一个例子
In [2]: start = Timestamp('20130101 9:00:00')
In [3]: start.value
Out[3]: 1357030800000000000
In [5]: index = pd.to_datetime([ start.value + i for i in list(range(1000)) ])
Out[8]:
<class 'pandas.tseries.index.DatetimeIndex'>
[2013-01-01 09:00:00, ..., 2013-01-01 09:00:00.000000999]
Length: 1000, Freq: None, Timezone: None
In [9]: df = DataFrame(randn(1000,2),index=index)
In [10]: df.to_hdf('test.h5','df',mode='w',fmt='t')
In [12]: pd.read_hdf('test.h5','df')
Out[12]:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1000 entries, 2013-01-01 09:00:00 to 2013-01-01 09:00:00
Data columns (total 2 columns):
0 1000 non-null values
1 1000 non-null values
dtypes: float64(2)
In [15]: pd.read_hdf('test.h5','df',where=[pd.Term('index','<=',index[-1])])
Out[15]:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1000 entries, 2013-01-01 09:00:00 to 2013-01-01 09:00:00
Data columns (total 2 columns):
0 1000 non-null values
1 1000 non-null values
dtypes: float64(2)
In [16]: pd.read_hdf('test.h5','df',where=[pd.Term('index','<=',index[-1].value-1)])
Out[16]:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 999 entries, 2013-01-01 09:00:00 to 2013-01-01 09:00:00
Data columns (total 2 columns):
0 999 non-null values
1 999 non-null values
dtypes: float64(2)
请注意，在 0.13 中（此示例使用的是 master 分支），这会更简单，您可以直接把它写进表达式：'index<=index[-1]'
（表达式右侧的 index 实际上就是局部变量 index）