我试图声明一个新的pandas.DataFrame列,并使用pandas.Rolling.apply()对它进行一些数学运算,但是在特殊情况下,除非我先声明该新列,否则会出现KeyError。例如:
import pandas as pd
import numpy as np
df2 = pd.DataFrame(df1['float_column'])
df2_roll = df2.rolling(10, min_periods=2)
df2['new_col_0'] = df2_roll.apply(np.nansum, raw=False)
df2['new_col_1'] = df2['new_col_0'] / df2_roll.apply(np.nanstd, raw=False)
将在最后一行失败,并显示KeyError:“ new_col_1”,但这是
df2 = pd.DataFrame(df1['float_column'])
df2_roll = df2.rolling(10, min_periods=2)
df2['new_col_0'] = df2_roll.apply(np.nansum, raw=False)
df2['new_col_1'] = None
df2['new_col_1'] = df2['new_col_0'] / df2_roll.apply(np.nanstd, raw=False)
运行不会有问题。
对于上下文,我使用的是python 3.7.1 venv和pandas 0.24.0和numpy 1.16.4。我已经能够在相同环境中运行代码而没有任何错误。
这是该方法的最新版本(没有解决方法):
def calc_momentum(data: pd.DataFrame, periods: int = 25) -> pd.DataFrame:
"""
Makes 2 calculations for momentum and returns them as a pandas.DataFrame.
:param data: A pandas.DataFrame containing a 'Log_Return' column.
:param periods: The number of periods to pull history from. Must be an integer greater or equal to 2.
:return: A pandas.DataFrame with 2 columns ('Momentum_1', 'Momentum_2') with each type of calculation.
"""
assert 'Log_Return' in data.columns
result = pd.DataFrame(data['Log_Return'])
# Need to sort ascending for this to work since pandas.DataFrame.rolling is a one-way operation.
result.sort_index(ascending=True, inplace=True)
assert result.index[0] < result.index[-1]
assert periods >= 2 # pandas requires that the rolling window is at least as large as min_periods.
rolling_period = result.rolling(periods, min_periods=2)
result['Momentum_1'] = rolling_period.apply(np.nansum, raw=False)
assert 'Momentum_1' in result.columns
# Workaround/Bug: For whatever reason, declaring Momentum_2 and dividing by a rolling apply to avoid KeyErrors.
# result['Momentum_2'] = None # workaround commented out
result['Momentum_2'] = result['Momentum_1'] / rolling_period.apply(np.nanstd, raw=False) # raises KeyError here
assert 'Momentum_2' in result.columns
del result['Log_Return']
assert 'Log_Return' not in result.columns
# Sorted descending again to match our other features.
result.sort_index(ascending=False, inplace=True)
assert result.index[0] > result.index[-1]
return result
这是该方法的旧版本,可以运行:
def get_momentum(raw_daily_ticker_data, period=25):
# This returns the momentum for a given stock on a certain date using the daily returns.
# Momentum captures what the trend of the stock's movement has been over the last N trading days (N = period).
returns, log_returns = get_log_returns(raw_daily_ticker_data)
rolling_returns = log_returns['LogOfReturn'].rolling(period, min_periods=2)
log_returns['Momentum_1'] = rolling_returns.apply(np.nansum, raw=False)
log_returns['Momentum_2'] = log_returns['Momentum_1'] / rolling_returns.apply(np.nanstd, raw=False)
# return the dataframe in descending order
log_returns.sort_index(ascending=False, inplace=True)
return log_returns
这是我正在使用它们的单元测试(有一些小的变量更改):
def test_calc_momentum(self) -> None:
test_runs = np.random.randint(5, 300)
for i in range(test_runs):
n_values = np.random.randint(150, 600) # Approximately 0.5-2 years' of trading days.
test_values = np.random.randn(n_values) * 0.01
with_log_return = pd.DataFrame({'Return': test_values}) # old version
# pd.DataFrame({'Log_Return': test_values}) # latest version
test_periods = np.random.randint(2, n_values)
get_momentum(with_log_return, period=test_periods) # old version
# calc_momentum(with_log_return, periods=test_periods) # latest version
del with_log_return, test_values
zero_test = pd.DataFrame({'Log_Return': np.random.randn(10) * 0.01})
with self.assertRaises(AssertionError):
calc_momentum(zero_test, periods=0)
del zero_test
empty = pd.DataFrame()
with self.assertRaises(AssertionError):
calc_momentum(empty)
del empty
最后,这是在单元测试中运行最新版本时遇到的错误:
Ran 1 test in 0.166s
FAILED (errors=1)
Error
Traceback (most recent call last):
File "D:\Projects\<project>\venv\lib\site-packages\pandas\core\indexes\base.py", line 2657, in get_loc
return self._engine.get_loc(key)
File "pandas\_libs\index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Momentum_2'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Projects\<project>\venv\lib\site-packages\pandas\core\internals\managers.py", line 1053, in set
loc = self.items.get_loc(item)
File "D:\Projects\<project>\venv\lib\site-packages\pandas\core\indexes\base.py", line 2659, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Momentum_2'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\<user>\AppData\Local\Programs\Python\Python37-32\lib\unittest\case.py", line 59, in testPartExecutor
yield
File "C:\Users\<user>\AppData\Local\Programs\Python\Python37-32\lib\unittest\case.py", line 615, in run
testMethod()
File "D:\Projects\<project>\test\test_features.py", line 142, in test_calc_momentum
calc_momentum(with_log_return, periods=test_periods) # latest version
File "D:\Projects\<project>\app\features.py", line 228, in calc_momentum
result['Momentum_2'] = result['Momentum_1'] / rolling_period.apply(np.nanstd, raw=False)
File "D:\Projects\<project>\venv\lib\site-packages\pandas\core\frame.py", line 3370, in __setitem__
self._set_item(key, value)
File "D:\Projects\<project>\venv\lib\site-packages\pandas\core\frame.py", line 3446, in _set_item
NDFrame._set_item(self, key, value)
File "D:\Projects\<project>\venv\lib\site-packages\pandas\core\generic.py", line 3172, in _set_item
self._data.set(key, value)
File "D:\Projects\<project>\venv\lib\site-packages\pandas\core\internals\managers.py", line 1056, in set
self.insert(len(self.items), item, value)
File "D:\Projects\<project>\venv\lib\site-packages\pandas\core\internals\managers.py", line 1158, in insert
placement=slice(loc, loc + 1))
File "D:\Projects\<project>\venv\lib\site-packages\pandas\core\internals\blocks.py", line 3095, in make_block
return klass(values, ndim=ndim, placement=placement)
File "D:\Projects\<project>\venv\lib\site-packages\pandas\core\internals\blocks.py", line 87, in __init__
'{mgr}'.format(val=len(self.values), mgr=len(self.mgr_locs)))
ValueError: Wrong number of items passed 201, placement implies 1
Assertion failed
Assertion failed
Process finished with exit code 1
Assertion failed
Assertion failed
Assertion failed
Assertion failed
旧版本使用相同的代码行,没有问题,可以执行分配。为什么最新版本寻找新列的键?