我有这个DataFrame“ dfSummary”-
# Sample data: every (symbol, exchange) pair starts with a balance of 10.
exchangeBalances = [
    [symbol, exchange, 10]
    for exchange in ('binance', 'bitfinex')
    for symbol in ('ETHBTC', 'LTCBTC', 'XRPBTC')
]

# One (symbol, exchange, bid, offer) quote per pair; the same quotes are repeated
# at each of the three snapshot times below.
_quotes = [
    ('ETHBTC', 'binance', 0.0035, 0.0351),
    ('LTCBTC', 'binance', 0.009, 0.092),
    ('XRPBTC', 'binance', 0.000077, 0.000078),
    ('ETHBTC', 'bitfinex', 0.003522, 0.0353),
    ('LTCBTC', 'bitfinex', 0.0093, 0.095),
    ('XRPBTC', 'bitfinex', 0.000083, 0.000085),
]
bidOffers = [
    [symbol, exchange, bid, offer, datetime(2018, 9, 1, 8, minute)]
    for minute in (15, 30, 45)
    for symbol, exchange, bid, offer in _quotes
]

dfExchangeBalances = pd.DataFrame(exchangeBalances, columns=['symbol', 'exchange', 'balance'])
dfBidOffers = pd.DataFrame(bidOffers, columns=['symbol', 'exchange', 'bid', 'offer', 'created'])

# Spread is defined here as bid minus offer, so it is negative for the sample quotes.
dfBidOffers["spread"] = dfBidOffers["bid"].sub(dfBidOffers["offer"])

# Left join: every balance row is repeated once per matching quote snapshot.
dfSummary = pd.merge(dfExchangeBalances, dfBidOffers, how='left', on=['symbol', 'exchange'])
我需要完成的是,在“ dfSummary”中添加一个计算字段:
currentRow["Spread"] - someOtherRow["Spread"]
“someOtherRow” 是根据 `created` 字段查找得到的行(例如:与 “currentRow” 具有相同 {symbol, exchange}、且 `created` 比 “currentRow” 早 30 分钟的最近一行)。
说明:上面的示例是对实际问题的简化。实际的时间间隔并不恰好是15分钟。实际上,我需要在 DataFrame 中查找对应的记录(相同的键 = {symbol, exchange}),但其 `created` 分别是当月第一天、当季第一天和当年第一天。
我想避免手动循环遍历 DataFrame(DataFrame.iter*),而改用 pandas 内置的(向量化)查找。
我在考虑使用 DataFrame.lookup(参见 “Vectorized look-up of values in Pandas dataframe”),但不确定如何在计算字段的场景下使用它。另外,我不是要在另一个 DataFrame 中查找,而是要在同一个 DataFrame 中查找。
谢谢!
向量化(Pandas and Numpy-vs looping):
https://engineering.upside.com/a-beginners-guide-to-optimizing-pandas-code-for-speed-c09ef2c6a4d6
https://www.datascience.com/blog/straightening-loops-how-to-vectorize-data-aggregation-with-pandas-and-numpy/
https://realpython.com/numpy-array-programming/
答案 0 :(得分:1)
我明白了,这是我的 real 代码(因此,我并未发布所有内容)。这将起作用(但不确定是否以最快的方式实现)。
我正在使用 DataFrame.apply。这并不是向量化的方式,但比在 Python 中手动循环要快得多。有人能说明一下如何把下面的代码完全改写成向量化(vectorized)的形式吗?
参考本文-https://engineering.upside.com/a-beginners-guide-to-optimizing-pandas-code-for-speed-c09ef2c6a4d6
……我想不出如何以向量化(vectorized)的方式重写它,而且考虑到查找的性质,我开始觉得下面的代码无法向量化(如果有人能证明我错了,我会很高兴):
# One DataFrame row per ProfitLoss object; the column set is taken from the first object.
pdPnl = pd.DataFrame.from_records(
    [ObjectUtil.objectPropertiesToDictionary(pnl) for pnl in profitLosses],
    columns=ObjectUtil.objectPropertiesToDictionary(profitLosses[0]).keys(),
)

# Anchor dates derived row by row from each row's COB.
pdPnl["TM1"] = pdPnl.apply(lambda row: row["COB"] - timedelta(days=1), axis=1)
pdPnl["MonthStart"] = pdPnl.apply(lambda row: row["COB"].replace(day=1), axis=1)
pdPnl["QuarterStart"] = pdPnl.apply(
    lambda row: DateTimeUtil.getQuarterStart(row["COB"], row["COB"].year), axis=1
)
pdPnl["YearStart"] = pdPnl.apply(lambda row: datetime(row["COB"].year, 1, 1), axis=1)

# (output column, anchor-date column, inception column) for each period-to-date figure.
# Each figure is this row's inception value minus the inception value found on the anchor date.
_periodFields = (
    ("DTDRealizedPnl", "TM1", "InceptionRealizedPnl"),
    ("DTDUnrealizedPnl", "TM1", "InceptionUnrealizedPnl"),
    ("MTDRealizedPnl", "MonthStart", "InceptionRealizedPnl"),
    ("MTDUnrealizedPnl", "MonthStart", "InceptionUnrealizedPnl"),
    ("YTDRealizedPnl", "YearStart", "InceptionRealizedPnl"),
    ("YTDUnrealizedPnl", "YearStart", "InceptionUnrealizedPnl"),
)
for _target, _anchor, _source in _periodFields:
    # Defaults pin the current loop values inside the lambda.
    pdPnl[_target] = pdPnl.apply(
        lambda row, _anchor=_anchor, _source=_source: PnlCalculatorBase.computeField(
            pdPnl, row[_anchor], row[_source], _source
        ),
        axis=1,
    )

# Statistics over all rows up to and including each row's COB (still one call per row).
pdPnl["SharpeRatio"] = pdPnl.apply(
    lambda row: PnlCalculatorBase.computeSharpeRatio(pdPnl, row["COB"]), axis=1
)
pdPnl["MaxDrawDown"] = pdPnl.apply(
    lambda row: PnlCalculatorBase.computeMaxDrawDown(pdPnl, row["COB"]), axis=1
)

pnlDict = pdPnl.to_dict()  # Then convert back to List of ProfitLoss (Slow...)
查找功能是:
@staticmethod
def lookUpRow(pdPnl, cob):
    """Return the rows of ``pdPnl`` whose "COB" value equals ``cob``.

    The result is a DataFrame; it is empty when no row matches.
    """
    matchesCob = pdPnl["COB"] == cob
    return pdPnl.loc[matchesCob]
@staticmethod
def computeField(pdPnl, cob, todaysPnl, targetField):
    """Return ``todaysPnl`` minus the ``targetField`` value found on date ``cob``.

    Args:
        pdPnl: DataFrame with a "COB" column and a ``targetField`` column.
        cob: COB value identifying the row to subtract.
        todaysPnl: value for the current row.
        targetField: name of the column read from the looked-up row.

    Returns:
        ``todaysPnl`` minus the first matching row's ``targetField`` value, or
        ``np.nan`` when no row matches ``cob`` or the subtraction fails.
    """
    otherRow = PnlCalculatorBase.lookUpRow(pdPnl, cob)
    if otherRow is None or otherRow[targetField].shape[0] == 0:
        return np.nan

    try:
        # When several rows share the COB, only the first one is used.
        return todaysPnl - otherRow[targetField].iloc[0]
    except Exception:
        # Best effort: record the failure (with traceback) and fall back to NaN.
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt and
        # SystemExit are not swallowed.
        logging.exception("Failed lookup for %s %s", cob, targetField)
        return np.nan
@staticmethod
def computeSharpeRatio(pdPnl, cob):
    """Return mean / standard deviation of total daily P&L up to and including ``cob``.

    Total daily P&L is "DTDRealizedPnl" + "DTDUnrealizedPnl" for every row whose
    "COB" is <= ``cob``. The standard deviation is pandas' default (sample) one.

    Args:
        pdPnl: DataFrame with "COB", "DTDRealizedPnl" and "DTDUnrealizedPnl" columns.
            It is not modified.
        cob: inclusive upper bound on "COB".

    Returns:
        The ratio; NaN when the window holds fewer than two usable values.
    """
    window = pdPnl.loc[pdPnl["COB"] <= cob, ["DTDRealizedPnl", "DTDUnrealizedPnl"]]
    # Column-wise addition instead of a per-row apply: same values, and it also works
    # on an empty window without assigning into a filtered slice.
    totalDtd = window["DTDRealizedPnl"] + window["DTDUnrealizedPnl"]
    # @todo, We don't have risk free rate for Sharpe Ratio calc. Here's just total DTD
    # avg return over standard deviation
    # https://en.wikipedia.org/wiki/Sharpe_ratio
    return totalDtd.mean() / totalDtd.std()
@staticmethod
def computeMaxDrawDown(pdPnl, cob):
    """Return the most negative "DTDRealizedPnl" among rows with "COB" <= ``cob``.

    Only rows whose "DTDRealizedPnl" is below zero are considered; the result is
    NaN when there is no such row.

    NOTE(review): this is the worst single-day realized loss, not a peak-to-trough
    drawdown — confirm that this is the intended definition.
    """
    onOrBeforeCob = pdPnl["COB"] <= cob
    isLoss = pdPnl["DTDRealizedPnl"] < 0
    losses = pdPnl.loc[onOrBeforeCob & isLoss, "DTDRealizedPnl"]
    return losses.min()
答案 1 :(得分:0)
这假设 `created` 的间隔为15分钟。您可以按 symbol 和 exchange 进行 `groupby`,并向下移动2个周期(因为每个周期为15分钟,2个周期即30分钟):
输出:
# Within each (symbol, exchange) group, take the spread from two rows earlier;
# the first two rows of every group get NaN.
# NOTE(review): this equals "30 minutes ago" only if each group's rows are ordered by
# `created` and spaced exactly 15 minutes apart — confirm before relying on it.
dfSummary['30min_ago_spread'] = dfSummary.groupby(['symbol', 'exchange'])['spread'].shift(2)
答案 2 :(得分:0)
矢量化!!!!!! (嗯...大部分)
想法是,对
从原始修订扩展...
STEP 1) ProfitLoss.py \ to_dict 可以预先计算TM1,MonthStart,QuarterStart,YearStart-因为无论如何都会调用它。
import datetime
import time
import math
from Util import ObjectUtil
from Util import DateTimeUtil
import pandas as pd
import numpy as np
from Util import ObjectUtil
class ProfitLoss(object):
    """Attribute bag for one P&L record, convertible to and from dict form.

    Attributes are set dynamically; ``to_dict`` reads ``self.COB``, so that
    attribute must exist before ``to_dict`` is called.
    """

    def set(self, field, val):
        """Assign ``val`` to the attribute named ``field``."""
        setattr(self, field, val)

    def to_dict(self):
        """Return this object's properties plus four anchor dates derived from COB.

        Added keys: "TM1" (COB minus one day), "MonthStart" (COB with day=1),
        "QuarterStart" (from ``DateTimeUtil.getQuarterStart``) and "YearStart"
        (1 January of COB's year).
        """
        # presumably returns a {property name: value} dict — verify against ObjectUtil
        result = ObjectUtil.objectPropertiesToDictionary(self)
        result["TM1"] = self.COB - datetime.timedelta(days=1)
        result["MonthStart"] = self.COB.replace(day=1)
        result["QuarterStart"] = DateTimeUtil.getQuarterStart(self.COB, self.COB.year)
        result["YearStart"] = datetime.datetime(self.COB.year, 1, 1)
        return result

    @staticmethod
    def from_dict(dict):
        """Rebuild a list of ProfitLoss objects from a column-oriented mapping.

        Args:
            dict: mapping ``{column name: {row key: value}}`` (the shape produced by
                ``DataFrame.to_dict()``), or None. The parameter name shadows the
                builtin and is kept only for backward compatibility.

        Returns:
            None when ``dict`` is None; otherwise one ProfitLoss per entry of the
            first column, with one attribute per processed column. ``pd.Timestamp``
            values become ``datetime.datetime`` truncated to the date, and float NaN
            becomes None.
        """
        if dict is None:
            return None

        # One empty object per entry of the first column.
        columns = list(dict.values())
        numPnl = len(columns[0]) if columns else 0
        profitLosses = [ProfitLoss() for _ in range(numPnl)]

        for k, v in dict.items():
            if k == "from_dict":
                # Processing stops at this key, so every later column is skipped as well.
                # NOTE(review): presumably the non-data keys (methods) start here — confirm.
                break
            for i, val in enumerate(v.values()):
                if isinstance(val, pd.Timestamp):
                    # Keep only the date part as a plain datetime.
                    val = datetime.datetime(val.year, val.month, val.day)
                # NaN never compares equal to anything (itself included), so missing
                # floats have to be detected with math.isnan rather than ``==``.
                if isinstance(val, float) and math.isnan(val):
                    val = None
                profitLosses[i].set(k, val)
        return profitLosses
STEP 2) 使用合并(即自连接 self-join),而不是 DataFrame.apply 或 DataFrame.lookup:
# One DataFrame row per ProfitLoss; to_dict() supplies the TM1 / MonthStart /
# QuarterStart / YearStart anchor-date columns used as join keys below.
pdPnl = pd.DataFrame.from_records([pnl.to_dict() for pnl in profitLosses])

# Slim look-up table: one row per COB (the first one when a COB repeats), carrying only
# the two inception columns that the differences below need. Joining against this,
# rather than against the whole and ever-growing frame, keeps the column count flat
# and prevents duplicate COBs from multiplying rows.
_inceptionByCob = pdPnl[
    ["COB", "InceptionRealizedPnl", "InceptionUnrealizedPnl"]
].drop_duplicates(subset="COB")

# Left joins keep every row; a row whose anchor date has no matching COB simply gets
# NaN in the looked-up columns instead of being dropped (an inner join would remove it).
# The "_QuaterStart" spelling is kept as-is because these column names are passed on
# to ProfitLoss.from_dict below.
for _anchor, _suffix in (
    ("TM1", "_tm1"),
    ("MonthStart", "_MonthStart"),
    ("QuarterStart", "_QuaterStart"),
    ("YearStart", "_YearStart"),
):
    _anchored = _inceptionByCob.rename(columns={
        "COB": _anchor,
        "InceptionRealizedPnl": "InceptionRealizedPnl" + _suffix,
        "InceptionUnrealizedPnl": "InceptionUnrealizedPnl" + _suffix,
    })
    pdPnl = pdPnl.merge(_anchored, how="left", on=_anchor)

# Vectorized
pdPnl["DTDRealizedPnl"] = pdPnl["InceptionRealizedPnl"] - pdPnl["InceptionRealizedPnl_tm1"]
pdPnl["DTDUnrealizedPnl"] = pdPnl["InceptionUnrealizedPnl"] - pdPnl["InceptionUnrealizedPnl_tm1"]
pdPnl["MTDRealizedPnl"] = pdPnl["InceptionRealizedPnl"] - pdPnl["InceptionRealizedPnl_MonthStart"]
pdPnl["MTDUnrealizedPnl"] = pdPnl["InceptionUnrealizedPnl"] - pdPnl["InceptionUnrealizedPnl_MonthStart"]
pdPnl["YTDRealizedPnl"] = pdPnl["InceptionRealizedPnl"] - pdPnl["InceptionRealizedPnl_YearStart"]
pdPnl["YTDUnrealizedPnl"] = pdPnl["InceptionUnrealizedPnl"] - pdPnl["InceptionUnrealizedPnl_YearStart"]

# Not yet vectorized: each row rescans all rows with COB <= its own COB.
pdPnl["SharpeRatio"] = pdPnl.apply(lambda rw: PnlCalculatorBase.computeSharpeRatio(pdPnl, rw["COB"]), axis=1)
pdPnl["MaxDrawDown"] = pdPnl.apply(lambda rw: PnlCalculatorBase.computeMaxDrawDown(pdPnl, rw["COB"]), axis=1)

# Convert back to a list of ProfitLoss objects.
pnlDict = pdPnl.to_dict()
updatedProfitLosses = ProfitLoss.ProfitLoss.from_dict(pnlDict)
实际上,我不确定合并/自连接是否比显式循环更高效。另外,我仍然不知道该如何向量化夏普比率(SharpeRatio)和最大回撤(MaxDrawDown)!pandas 的窗口函数似乎帮不上忙……
有人知道吗?!谢谢!