有没有办法为pandas实现类似scalar函数的sql? 输入将取决于行以及数据帧的列。
import pandas as pd
import numpy as np
#################### this is the data ####################################################
volume_curve = pd.DataFrame(
{'SecurityName': {0: 'GOOGL', 1: 'GOOGL', 2: 'GOOGL', 3: 'GOOGL', 4: 'GOOGL', 5: 'GOOGL', 6: 'GOOGL',
7: 'GOOGL', 8: 'GOOGL', 9: 'GOOGL', 10: 'GOOGL', 11: 'GOOGL', 12: 'GOOGL', 13: 'GOOGL',
14: 'GOOGL', 15: 'AAPL', 16: 'AAPL', 17: 'AAPL', 18: 'AAPL', 19: 'AAPL', 20: 'AAPL', 21: 'AAPL',
22: 'AAPL', 23: 'AAPL', 24: 'AAPL', 25: 'AAPL', 26: 'AAPL', 27: 'AAPL', 28: 'AAPL', 29: 'AAPL'},
'RemainingVolume': {0: 100, 1: 80, 2: 75, 3: 70, 4: 65, 5: 60, 6: 58, 7: 55, 8: 53, 9: 50, 10: 40, 11: 30, 12: 25,
13: 20, 14: 0, 15: 100, 16: 90, 17: 80, 18: 70, 19: 60, 20: 50, 21: 45, 22: 40, 23: 35, 24: 30,
25: 25, 26: 20, 27: 15, 28: 10, 29: 0},
'TradingTimeEST': {0: '3/22/2016 9:00:00', 1: '3/22/2016 9:30:00', 2: '3/22/2016 10:00:00', 3: '3/22/2016 10:30:00',
4: '3/22/2016 11:00:00', 5: '3/22/2016 11:30:00', 6: '3/22/2016 12:00:00', 7: '3/22/2016 12:30:00',
8: '3/22/2016 13:00:00', 9: '3/22/2016 13:30:00', 10: '3/22/2016 14:00:00', 11: '3/22/2016 14:30:00',
12: '3/22/2016 15:00:00', 13: '3/22/2016 15:30:00', 14: '3/22/2016 16:00:00', 15: '3/22/2016 9:00:00',
16: '3/22/2016 9:30:00', 17: '3/22/2016 10:00:00', 18: '3/22/2016 10:30:00', 19: '3/22/2016 11:00:00',
20: '3/22/2016 11:30:00', 21: '3/22/2016 12:00:00', 22: '3/22/2016 12:30:00', 23: '3/22/2016 13:00:00',
24: '3/22/2016 13:30:00', 25: '3/22/2016 14:00:00', 26: '3/22/2016 14:30:00', 27: '3/22/2016 15:00:00',
28: '3/22/2016 15:30:00', 29: '3/22/2016 16:00:00'}}
)
trading_time = pd.DataFrame(
{'SecurityName': {0: 'GOOGL', 1: 'GOOGL', 2: 'APPL', 3: 'APPL'},
'EndTradingTimeEST': {0: '3/22/2016 11:30:00', 1: '3/22/2016 3:30:00', 2: '3/22/2016 11:30:00', 3: '3/22/2016 3:30:00'},
'StartTradingTimeEST': {0: '3/22/2016 9:15:00', 1: '3/22/2016 1:30:00', 2: '3/22/2016 9:15:00', 3: '3/22/2016 1:30:00'}}
)
volume_curve['TradingTimeEST'] = pd.to_datetime(volume_curve['TradingTimeEST'])
trading_time['StartTradingTimeEST'] = pd.to_datetime(trading_time['StartTradingTimeEST'])
trading_time['EndTradingTimeEST'] = pd.to_datetime(trading_time['EndTradingTimeEST'])
#################### this is the data ####################################################
############################### a sql like scalar function that returns a single value #################
def get_available_volume(start, end, security_name, volume_info):
"""
interpolate the volume of start trading and end trading time based on the volume information.
and the difference is the available volume
"""
start = start[0]
end = end[0]
security_name = security_name[0]
stage = volume_info[volume_info['SecurityName'] == security_name]
start_n_time = stage[stage['TradingTimeEST'] >= start]['TradingTimeEST'].min()
start_n_value = stage[stage['TradingTimeEST'] == start_n_time]['RemainingVolume'].values[0]
start_p_time = stage[stage['TradingTimeEST'] <= start]['TradingTimeEST'].max()
start_p_value = stage[stage['TradingTimeEST'] == start_p_time]['RemainingVolume'].values[0]
if start == start_n_time:
start_interpolate_value = start_n_value
else:
start_interpolate_value = start_p_value-(start_p_value-start_n_value)*(start_n_time - start)/(start_n_time-start_p_time)
end_n_time = stage[stage['TradingTimeEST'] >= end]['TradingTimeEST'].min()
end_n_value = stage[stage['TradingTimeEST'] == end_n_time]['RemainingVolume'].values[0]
end_p_time = stage[stage['TradingTimeEST'] <= end]['TradingTimeEST'].max()
end_p_value = stage[stage['TradingTimeEST'] == end_p_time]['RemainingVolume'].values[0]
if end == end_n_time:
end_interpolate_value = end_n_value
else:
end_interpolate_value = end_p_value-(end_p_value-end_n_value)*(end_n_time - end)/(end_n_time-end_p_time)
# return the difference of start time volume and end time volume
return start_interpolate_value - end_interpolate_value
################################################# end of this function ############
print get_available_volume(
start = [pd.to_datetime('2016-3-22 9:50')],
end = [pd.to_datetime('2016-3-22 11:15')],
security_name = ['GOOGL'],
volume_info = volume_curve
)
trading_time['AvailableVolume'] = trading_time.apply(
lambda x: get_available_volume(
start = trading_time['StartTradingTimeEST'],
end = trading_time['EndTradingTimeEST'],
security_name = trading_time['SecurityName'],
volume_info = volume_curve
),
axis = 1
)
编辑: 修复:{当我尝试应用该函数时,我得到错误说('长度必须匹配才能比较','u'Ccurred at index 0')}
当前错误:它的值不正确