我已经阅读过解决此类问题的方法,例如向量化(vectorisation)和 Numba,但我觉得自己的 Python 水平还不足以真正理解并运用它们。
事实上,除了用 np.where 做的一次测试之外,我对这两种方法的尝试都失败了,很可能是我的实现有误。不过那次测试也显示出我的 for 循环/计算有多慢。
import pandas as pd
import numpy as np
import datetime as date
import itertools
def points(row):
    """Score a single observation row.

    Parameters
    ----------
    row : mapping-like (for example one row passed by ``DataFrame.apply``)
        Must provide the keys ``'Ob1'``, ``'Ob2'`` and ``'Ob3'``.

    Returns
    -------
    ``1 - (Ob3 - Ob1)`` when ``Ob2 > 0.5`` and ``Ob3 - Ob1 < 0.1``,
    otherwise ``0``.
    """
    # Written as "not >" so that a NaN in Ob2 also falls through to 0.
    if not row['Ob2'] > 0.5:
        return 0
    foo = row['Ob3'] - row['Ob1']
    # Zero is only the fallback value: resetting the result unconditionally
    # right before the return would throw away every computed score.
    return 1 - foo if foo < 0.1 else 0
print("Start: "+ str(date.datetime.now()))

# 70 players x 1000 rows each, with three random observations per row.
player_list = ['player' + str(x) for x in range(1,71)]
data = pd.DataFrame({'Names': player_list*1000,
                     'Ob1' : np.random.rand(70000),
                     'Ob2' : np.random.rand(70000),
                     'Ob3' : np.random.rand(70000)})

# create list of unique pairs
comboNames = list(itertools.combinations(data.Names.unique(), 2))

# One Ob1-sorted sub-frame per pair of players. The frames are built
# directly instead of first filling the dict with the pd.DataFrame class
# as a placeholder and then overwriting every entry.
DataFrameDict = {
    pair: data[data.Names.isin(pair)].sort_values(['Ob1'])
    for pair in comboNames
}
print("DF fill: "+ str(date.datetime.now()))

# Add test calculated column
for tbl in DataFrameDict:
    DataFrameDict[tbl]['Test'] = DataFrameDict[tbl].apply(points, axis=1)  # Slow loop
    # example vectorised, huge difference in run time
    # DataFrameDict[tbl]['Test'] = np.where((DataFrameDict[tbl]['Ob2']>0.5),1,0)
print("Calc'd: "+ str(date.datetime.now()))

# Per pair: total score and number of rows with a non-zero score.
headers = ['Player1','Player2','Score','Count']
summary = pd.DataFrame(([tbl[0], tbl[1], DataFrameDict[tbl]['Test'].sum(),
                         DataFrameDict[tbl]['Test'].astype(bool).sum(axis=0)] for tbl in DataFrameDict),
                       columns=headers).sort_values(['Score'], ascending=[False])
print("Fin: "+ str(date.datetime.now()))
EDIT2 :更好地表示现实问题
import pandas as pd
import numpy as np
import datetime as date
import itertools
def random_dates(start, end, n, unit='D', seed=None):
    """Draw ``n`` random timestamps starting at ``start``.

    Parameters
    ----------
    start, end : pandas.Timestamp
        Bounds of the sampling window; the window length is
        ``(end - start).days + 1`` units.
    n : int
        Number of timestamps to draw.
    unit : str, default 'D'
        Unit passed to ``pd.to_timedelta`` for the random offsets.
    seed : int, optional
        When given, offsets are drawn from a private
        ``np.random.RandomState(seed)`` so the result is reproducible.
        When ``None`` the global ``np.random`` state is used.

    Returns
    -------
    ``start`` plus ``n`` random offsets, as returned by
    ``pd.to_timedelta(...) + start``.
    """
    # Previously the seed argument was accepted but never applied.
    rng = np.random if seed is None else np.random.RandomState(seed)
    ndays = (end - start).days + 1
    return pd.to_timedelta(rng.rand(n) * ndays, unit=unit) + start
def points(row):
    """Score the time gap between a row and the row shifted in before it.

    Parameters
    ----------
    row : mapping-like (for example one row passed by ``DataFrame.apply``)
        Must provide ``'Names'``/``'Dates'`` for the current row and
        ``'Names2'``/``'Dates2'`` for the previous one.

    Returns
    -------
    ``301 - secs`` when the two names differ and the gap ``secs``
    (``Dates - Dates2`` in seconds) satisfies ``1 <= secs < 301``,
    otherwise ``0``.
    """
    if row['Names'] == row['Names2']:
        return 0
    secs = (row['Dates'] - row['Dates2']).total_seconds()
    # total_seconds() is a float, so a membership test against
    # range(1, 301) would only ever match whole-second gaps. Compare
    # against the same half-open interval instead. A NaN gap (no previous
    # row) fails the comparison and scores 0.
    if 1 <= secs < 301:
        return 301 - secs
    return 0
print("Start: "+ str(date.datetime.now()))

# 70 players x 1000 rows each, every row stamped with a random date
# between start and end.
player_list = ['player' + str(x) for x in range(1,71)]
start = pd.to_datetime('2019-04-01')
end = pd.to_datetime('2019-04-10')
data = pd.DataFrame({'Names': player_list*1000,
                     'Dates': random_dates(start, end, 70000)})

# create list of unique pairs
comboNames = list(itertools.combinations(data.Names.unique(), 2))

# One date-sorted sub-frame per pair of players, each row carrying the
# name and date of the row before it (Names2 / Dates2). The frames are
# built directly rather than overwriting a dict of pd.DataFrame classes.
DataFrameDict = {}
for key in comboNames:
    pair_df = data[data.Names.isin(key)].sort_values(['Dates'])
    pair_df['Names2'] = pair_df['Names'].shift(1)
    pair_df['Dates2'] = pair_df['Dates'].shift(1)
    DataFrameDict[key] = pair_df
print("DF fill: "+ str(date.datetime.now()))

# Add test calculated column
for tbl in DataFrameDict:
    DataFrameDict[tbl]['Test'] = DataFrameDict[tbl].apply(points, axis=1)  # Slow loop
    # example vectorised, huge difference in run time
    # NOTE: this example uses an 'Ob2' column, which this data does not have.
    # DataFrameDict[tbl]['Test'] = np.where((DataFrameDict[tbl]['Ob2']>0.5),1,0)
print("Calc'd: "+ str(date.datetime.now()))

# Per pair: total score and number of rows with a non-zero score.
headers = ['Player1','Player2','Score','Count']
summary = pd.DataFrame(([tbl[0], tbl[1], DataFrameDict[tbl]['Test'].sum(),
                         DataFrameDict[tbl]['Test'].astype(bool).sum(axis=0)] for tbl in DataFrameDict),
                       columns=headers).sort_values(['Score'], ascending=[False])
print("Fin: "+ str(date.datetime.now()))
我的 Solution ,由于混乱而不想在此处发布。
答案 0 :(得分:2)
import pandas as pd
import numpy as np
import datetime as date
import itertools
# 70 players x 1000 rows each, with three random observations per row.
player_list = ['player' + str(x) for x in range(1,71)]
data = pd.DataFrame({'Names': player_list*1000,
                     'Ob1' : np.random.rand(70000),
                     'Ob2' : np.random.rand(70000),
                     'Ob3' : np.random.rand(70000)})

# The score depends only on a row's own columns, so it is computed once
# on the full frame before the frame is split per pair.
data['Test'] = np.where(data['Ob2'] > 0.5, np.where(data['Ob3'] - data['Ob1'] < 0.1, 1 - (data['Ob3'] - data['Ob1']), 0), 0)

comboNames = list(itertools.combinations(data.Names.unique(), 2))

# One Ob1-sorted sub-frame per pair of players, built directly rather than
# overwriting a dict pre-filled with the pd.DataFrame class.
DataFrameDict = {
    pair: data[data.Names.isin(pair)].sort_values(['Ob1'])
    for pair in comboNames
}

# Per pair: total score and number of rows with a non-zero score.
headers = ['Player1','Player2','Score','Count']
summary = pd.DataFrame(([tbl[0], tbl[1], DataFrameDict[tbl]['Test'].sum(),
                         DataFrameDict[tbl]['Test'].astype(bool).sum(axis=0)] for tbl in DataFrameDict),
                       columns=headers).sort_values(['Score'], ascending=[False])
计时结果:每次循环 26.2 s ± 1.15 s(7 次运行的平均值 ± 标准差,每次运行 1 个循环)
# Build the sample frame: 70 player names repeated 1000 times, plus three
# columns of uniform random observations.
player_list = ['player{}'.format(idx) for idx in range(1, 71)]
data = pd.DataFrame(
    {
        'Names': player_list * 1000,
        'Ob1': np.random.rand(70000),
        'Ob2': np.random.rand(70000),
        'Ob3': np.random.rand(70000),
    }
)
# Row-wise test score: 1 - (Ob3 - Ob1) where Ob2 > 0.5 and Ob3 - Ob1 < 0.1,
# otherwise 0.
data['test'] = np.where(
    (data['Ob2'] > 0.5) & (data['Ob3'] - data['Ob1'] < 0.1),
    1 - (data['Ob3'] - data['Ob1']),
    0,
)
def ScoreAndCount(row):
    """Return the total and the non-zero count of a Series of scores.

    In this script it is applied per player through
    ``groupby('Names')['test'].apply``, so ``row`` holds one player's
    test scores.

    Parameters
    ----------
    row : pandas.Series
        Numeric scores.

    Returns
    -------
    tuple
        ``(score, count)``: the sum of the values, and how many of them
        are truthy (non-zero).
    """
    score = row.sum()
    count = row.astype(bool).sum()
    return score, count
# Applying the function above, I group by each player and
# get the total sum of test and the total count for each player.
df = data.groupby('Names')['test'].apply(ScoreAndCount).reset_index()
# Split the (score, count) tuples into two named columns next to Names.
df = pd.concat([df['Names'], df.test.apply(pd.Series).rename(columns = {0: 'Score', 1:'Count'})], axis = 1)
# Using itertools I create a dataframe Summary that has two columns covering
# every single matchup between player, and label the columns Player1 and Player2
summary = pd.DataFrame(itertools.combinations(data.Names.unique(), 2), columns = ['Player1', 'Player2'])
# Below, I merge summary with my dataframe that contains the total score and count
# for each player. Every single time there is a player1 in the Player1 column it
# will merge their total score and count, the same is then done for the
# players in the Player2 column. After these merges I have 6 columns. The two
# player columns, and the total scores and counts for both individuals.
summary = (
    summary.merge(df, left_on = 'Player1', right_on = 'Names')
    .merge(df, left_on = 'Player2', right_on = 'Names')
    .drop(columns = ['Names_x', 'Names_y'])
)
# Below, I add the players 'Score' and 'Count' columns to get the total score
# and total count per iteration. Then I clean the df dropping the columns that
# are not needed, and sorting by score.
summary['Score'] = summary['Score_x'] + summary['Score_y']
summary['Count'] = summary['Count_x'] + summary['Count_y']
summary.drop(columns = ['Score_x','Count_x', 'Score_y','Count_y'], inplace = True)
# sort_values returns a new frame, so the result has to be bound back to
# summary for the ordering to be kept.
summary = summary.sort_values('Score', ascending = False)
157 ms ± 1.19 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
我的函数 ScoreAndCount 返回每个玩家的总得分和非零得分的计数。pd.concat 把该函数的返回结果拆成 Score 和 Count 两列,并与 Names 列拼接成新的 df。
# Smaller sample frame: 70 player names repeated 10 times, plus three
# columns of uniform random observations.
player_list = ['player{}'.format(idx) for idx in range(1, 71)]
data = pd.DataFrame(
    {
        'Names': player_list * 10,
        'Ob1': np.random.rand(700),
        'Ob2': np.random.rand(700),
        'Ob3': np.random.rand(700),
    }
)
Names Ob1 Ob2 Ob3
0 player1 0.548814 0.373216 0.313591
1 player2 0.715189 0.222864 0.365539
2 player3 0.602763 0.080532 0.201267
3 player4 0.544883 0.085311 0.487148
4 player5 0.423655 0.221396 0.990369
def points(row):
    """Score a single observation row.

    Parameters
    ----------
    row : mapping-like (for example one row passed by ``DataFrame.apply``)
        Must provide the keys ``'Ob1'``, ``'Ob2'`` and ``'Ob3'``.

    Returns
    -------
    ``1 - (Ob3 - Ob1)`` when ``Ob2 > 0.5`` and ``Ob3 - Ob1 < 0.1``,
    otherwise ``0``.
    """
    # Written as "not >" so that a NaN in Ob2 also falls through to 0.
    if not row['Ob2'] > 0.5:
        return 0
    foo = row['Ob3'] - row['Ob1']
    # Zero is only the fallback value: resetting the result unconditionally
    # right before the return would throw away every computed score.
    return 1 - foo if foo < 0.1 else 0
# create list of unique pairs
comboNames = list(itertools.combinations(data.Names.unique(), 2))
# One Ob1-sorted sub-frame per pair of players, built directly rather than
# overwriting a dict pre-filled with the pd.DataFrame class.
DataFrameDict = {
    pair: data[data.Names.isin(pair)].sort_values(['Ob1'])
    for pair in comboNames
}
# Add test calculated column
for tbl in DataFrameDict:
    DataFrameDict[tbl]['Test'] = DataFrameDict[tbl].apply(points, axis=1)
# Inspect the first rows of one pair's frame.
DataFrameDict[('player1', 'player2')].head()
Names Ob1 Ob2 Ob3 Test
351 player2 0.035362 0.013509 0.384273 0.0
630 player1 0.062636 0.305047 0.571550 0.0
561 player2 0.133461 0.758194 0.964210 0.0
211 player2 0.216897 0.056877 0.417333 0.0
631 player2 0.241902 0.557987 0.983555 0.0
# Total of the apply-based score for one pair.
DataFrameDict[('player1', 'player2')]['Test'].sum()
# Vectorised score on the full frame: 1 - (Ob3 - Ob1) where Ob2 > 0.5 and
# Ob3 - Ob1 < 0.1, otherwise 0.
data['test'] = np.where(
    (data['Ob2'] > 0.5) & (data['Ob3'] - data['Ob1'] < 0.1),
    1 - (data['Ob3'] - data['Ob1']),
    0,
)
def ScoreAndCount(row):
    """Return the total and the non-zero count of a Series of scores.

    In this script it is applied per player through
    ``groupby('Names')['test'].apply``, so ``row`` holds one player's
    test scores.

    Parameters
    ----------
    row : pandas.Series
        Numeric scores.

    Returns
    -------
    tuple
        ``(score, count)``: the sum of the values, and how many of them
        are truthy (non-zero).
    """
    score = row.sum()
    count = row.astype(bool).sum()
    return score, count
# Per-player (score, count) tuples, then split into Score / Count columns.
df = data.groupby('Names')['test'].apply(ScoreAndCount).reset_index()
df = pd.concat(
    [df['Names'], df['test'].apply(pd.Series).rename(columns={0: 'Score', 1: 'Count'})],
    axis=1,
)
# Every pairing of players, with both players' totals merged in.
summary = pd.DataFrame(
    itertools.combinations(data.Names.unique(), 2),
    columns=['Player1', 'Player2'],
)
summary = (
    summary.merge(df, left_on='Player1', right_on='Names')
    .merge(df, left_on='Player2', right_on='Names')
    .drop(columns=['Names_x', 'Names_y'])
)
# Combine the two players' totals and drop the per-player columns.
summary = summary.assign(
    Score=summary['Score_x'] + summary['Score_y'],
    Count=summary['Count_x'] + summary['Count_y'],
).drop(columns=['Score_x', 'Count_x', 'Score_y', 'Count_y'])
summary = summary.sort_values('Score', ascending=False)
# Look up the row for one specific pair.
summary[(summary['Player1'] == 'player1') & (summary['Player2'] == 'player2')]
Player1 Player2 Score Count
0 player1 player2 8.077455 6.0
答案 1 :(得分:1)
我用 numba 将您的函数向量化,用 %%timeit 测得生成的代码运行时间约为 8 秒。我采纳了 Ben Pap 的建议,预先计算了 Test 列。我还预先对数据进行了排序,并简化了 DataFrameDict 的创建过程。
import pandas as pd
import numpy as np
import datetime as date
import itertools
import numba
def points(a, b, c):
    """Element-wise score for observation arrays.

    Parameters
    ----------
    a, b, c : array-like or scalar
        The Ob1, Ob2 and Ob3 observations respectively.

    Returns
    -------
    numpy.ndarray
        ``1 - (c - a)`` where ``b > 0.5`` and ``c - a < 0.1``, else ``0``.

    NOTE(review): the surrounding text says numba was used to vectorise
    this function, but no decorator is present on it here. Without one,
    ``if b > 0.5`` raises on array input, so the same rule is expressed
    with ``np.where``, which works on whole arrays directly.
    """
    foo = c - a
    # Zero is only the fallback; the score must not be reset before it is
    # returned.
    return np.where((b > 0.5) & (foo < 0.1), 1 - foo, 0)
# Sample frame: 70 player names repeated 1000 times, plus three columns of
# uniform random observations.
player_list = ['player{}'.format(idx) for idx in range(1, 71)]
data = pd.DataFrame(
    {
        'Names': player_list * 1000,
        'Ob1': np.random.rand(70000),
        'Ob2': np.random.rand(70000),
        'Ob3': np.random.rand(70000),
    }
)
# Score every row once on the full frame, then sort once, so neither step
# is repeated per pair.
data['Test'] = points(data['Ob1'].values, data['Ob2'].values, data['Ob3'].values)
data = data.sort_values(['Ob1'])
comboNames = list(itertools.combinations(data.Names.unique(), 2))
DataFrameDict = {elem: data.loc[data.Names.isin(elem)] for elem in comboNames}
# Per pair: total score and number of rows with a non-zero score.
headers = ['Player1','Player2','Score','Count']
summary = pd.DataFrame(
    [
        [first, second, frame['Test'].sum(), frame['Test'].astype(bool).sum(axis=0)]
        for (first, second), frame in DataFrameDict.items()
    ],
    columns=headers,
).sort_values(['Score'], ascending=[False])
每个循环8.52 s±204 ms(平均±标准偏差,共运行7次,每个循环1次)
答案 2 :(得分:1)
(a_df['Ob2'] > 0.5) & (a_df['Ob3'] - a_df['Ob1'] < 0.01)
对满足上述条件的行,计算 1 - x['Ob3'] + x['Ob1'],
把结果写入 a_df['Test'] 列,再把 a_df 赋值回 DataFrameDict:
# For every pair's frame, add a Test column holding 1 - Ob3 + Ob1 on the
# rows that match the condition and 0 elsewhere.
for tbl in DataFrameDict:
    a_df = DataFrameDict[tbl]
    # NOTE(review): the threshold here is 0.01, while the question's
    # points() uses 0.1 - confirm which one is intended.
    hit = (a_df['Ob2'] > 0.5) & (a_df['Ob3'] - a_df['Ob1'] < 0.01)
    score = 1 - a_df['Ob3'] + a_df['Ob1']
    # Build the column in one step instead of calling .update() on
    # a_df['Test']: that chained in-place update only reaches a_df when the
    # selected column is a view of it, which Copy-on-Write does not provide.
    DataFrameDict[tbl] = a_df.assign(Test=score.where(hit, 0.0))
In [1288]: DataFrameDict[('player65', 'player67')]
Names Ob1 Ob2 Ob3 Test
61456 player67 0.000271 0.686051 0.729086 0.000000
25824 player65 0.001281 0.505552 0.296550 0.000000
25544 player65 0.001398 0.770805 0.471477 0.000000
65864 player65 0.001999 0.147407 0.291841 0.000000
33104 player65 0.002661 0.254329 0.126290 0.000000
42554 player65 0.003172 0.529603 0.181796 0.000000
28064 player65 0.003663 0.227429 0.558233 0.000000
24844 player65 0.005517 0.096817 0.710771 0.000000
2584 player65 0.005974 0.338904 0.582034 0.000000
42694 player65 0.005996 0.171637 0.765277 0.000000
6154 player65 0.006126 0.181239 0.295149 0.000000
65234 player65 0.008386 0.180613 0.994273 0.000000
5034 player65 0.008921 0.013060 0.305063 0.000000
21766 player67 0.010950 0.590966 0.481547 0.000000
53054 player65 0.010957 0.731794 0.262754 0.000000
15956 player67 0.010996 0.046718 0.153172 0.000000
36046 player67 0.011634 0.250039 0.064184 0.000000
50394 player65 0.011835 0.995986 0.834281 0.000000
64326 player67 0.011974 0.499262 0.745194 0.000000
30236 player67 0.013029 0.101714 0.143509 0.000000
23374 player65 0.014865 0.158185 0.575582 0.000000
1256 player67 0.014915 0.938301 0.629850 0.000000
10216 player67 0.015122 0.450750 0.137085 0.000000
21904 player65 0.016372 0.147897 0.786882 0.000000
34854 player65 0.016603 0.513692 0.676243 0.000000
33806 player67 0.016820 0.063896 0.577731 0.000000
29816 player67 0.017565 0.060496 0.151780 0.000000
6924 player65 0.017652 0.121581 0.117512 0.000000
39126 player67 0.017990 0.516819 0.663672 0.000000
39896 player67 0.018085 0.031526 0.075832 0.000000
... ... ... ... ... ...
61526 player67 0.985386 0.512073 0.754241 1.231145
48926 player67 0.985504 0.007080 0.671456 0.000000
16234 player65 0.985775 0.846647 0.998181 0.000000
12736 player67 0.985846 0.283997 0.667314 0.000000
47874 player65 0.986084 0.052026 0.508918 0.000000
29886 player67 0.986655 0.998440 0.068136 1.918518
49416 player67 0.986706 0.833053 0.182814 1.803892
42486 player67 0.986797 0.608128 0.136219 1.850578
55644 player65 0.987796 0.215898 0.561002 0.000000
1814 player65 0.987935 0.324954 0.525433 0.000000
7554 player65 0.988910 0.664914 0.674546 1.314365
59774 player65 0.989147 0.235214 0.913588 0.000000
58444 player65 0.989467 0.645191 0.533468 1.455999
62856 player67 0.989470 0.523544 0.302838 1.686632
48646 player67 0.990588 0.522521 0.201132 1.789456
11336 player67 0.990629 0.932360 0.756544 1.234085
31774 player65 0.990881 0.981641 0.943824 1.047057
18964 player65 0.992287 0.808989 0.948321 1.043967
14486 player67 0.992909 0.437701 0.484678 0.000000
12246 player67 0.994027 0.542903 0.234830 1.759197
33596 player67 0.994257 0.949055 0.098368 1.895889
6436 player67 0.994661 0.444211 0.572136 0.000000
4194 player65 0.995022 0.721113 0.584195 1.410826
42696 player67 0.995065 0.516103 0.918737 1.076328
51026 player67 0.995864 0.877335 0.516737 1.479127
14136 player67 0.997691 0.134021 0.913969 0.000000
47664 player65 0.998051 0.628051 0.722695 1.275357
55924 player65 0.998079 0.828749 0.151217 1.846863
18474 player65 0.998780 0.200990 0.098713 0.000000
41296 player67 0.998884 0.167139 0.504899 0.000000
[2000 rows x 5 columns]
答案 3 :(得分:0)
def points(row):
    """Score a single observation row.

    Parameters
    ----------
    row : mapping-like (for example one row passed by ``DataFrame.apply``)
        Must provide the keys ``'Ob1'``, ``'Ob2'`` and ``'Ob3'``.

    Returns
    -------
    ``1 - (Ob3 - Ob1)`` when ``Ob2 > 0.5`` and ``Ob3 - Ob1 < 0.1``,
    otherwise ``0``.
    """
    # Written as "not >" so that a NaN in Ob2 also falls through to 0.
    if not row['Ob2'] > 0.5:
        return 0
    foo = row['Ob3'] - row['Ob1']
    # Zero is only the fallback value: resetting the result unconditionally
    # right before the return would throw away every computed score.
    return 1 - foo if foo < 0.1 else 0
答案 4 :(得分:0)
import pandas as pd
import numpy as np
import datetime as date
import itertools
def random_dates(start, end, n, unit='D', seed=None):
    """Draw ``n`` random timestamps starting at ``start``.

    Parameters
    ----------
    start, end : pandas.Timestamp
        Bounds of the sampling window; the window length is
        ``(end - start).days + 1`` units.
    n : int
        Number of timestamps to draw.
    unit : str, default 'D'
        Unit passed to ``pd.to_timedelta`` for the random offsets.
    seed : int, optional
        When given, offsets are drawn from a private
        ``np.random.RandomState(seed)`` so the result is reproducible.
        When ``None`` the global ``np.random`` state is used.

    Returns
    -------
    ``start`` plus ``n`` random offsets, as returned by
    ``pd.to_timedelta(...) + start``.
    """
    # Previously the seed argument was accepted but never applied.
    rng = np.random if seed is None else np.random.RandomState(seed)
    ndays = (end - start).days + 1
    return pd.to_timedelta(rng.rand(n) * ndays, unit=unit) + start
print("Start: "+ str(date.datetime.now()))

# 70 players x 1000 rows each, every row stamped with a random date
# between start and end.
player_list = ['player' + str(x) for x in range(1,71)]
start = pd.to_datetime('2019-04-01')
end = pd.to_datetime('2019-04-10')
data = pd.DataFrame({'Names': player_list*1000,
                     'Dates': random_dates(start, end, 70000)})

# create list of unique pairs
comboNames = list(itertools.combinations(data.Names.unique(), 2))

# One date-sorted sub-frame per pair, scored with vectorised operations
# instead of a row-wise apply.
DataFrameDict = {}
for key in comboNames:
    pair_df = data[data.Names.isin(key)].sort_values(['Dates'])
    # Gap to the previous row in seconds (NaN on the first row).
    seconds = (pair_df['Dates'] - pair_df['Dates'].shift(1)) / np.timedelta64(1, 's')
    other_player = pair_df['Names'] != pair_df['Names'].shift(1)
    in_window = (seconds >= 1) & (seconds <= 301)
    # Keep the float result: scores reach 300, so casting to uint8 would
    # wrap everything above 255 and also drop the fractional part.
    pair_df['Test'] = np.where(other_player & in_window, 301 - seconds, 0)
    DataFrameDict[key] = pair_df
print("DF fill: "+ str(date.datetime.now()))

# Per pair: total score and number of rows with a non-zero score.
headers = ['Player1','Player2','Score','Count']
summary = pd.DataFrame(([tbl[0], tbl[1], DataFrameDict[tbl]['Test'].sum(),
                         DataFrameDict[tbl]['Test'].astype(bool).sum(axis=0)] for tbl in DataFrameDict),
                       columns=headers).sort_values(['Score'], ascending=[False])
print("Fin: "+ str(date.datetime.now()))