我有以下函数,可将csv加载到数据帧中,然后进行一些计算。用略多于100,000行的csv进行计算大约需要4-5分钟。我希望有一种更快的方法。
def calculate_adeck_errors(in_file):
print(f'Starting Data Calculations: {datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")}')
pd.set_option('display.max_columns', 12)
# read in the raw csv
adeck_df = pd.read_csv(in_file)
#print(adeck_df)
#extract only the carq items and remove duplicates
carq_data = adeck_df[(adeck_df.MODEL == 'CARQ') & (adeck_df.TAU == 0)].drop_duplicates(keep='last')
#print(carq_data)
#remove carq items from original
final_df = adeck_df[adeck_df.MODEL != 'CARQ']
#print(final_df)
row_list = []
for index, row in carq_data.iterrows():
position_time = row['POSDATETIME']
for index, arow in final_df.iterrows():
if arow['POSDATETIME'] == position_time:
# match, so do calculations
storm_id = arow['STORMID']
model_base_time = arow['MODELDATETIME']
the_hour = arow['TAU']
the_model = arow['MODEL']
point1 = float(row['LAT']), float(row['LON'])
point2 = float(arow['LAT']), float(arow['LON'])
if arow['LAT'] == 0.0:
dist_error = None
else:
dist_error = int(round(haversine(point1, point2, miles=True)))
if arow['WIND'] != 0:
wind_error = int(abs(int(row['WIND']) - int(arow['WIND'])))
else: wind_error = None
if arow['PRES'] != 0:
pressure_error = int(abs(int(row['PRES']) - int(arow['PRES'])))
else:
pressure_error = None
lat_carq = row['LAT']
lon_carq = row['LON']
lat_model = arow['LAT']
lon_model = arow['LON']
wind_carq = row['WIND']
wind_model = arow['WIND']
pres_carq = row['PRES']
pres_model = arow['PRES']
row_list.append([storm_id, model_base_time, the_model, the_hour, lat_carq, lon_carq, lat_model, lon_model, dist_error,
wind_carq, wind_model, wind_error, pres_carq, pres_model, pressure_error])
result_df = pd.DataFrame(row_list)
result_df = result_df.where((pd.notnull(result_df)), None)
result_cols = ['StormID', 'ModelBasetime', 'Model' , 'Tau',
'LatCARQ', 'LonCARQ', 'LatModel', 'LonModel', 'DistError',
'WindCARQ', 'WindModel','WindError',
'PresCARQ', 'PresModel','PresError']
result_df.columns = result_cols
calculate_adeck_errors(infile)
要弄清楚我在做什么: 1. CARQ条目是控件(实际)。 2.其他模型是猜测。 3.我正在将控件(CARQ)与猜测进行比较,以查看其错误所在。 4.比较的基础是MODELBASETIME = POSBASETIME 4.我正在处理的示例文件在这里:http://vortexweather.com/downloads/adeck/aal062018.csv
我希望有比我更快的方法,或者除了迭代之外还有另一种熊猫方法
非常感谢您的建议。 布莱恩
答案 0 :(得分:0)
好像您要从同一数据帧中创建两个数据帧,然后对其进行处理。两件事可能会减少您的时间。
首先,您要遍历两个数据框并检查条件:
for _, row in carq_data.iterrows():
for _, arow in final_df.iterrows():
if arow['POSDATETIME'] == row['POSDATETIME']:
# do something by using both tables
这本质上是联接的实现。您正在carq_data
上将final_df
和'POSDATETIME'
加入。
第一步,您应该合并表:
merged = carq_data.merge(final_df, on=['POSDATETIME'])
这时,您将为每个相似的'POSDATETIME'
获得多行。在下面,假设列b
是POSDATETIME
:
>>> a
a b
0 1 11
1 1 33
>>> b
a b
0 1 2
1 1 3
2 1 4
>>> merged = a.merge(b, on=['a'])
>>> merged
a b_x b_y
0 1 11 2
1 1 11 3
2 1 11 4
3 1 33 2
4 1 33 3
5 1 33 4
现在,要进行条件计算,可以使用apply()
函数。
首先,定义一个函数:
def calc_dist_error(row):
return int(round(haversine(row['b_x'], row['b_y'], miles=True))) if row['a'] != 0.0 else None
然后将其应用于每一行:
merged['dist_error'] = merged.apply(calc_dist_error, axis=1)
继续我的小例子:
>>> merged['c'] = [1, 0, 0, 0, 2, 3]
>>> merged
a b_x b_y c
0 1 11 2 1
1 1 11 3 0
2 1 11 4 0
3 1 33 2 0
4 1 33 3 2
5 1 33 4 3
>>> def foo(row):
... return row['b_x'] - row['b_y'] if row['c'] != 0 else None
...
>>> merged['dist_error'] = merged.apply(foo, axis=1)
>>> merged
a b_x b_y c dist_error
0 1 11 2 1 9.0
1 1 11 3 0 NaN
2 1 11 4 0 NaN
3 1 33 2 0 NaN
4 1 33 3 2 30.0
5 1 33 4 3 29.0
这应该有助于您减少运行时间(另请参见this,以了解如何使用%timeit
进行检查)。希望这可以帮助!
答案 1 :(得分:0)
此代码大约需要10秒钟才能运行整个数据集!
该代码看起来与您编写的代码非常相似,不同之处在于main_function中的所有操作均已向量化。参见Fast, Flexible, Easy and Intuitive: How to Speed Up Your Pandas Projects
2018-09-13_adeck_error_calculations.ipynb
import pandas as pd
import numpy as np
import datetime
from haversine import haversine
def main_function(df, row):
"""
The main difference here is that everything is vectorized
Returns: DataFrame
"""
df_new = pd.DataFrame()
df_storage = pd.DataFrame()
pos_datetime = df.POSDATETIME.isin([row['POSDATETIME']]) # creates a Boolean map
array_len = len(pos_datetime)
new_index = pos_datetime.index
df_new['StormID'] = df.loc[pos_datetime, 'STORMID']
df_new['ModelBaseTime'] = df.loc[pos_datetime, 'MODELDATETIME']
df_new['Model'] = df.loc[pos_datetime, 'MODEL']
df_new['Tau'] = df.loc[pos_datetime, 'TAU']
# Distance
df_new['LatCARQ'] = pd.DataFrame(np.full((array_len, 1), row['LAT']), index=new_index).loc[pos_datetime, 0]
df_new['LonCARQ'] = pd.DataFrame(np.full((array_len, 1), row['LON']), index=new_index).loc[pos_datetime, 0]
df_new['LatModel'] = df.loc[pos_datetime, 'LAT']
df_new['LonModel'] = df.loc[pos_datetime, 'LON']
def calc_dist_error(row):
return round(haversine((row['LatCARQ'], row['LonCARQ']), (row['LatModel'], row['LonModel']), miles=True)) if row['LatModel'] != 0.0 else None
df_new['DistError'] = df_new.apply(calc_dist_error, axis=1)
# Wind
df_new['WindCARQ'] = pd.DataFrame(np.full((array_len, 1), row['WIND']), index=new_index).loc[pos_datetime, 0]
df_new['WindModel'] = df.loc[pos_datetime, 'WIND']
df_storage['row_WIND'] = pd.DataFrame(np.full((array_len, 1), row['WIND']), index=new_index).loc[pos_datetime, 0]
df_storage['df_WIND'] = df.loc[pos_datetime, 'WIND']
def wind_error_calc(row):
return (row['row_WIND'] - row['df_WIND']) if row['df_WIND'] != 0 else None
df_new['WindError'] = df_storage.apply(wind_error_calc, axis=1)
# Air Pressure
df_new['PresCARQ'] = pd.DataFrame(np.full((array_len, 1), row['PRES']), index=new_index).loc[pos_datetime, 0]
df_new['PresModel'] = df.loc[pos_datetime, 'PRES']
df_storage['row_PRES'] = pd.DataFrame(np.full((array_len, 1), row['PRES']), index=new_index).loc[pos_datetime, 0]
df_storage['df_PRES'] = df.loc[pos_datetime, 'PRES']
def pres_error_calc(row):
return abs(row['row_PRES'] - row['df_PRES']) if row['df_PRES'] != 0 else None
df_new['PresError'] = df_storage.apply(pres_error_calc, axis=1)
del(df_storage)
return df_new
def calculate_adeck_errors(in_file):
"""
Retruns: DataFrame
"""
print(f'Starting Data Calculations: {datetime.datetime.now().strftime("%I:%M:%S%p on %B %d, %Y")}')
pd.set_option('max_columns', 20)
pd.set_option('max_rows', 300)
# read in the raw csv
adeck_df = pd.read_csv(in_file)
adeck_df['MODELDATETIME'] = pd.to_datetime(adeck_df['MODELDATETIME'], format='%Y-%m-%d %H:%M')
adeck_df['POSDATETIME'] = pd.to_datetime(adeck_df['POSDATETIME'], format='%Y-%m-%d %H:%M')
#extract only the carq items and remove duplicates
carq_data = adeck_df[(adeck_df.MODEL == 'CARQ') & (adeck_df.TAU == 0)].drop_duplicates(keep='last')
print('Len carq_data: ', len(carq_data))
#remove carq items from original
final_df = adeck_df[adeck_df.MODEL != 'CARQ']
print('Len final_df: ', len(final_df))
df_out_new = pd.DataFrame()
for index, row in carq_data.iterrows():
test_df = main_function(final_df, row) # function call
df_out_new = df_out_new.append(test_df, sort=False)
df_out_new = df_out_new.reset_index(drop=True)
df_out_new = df_out_new.where((pd.notnull(df_out_new)), None)
print(f'Finishing Data Calculations: {datetime.datetime.now().strftime("%I:%M:%S%p on %B %d, %Y")}')
return df_out_new
in_file = 'aal062018.csv'
df = calculate_adeck_errors(in_file)
>>>Starting Data Calculations: 02:18:30AM on September 13, 2018
>>>Len carq_data: 56
>>>Len final_df: 137999
>>>Finishing Data Calculations: 02:18:39AM on September 13, 2018
print(len(df))
>>>95630
print(df.head(20))
请不要忘记检查接受的解决方案。享受吧!