我有一个400万行的csv。对于每一行,我取出它之前的所有行,根据当前行的多个条件对这些行进行过滤,并生成一些统计信息。整个过程耗时太长,所以我想找到一种加速的方法,目前正在以下几种方案之间权衡:
以下是我的实际代码,使用如下所述的for循环:
# -*- coding: utf-8 -*-
import csv
import numpy as np
import time
import itertools
# Functions
def get_shmeio_stats(data, reference_list):
    """Summarise how often each reference value occurs in ``data``.

    Args:
        data: sequence of values to tally (must provide ``.count``).
        reference_list: values to count, in the order they should appear
            in the result.

    Returns:
        The counts joined with ``'-'`` (for example ``'2-1-0'``), or the
        placeholder ``'-'`` when ``data`` is empty or the joined result is
        exactly ``'0-0-0'``.
    """
    if not data:
        return '-'
    shmeio_stats = '-'.join(str(data.count(ref)) for ref in reference_list)
    # Only the three-value all-zero tally is collapsed to the placeholder;
    # other lengths (e.g. '0-0') are returned unchanged.
    if shmeio_stats == '0-0-0':
        return '-'
    return shmeio_stats
bet365_data_stats = []
shmeia_list = ['1', 'x', '2']
with open('BET365.csv', 'rb') as f:
bet365_data = csv.reader(f)
bet365_matches = list(bet365_data)[1:]
start = time.time()
for index, each_match in enumerate(bet365_matches):
print index,
start_each = time.time()
id = index
# print index
protathlima, xronia, match_date, home, odd_1, odd_x, odd_2, away, score, score_1, score_2, simeio, favori, under_over = each_match
previous_matches = bet365_matches[:index]
home_1, home_x, home_2 = [], [], [] # home_1, home_x, home_2
away_1, away_x, away_2 = [], [], [] # away_1, away_x, away_2
home_all_yrs_protathlima, away_all_yrs_protathlima = [], [] # home_all_yrs_protathlima, away_all_yrs_protathlima
home_forma_last_6_home, away_forma_last_6_away = [], [] # home_forma_last_6_home, away_forma_last_6_away
home_forma_last_6_home_away, away_forma_last_6_home_away = [], [] # home_forma_last_6_home_away, away_forma_last_6_home_away
akrivis_protathlima, akrivis_genika = [], [] # akrivis_protathlima, akrivis_genika
mesos_oros_goal_home_last_6_home, mesos_oros_goal_away_last_6_away = [], [] # mesos_oros_goal_home_last_6_home, mesos_oros_goal_away_last_6_away
for each_item in previous_matches:
if each_item[3] == home:
# home_1, home_x, home_2
if each_item[4] == odd_1:
home_1.append(each_item[11])
if each_item[5] == odd_x:
home_x.append(each_item[11])
if each_item[6] == odd_2:
home_2.append(each_item[11])
# home_all_yrs_protathlima
if each_item[0] == protathlima:
home_all_yrs_protathlima.append(each_item[11])
# home_forma_last_6_home
if each_item[1] == xronia:
home_forma_last_6_home.append(each_item[11])
home_forma_last_6_home_away.append(each_item[11])
mesos_oros_goal_home_last_6_home.append(float(each_item[9]))
if each_item[7] == home:
if each_item[0] == protathlima:
# home_forma_last_6_home_away
if each_item[1] == xronia:
home_forma_last_6_home_away.append(each_item[11])
if each_item[3] == away:
if each_item[0] == protathlima:
# away_forma_last_6_away
if each_item[1] == xronia:
away_forma_last_6_home_away.append(each_item[11])
if each_item[7] == away:
# away_1, away_x, away_2
if each_item[4] == odd_1:
away_1.append(each_item[11])
if each_item[5] == odd_x:
away_x.append(each_item[11])
if each_item[6] == odd_2:
away_2.append(each_item[11])
# away_all_yrs_protathlima
if each_item[0] == protathlima:
away_all_yrs_protathlima.append(each_item[11])
# away_forma_last_6_away
if each_item[1] == xronia:
away_forma_last_6_away.append(each_item[11])
away_forma_last_6_home_away.append(each_item[11])
mesos_oros_goal_away_last_6_away.append(float(each_item[10]))
# akrivis_protathlima, akrivis_genika
if each_item[4] == odd_1 and each_item[5] == odd_x and each_item[6] == odd_2:
akrivis_genika.append(each_item[11])
if each_item[0] == protathlima:
akrivis_protathlima.append(each_item[11])
stop_filter = time.time() - start_each
print round(stop_filter, 6),
# Calculate statistics
# home_1, home_x, home_2
home_1 = get_shmeio_stats(home_1, shmeia_list)
home_x = get_shmeio_stats(home_x, shmeia_list)
home_2 = get_shmeio_stats(home_2, shmeia_list)
# away_1, away_x, away_2
away_1 = get_shmeio_stats(away_1, shmeia_list)
away_x = get_shmeio_stats(away_x, shmeia_list)
away_2 = get_shmeio_stats(away_2, shmeia_list)
# home_all_yrs_protathlima, away_all_yrs_protathlima
home_all_yrs_protathlima = get_shmeio_stats(home_all_yrs_protathlima, shmeia_list)
away_all_yrs_protathlima = get_shmeio_stats(away_all_yrs_protathlima, shmeia_list)
# home_forma_last_6_home, away_forma_last_6_away
home_forma_last_6_home = get_shmeio_stats(home_forma_last_6_home[-6:], shmeia_list)
away_forma_last_6_away = get_shmeio_stats(away_forma_last_6_away[-6:], shmeia_list)
# home_forma_last_6_home_away, away_forma_last_6_home_away
home_forma_last_6_home_away = get_shmeio_stats(home_forma_last_6_home_away[-6:], shmeia_list)
away_forma_last_6_home_away = get_shmeio_stats(away_forma_last_6_home_away[-6:], shmeia_list)
# akrivis_protathlima, akrivis_genika
akrivis_protathlima = get_shmeio_stats(akrivis_protathlima, shmeia_list)
akrivis_genika = get_shmeio_stats(akrivis_genika, shmeia_list)
# mesos_oros_goal_home_last_6_home, mesos_oros_goal_away_last_6_away
try:
if mesos_oros_goal_home_last_6_home:
mesos_oros_goal_home_last_6_home = round(np.average(mesos_oros_goal_home_last_6_home[-6:]), 2)
else:
mesos_oros_goal_home_last_6_home = '-'
except:
mesos_oros_goal_home_last_6_home = '-'
try:
if mesos_oros_goal_away_last_6_away:
mesos_oros_goal_away_last_6_away = round(np.average(mesos_oros_goal_away_last_6_away[-6:]), 2)
else:
mesos_oros_goal_away_last_6_away = '-'
except:
mesos_oros_goal_away_last_6_away = '-'
stop_function = time.time() - start_each
print round(stop_function, 6),
match_stats = [id, protathlima, xronia, match_date, home, odd_1, odd_x,
odd_2, away, score, score_1, score_2, simeio,
favori, under_over, home_1, home_x, home_2, away_1,
away_x, away_2, home_all_yrs_protathlima,
away_all_yrs_protathlima, home_forma_last_6_home,
away_forma_last_6_away, home_forma_last_6_home_away,
away_forma_last_6_home_away, akrivis_protathlima,
akrivis_genika, mesos_oros_goal_home_last_6_home,
mesos_oros_goal_away_last_6_away]
bet365_data_stats.append(match_stats)
stop_each = time.time() - start_each
print round(stop_each, 6)
stop = time.time() - start
print 'Completed in:', stop
with open('BET365_stats_loop.csv', 'wb') as f:
bet365_stats = csv.writer(f)
bet365_stats.writerows(bet365_data_stats)
这是我的csv的一部分:
我已将它运行到180 000行,每行最多花费0.3秒。
您是否认为由于数据的数量无论我选择什么,整个过程都会花费太长时间?
更新:最后,我改用字典,在需要时添加新的键并相应地更新它们。速度非常快。
答案 0 :(得分:3)
您的算法复杂度非常高,因为对于列表中直到索引K的每一次迭代,您都会用 ifilter 在直到索引K-1的列表(previous_matches)上迭代超过16次。这非常低效。您应该只迭代 previous_matches 一次,并在这一次迭代中把所有列表(home_1、home_x 等)都构建出来。
所以看起来应该是这样的:
# Sketch of the single-pass approach: walk previous_matches once per row and
# fill every result list during that one walk.
for index, each_match in enumerate(bet365_matches):
    id = index
    protathlima, xronia, match_date, home, odd_1, odd_x, odd_2, away, score, score_1, score_2, simeio, favori, under_over = each_match
    previous_matches = bet365_matches[:index]
    home_1 = []
    home_x = []
    home_2 = []
    # And the rest below

    for item in previous_matches:
        # home
        if item[3] == home:
            # Store column 11 only: get_shmeio_stats counts '1'/'x'/'2'
            # values, so appending whole rows would always tally zero.
            if item[4] == odd_1:
                home_1.append(item[11])
            if item[5] == odd_x:
                home_x.append(item[11])
            if item[6] == odd_2:
                home_2.append(item[11])
        # Same pattern for everything else

    home_x = get_shmeio_stats(home_x, shmeia_list)
    home_1 = get_shmeio_stats(home_1, shmeia_list)
    home_2 = get_shmeio_stats(home_2, shmeia_list)
    # And the rest
答案 1 :(得分:0)
(这个答案很通用,因为你的问题没有具体细节。)
确保您没有重复遍历数据。例如,如果您的处理如下所示:
# Nested comprehension: the inner list is fully built before the outer pass.
output = [step2(i) for i in [step1(j) for j in get_list()]]
或
# Each list comprehension below builds a complete intermediate list and
# then walks the data again in the next step.
tmp = [step1(i) for i in get_list()]
tmp = [step2(i) for i in tmp]
tmp = [i for i in tmp if cond(i)]
#...
将这些步骤合并为一个会更好:
# Both steps applied per element in a single pass over get_list().
output = [step2(step1(i)) for i in get_list()]
或使用generator expressions代替列表推导:
# Generator expressions are lazy: no element is processed until the chain
# is consumed, so the data is walked only once.
tmp = (step1(i) for i in get_list())
tmp = (step2(i) for i in tmp)
tmp = (i for i in tmp if cond(i))
#...
output = list(tmp) # if you need the output as a list