Question

我有一个400万行的csv。对于每一行，我取出它之前的所有行，根据当前行的多个条件对它们进行过滤，并生成一些统计信息。总耗时太长，所以我想找到一种加速的方法。我在以下几种方案之间犹豫：

一个sqlite3数据库
使用pandas
将itertools.ifilter与csv文件一起使用

以下是我的实际代码，使用如下所述的for循环：

 # -*- coding: utf-8 -*-

import csv
import numpy as np
import time
import itertools


# Functions

def get_shmeio_stats(data, reference_list):
    """Summarise how often each reference value occurs in *data*.

    Returns the per-value counts joined with '-' (in the order given by
    *reference_list*), e.g. '2-1-0'.  Returns the placeholder '-' when
    *data* is empty or when the summary is exactly '0-0-0'.
    """
    if not data:
        return '-'

    counts = [str(data.count(value)) for value in reference_list]
    summary = '-'.join(counts)

    # An all-zero tally carries no information; report it as "no data".
    return '-' if summary == '0-0-0' else summary


# One output row per match; filled by the loop below and written out at the end.
bet365_data_stats = []
# Outcome labels tallied by get_shmeio_stats; the counts are reported in this order.
shmeia_list = ['1', 'x', '2']
# Binary mode: this script is Python 2 (see the print statements), whose csv
# module expects files opened with 'rb' / 'wb'.
with open('BET365.csv', 'rb') as f:
    bet365_data = csv.reader(f)
    # Load every row into memory and drop the first one
    # (presumably the header row -- confirm against the csv).
    bet365_matches = list(bet365_data)[1:]

    start = time.time()
    # NOTE(review): every match rescans all earlier rows, so the total work
    # grows quadratically with the number of rows.
    for index, each_match in enumerate(bet365_matches):
        print index,
        start_each = time.time()
        id = index
#             print index
        # Column layout of a row (indices 0..13), as fixed by this unpacking:
        # 0 protathlima, 1 xronia, 2 match_date, 3 home, 4 odd_1, 5 odd_x,
        # 6 odd_2, 7 away, 8 score, 9 score_1, 10 score_2, 11 simeio,
        # 12 favori, 13 under_over
        protathlima, xronia, match_date, home, odd_1, odd_x, odd_2, away, score, score_1, score_2, simeio, favori, under_over = each_match
        # Slice copy of all rows that come before the current one.
        previous_matches = bet365_matches[:index]

        # Each list collects column 11 (simeio) of the earlier rows that match
        # the conditions applied in the loop below.
        home_1, home_x, home_2 = [], [], []  # `home` played at home with the same odd_1 / odd_x / odd_2
        away_1, away_x, away_2 = [], [], []  # `away` played away with the same odd_1 / odd_x / odd_2
        home_all_yrs_protathlima, away_all_yrs_protathlima = [], []  # same side and same protathlima, any xronia
        home_forma_last_6_home, away_forma_last_6_away = [], []  # same side, same protathlima and xronia
        home_forma_last_6_home_away, away_forma_last_6_home_away = [], []  # either side, same protathlima and xronia
        akrivis_protathlima, akrivis_genika = [], []  # all three odds identical (within protathlima / overall)
        # These two collect float(score_1) / float(score_2) instead of simeio.
        mesos_oros_goal_home_last_6_home, mesos_oros_goal_away_last_6_away = [], []

        for each_item in previous_matches:
            # Earlier row in which the current home team was also the home side.
            if each_item[3] == home:

                # home_1, home_x, home_2
                if each_item[4] == odd_1:
                    home_1.append(each_item[11])
                if each_item[5] == odd_x:
                    home_x.append(each_item[11])
                if each_item[6] == odd_2:
                    home_2.append(each_item[11])

                # home_all_yrs_protathlima
                if each_item[0] == protathlima:
                    home_all_yrs_protathlima.append(each_item[11])

                    # home_forma_last_6_home (also feeds the home+away list and the goal list)
                    if each_item[1] == xronia:
                        home_forma_last_6_home.append(each_item[11])
                        home_forma_last_6_home_away.append(each_item[11])
                        mesos_oros_goal_home_last_6_home.append(float(each_item[9]))

            # Earlier row in which the current home team was the away side.
            if each_item[7] == home:
                if each_item[0] == protathlima:
                    # home_forma_last_6_home_away
                    if each_item[1] == xronia:
                        home_forma_last_6_home_away.append(each_item[11])

            # Earlier row in which the current away team was the home side.
            if each_item[3] == away:
                if each_item[0] == protathlima:
                    # away_forma_last_6_home_away
                    if each_item[1] == xronia:
                        away_forma_last_6_home_away.append(each_item[11])

            # Earlier row in which the current away team was also the away side.
            if each_item[7] == away:

                # away_1, away_x, away_2
                if each_item[4] == odd_1:
                    away_1.append(each_item[11])
                if each_item[5] == odd_x:
                    away_x.append(each_item[11])
                if each_item[6] == odd_2:
                    away_2.append(each_item[11])

                # away_all_yrs_protathlima
                if each_item[0] == protathlima:
                    away_all_yrs_protathlima.append(each_item[11])

                    # away_forma_last_6_away (also feeds the home+away list and the goal list)
                    if each_item[1] == xronia:
                        away_forma_last_6_away.append(each_item[11])
                        away_forma_last_6_home_away.append(each_item[11])
                        mesos_oros_goal_away_last_6_away.append(float(each_item[10]))

            # akrivis_protathlima, akrivis_genika: all three odds equal to the current row's
            if each_item[4] == odd_1 and each_item[5] == odd_x and each_item[6] == odd_2:
                akrivis_genika.append(each_item[11])
                if each_item[0] == protathlima:
                    akrivis_protathlima.append(each_item[11])

        # Time spent on the filtering pass for this row.
        stop_filter = time.time() - start_each
        print round(stop_filter, 6),

        # Calculate statistics: each list is replaced by its summary string
        # from get_shmeio_stats.

        # home_1, home_x, home_2
        home_1 = get_shmeio_stats(home_1, shmeia_list)
        home_x = get_shmeio_stats(home_x, shmeia_list)
        home_2 = get_shmeio_stats(home_2, shmeia_list)

        # away_1, away_x, away_2
        away_1 = get_shmeio_stats(away_1, shmeia_list)
        away_x = get_shmeio_stats(away_x, shmeia_list)
        away_2 = get_shmeio_stats(away_2, shmeia_list)

        # home_all_yrs_protathlima, away_all_yrs_protathlima
        home_all_yrs_protathlima = get_shmeio_stats(home_all_yrs_protathlima, shmeia_list)
        away_all_yrs_protathlima = get_shmeio_stats(away_all_yrs_protathlima, shmeia_list)

        # home_forma_last_6_home, away_forma_last_6_away: only the last six entries count
        home_forma_last_6_home = get_shmeio_stats(home_forma_last_6_home[-6:], shmeia_list)
        away_forma_last_6_away = get_shmeio_stats(away_forma_last_6_away[-6:], shmeia_list)

        # home_forma_last_6_home_away, away_forma_last_6_home_away: only the last six entries count
        home_forma_last_6_home_away = get_shmeio_stats(home_forma_last_6_home_away[-6:], shmeia_list)
        away_forma_last_6_home_away = get_shmeio_stats(away_forma_last_6_home_away[-6:], shmeia_list)

        # akrivis_protathlima, akrivis_genika
        akrivis_protathlima = get_shmeio_stats(akrivis_protathlima, shmeia_list)
        akrivis_genika = get_shmeio_stats(akrivis_genika, shmeia_list)

        # mesos_oros_goal_home_last_6_home, mesos_oros_goal_away_last_6_away:
        # mean of the last six collected values rounded to 2 decimals, or '-' when empty.
        # NOTE(review): the bare `except:` clauses below hide every error (the
        # float() conversions happen earlier, outside these try blocks).
        try:
            if mesos_oros_goal_home_last_6_home:
                    mesos_oros_goal_home_last_6_home = round(np.average(mesos_oros_goal_home_last_6_home[-6:]), 2)
            else:
                mesos_oros_goal_home_last_6_home = '-'
        except:
            mesos_oros_goal_home_last_6_home = '-'

        try:
            if mesos_oros_goal_away_last_6_away:
                    mesos_oros_goal_away_last_6_away = round(np.average(mesos_oros_goal_away_last_6_away[-6:]), 2)
            else:
                mesos_oros_goal_away_last_6_away = '-'
        except:
            mesos_oros_goal_away_last_6_away = '-'

        # Time spent so far on this row (filtering + statistics).
        stop_function = time.time() - start_each
        print round(stop_function, 6),


        # Output row: the row index, the 14 original columns, then the 16 computed statistics.
        match_stats = [id, protathlima, xronia, match_date, home, odd_1, odd_x,
                odd_2, away, score, score_1, score_2, simeio,
                favori, under_over, home_1, home_x, home_2, away_1,
                away_x, away_2, home_all_yrs_protathlima,
                away_all_yrs_protathlima, home_forma_last_6_home,
                away_forma_last_6_away, home_forma_last_6_home_away,
                away_forma_last_6_home_away, akrivis_protathlima,
                akrivis_genika, mesos_oros_goal_home_last_6_home,
                mesos_oros_goal_away_last_6_away]

        bet365_data_stats.append(match_stats)

        # Total time for this row.
        stop_each = time.time() - start_each
        print round(stop_each, 6)

    stop = time.time() - start
    print 'Completed in:', stop

# Write all collected rows at once; no header row is written.
with open('BET365_stats_loop.csv', 'wb') as f:
    bet365_stats = csv.writer(f)
    bet365_stats.writerows(bet365_data_stats)

这是我的csv的一部分： enter image description here

我已将它运行到180 000行，每行最多花费0.3秒。

您是否认为由于数据的数量无论我选择什么，整个过程都会花费太长时间？

更新：最后，我改用字典，在需要时添加新键并相应地更新它们。这样速度非常快。

Answer 1

您的复杂度非常高，因为对于列表中直到索引K的每一次迭代，您都会用ifilter在直到索引K-1的列表（previous_matches）上迭代超过16次。这非常低效。您应该只迭代previous_matches一次，并在这一次迭代中把所有列表（home_1，home_x等）都构建出来。

所以看起来应该是这样的：

 for index, each_match in enumerate(bet365_matches):
        id = index
        protathlima, xronia, match_date, home, odd_1, odd_x, odd_2, away, score, score_1, score_2, simeio, favori, under_over = each_match
        previous_matches = bet365_matches[:index]

        home_1 = []
        home_x = []
        home_2 = []
        # And the rest below
        # Walk the earlier rows exactly once and fill every result list in
        # that single pass, instead of filtering the same rows once per list.
        for item in previous_matches:
            # home
            if item[3] == home:
                # Store column 11 (simeio): get_shmeio_stats counts outcome
                # labels, so it needs the label rather than the whole row.
                if item[4] == odd_1:
                    home_1.append(item[11])
                if item[5] == odd_x:
                    home_x.append(item[11])
                if item[6] == odd_2:
                    home_2.append(item[11])
            # Same pattern for everything else

        home_x = get_shmeio_stats(home_x, shmeia_list)
        home_1 = get_shmeio_stats(home_1, shmeia_list)
        home_2 = get_shmeio_stats(home_2, shmeia_list)
        # And the rest

Answer 2

（这个答案很通用，因为你的问题没有具体细节。）

确保您没有两次走过数据。例如，如果您的处理如下所示：

# Anti-pattern: the inner comprehension builds a full intermediate list
# before the outer one walks it again.
output = [step2(i) for i in [step1(j) for j in get_list()]]

或

# Anti-pattern: each comprehension builds a complete list that the next
# line then iterates over again.
tmp = [step1(i) for i in get_list()]
tmp = [step2(i) for i in tmp]
tmp = [i for i in tmp if cond(i)]
#...

将这些步骤合并为一个会更好：

# Single pass: both steps are applied to each element as it is produced.
output = [step2(step1(i)) for i in get_list()]

或使用generator expressions代替列表推导：

# Generator expressions are lazy: nothing is computed and no intermediate
# list is built until the pipeline is finally consumed.
tmp = (step1(i) for i in get_list())
tmp = (step2(i) for i in tmp)
tmp = (i for i in tmp if cond(i))
#...
output = list(tmp) # if you need the output as a list

如何在python中优化大型列表的过滤？

2 个答案: