Question

我有一个由列表组成的列表（嵌套列表），比如说：

tripInfo_csv = [['1','2',6,2], ['a','h',4,2], ['1','4',6,1], ['1','8',18,3], ['a','8',2,1]]

将子列表视为旅行：[起点，终点，成人人数，儿童人数]

我的目标是获得一个列表，其中具有重合起点和终点的行程将其第三和第四值相加。开始值和结束值应始终为1到8之间的数字。例如，如果它们是字母，则应将其替换为相应的数字（a = 1，b = 2，依此类推）。

这是我的代码。它可以工作，但我确信它还能改进。我主要关心的是性能。我有很多像这样的列表，而且其中的子列表要多得多。

# Maps a letter point label to the digit string that replaces it.
dicPoints = {'a':'1','b':'2','c':'3', 'd':'4', 'e':'5', 'f':'6', 'g':'7', 'h':'8'}
def getTrips (trips):
    """Merge trips that share start and end points by summing their counts.

    Each trip is a list [start, end, adults, children]. Non-digit start/end
    labels are replaced in place through dicPoints, and counts are added in
    place to the first trip stored for a (start, end) pair, so the input
    lists are modified.

    NOTE: okTrips is built locally but never returned.
    NOTE: this version double-counts newly added trips; see the comment on
    the append inside the inner loop.
    """
    okTrips = []
    for trip in trips:
        if not trip[0].isdigit():
            trip[0] = dicPoints[trip[0]]
        if not trip[1].isdigit():
            trip[1] = dicPoints[trip[1]]

        if len(okTrips) == 0:
            okTrips.append(trip)
        else:
            for i, stop in enumerate(okTrips):
                if stop[0] == trip[0] and stop[1] == trip[1]:
                    stop[2] += trip[2]
                    stop[3] += trip[3]
                    break
                else:
                    if i == len(okTrips)-1:
                        # BUG: okTrips grows while it is being iterated, so
                        # the loop goes on to visit the trip just appended.
                        # That element is `trip` itself, it matches, and its
                        # counts are added to themselves (doubled).
                        okTrips.append(trip)

正如 eguaio 所提到的，上面的代码有一个错误。它应该是这样的：

def getTrips(trips):
    """Merge trips that share the same start and end points.

    Each trip is a list ``[start, end, adults, children]``. Non-digit
    start/end labels are replaced in place with their digit strings from
    ``dicPoints``. The first trip seen for a (start, end) pair accumulates
    the adult and child counts of later matching trips, so the input lists
    are modified.

    Returns the list of merged trips in order of first appearance.
    """
    okTrips = []
    for trip in trips:
        if not trip[0].isdigit():
            trip[0] = dicPoints[trip[0]]
        if not trip[1].isdigit():
            trip[1] = dicPoints[trip[1]]

        for stop in okTrips:
            if stop[0] == trip[0] and stop[1] == trip[1]:
                stop[2] += trip[2]
                stop[3] += trip[3]
                break
        else:
            # The loop finished without a match (or okTrips was empty):
            # this is the first trip with these end points. Appending after
            # the loop avoids growing the list while it is being iterated.
            okTrips.append(trip)

    return okTrips

多亏了 eguaio 的回答，我得到了一个改进的版本，想在这里分享。下面是我基于他的回答编写的脚本。我的数据和需求现在比我最初描述的要复杂得多，所以我做了一些改动。

CSV文件如下所示：

LineT;Line;Route;Day;Start_point;End_point;Adults;Children;First_visit
SM55;5055;3;Weekend;15;87;21;4;0 
SM02;5002;8;Weekend;AF3;89;5;0;1 
...

脚本：

import os, csv, psycopg2

# Root folder that is walked for CSV files
folder = "F:/route_project/routes"

# Day type
dicDay = {'Weekday':1,'Weekend':2,'Holiday':3}

# Dictionary with the start and end points of each route
#  built from a Postgresql table (with columns: line_route, start, end)
conn = psycopg2.connect (database="test", user="test", password="test", host="###.###.#.##")
cur = conn.cursor()
cur.execute('select id_linroute, start_p, end_p from route_ends')
recs = cur.fetchall()
# Key is id_linroute; value is the rest of the row, i.e. (start_p, end_p),
#  so index 0 is the start point and index 1 is the end point
dicPoints = {rec[0]: rec[1:] for rec in recs}

# When point labels are text, replace them with a number label in dicPoints
# Text is not important: they are special text labels for start and end
#  of routes (for athletes), so we replace them with labels for start or
#  the end of each route
def convert_point(line, route, point, i):
    """Return a numeric label for point.

    A label made only of digits is returned unchanged. Any other label is
    replaced by element i of the dicPoints entry stored under the key
    "<line>_<route>".
    """
    if not point.isdigit():
        route_key = "%s_%s" % (line, route)
        return dicPoints[route_key][i]
    return point

# Points with text labels mean athletes made the whole or part of this route,
#  we keep them as adults but also keep this number as an extra value
#  for further purposes
def num_athletes(start_p, end_p, adults):
    """Return adults if either point label is non-numeric, otherwise 0."""
    both_numeric = start_p.isdigit() and end_p.isdigit()
    return 0 if both_numeric else adults

# Data is taken for CSV files in subfolders
# Data is read from every ".csv" file found under folder (subfolders included).
# For each file, rows are aggregated into dicCSV, keyed by
#  ("<lin>_<route>_<start>", "<lin>_<route>_<end>", day type), with the value
#  being the running totals (adults, children, athletes).
for root, dirs, files in os.walk(folder):
    for file in files:
        if file.endswith(".csv"):
            file_path = (os.path.join(root, file))
            with open(file_path, 'rb') as csvfile:
                rows = csv.reader(csvfile, delimiter=';', quotechar='"')
                # Skips the CSV header row
                rows.next()
                # linT is not used, yet it's found in every CSV file
                # There's an unused last column in every file, I take advantage out of it
                #  to store the number of athletes in the generator
                gen =((lin, route, dicDay[tday], convert_point(lin,route,s_point,0), convert_point(lin,route,e_point,1), adults, children, num_athletes(s_point,e_point,adults)) for linT, lin, route, tday, s_point, e_point, adults, children, athletes in rows)
                # NOTE(review): dicCSV is re-created for every CSV file, so
                #  totals are per file and the previous file's dict is dropped
                dicCSV = {}
                for lin, route, tday, s_point, e_point, adults, children, athletes in gen:
                    # (0, 0, 0) is the starting total for a key not seen yet
                    visitors = dicCSV.get(("%s_%s_%s" % (lin,route,s_point), "%s_%s_%s" % (lin,route,e_point), tday), (0, 0, 0))
                    # CSV fields arrive as strings, hence the int() conversions
                    dicCSV[("%s_%s_%s" % (lin,route,s_point), "%s_%s_%s" % (lin,route,e_point), tday)] = (visitors[0] + int(adults), visitors[1] + int(children), visitors[2] + int(athletes))

# NOTE(review): this loop runs once, after the whole walk, so it only prints
#  the dicCSV of the last CSV file processed, and dicCSV is undefined if no
#  CSV file was found. Confirm whether per-file output was intended.
for k,v in dicCSV.iteritems():
    print k, v

Answer 1

为了更有效地处理这个问题，最好按起点和终点对输入列表进行排序，以便将具有匹配起点和终点的行组合在一起。然后我们可以轻松使用groupby函数有效地处理这些组。

from operator import itemgetter
from itertools import groupby

tripInfo_csv = [
    ['1', '2', 6, 2], 
    ['a', 'h', 4, 2], 
    ['1', '4', 6, 1], 
    ['1', '8', 18, 3], 
    ['a', '8', 2, 1],
]

# Used to convert alphabetic point labels to numeric form
dicPoints = dict(zip('abcdefgh', '12345678'))

def fix_points(seq):
    """Return a new list of the labels in seq, with letters turned into digits.

    Labels that are not keys of dicPoints are passed through unchanged.
    """
    fixed = []
    for label in seq:
        fixed.append(dicPoints.get(label, label))
    return fixed

# Ensure that all point labels are numeric
for row in tripInfo_csv:
    row[:2] = fix_points(row[:2])

# Sort on point labels
keyfunc = itemgetter(0, 1)
tripInfo_csv.sort(key=keyfunc)

# Group on point labels and sum corresponding adult & child numbers
newlist = []
for (start, end), members in groupby(tripInfo_csv, key=keyfunc):
    # groupby yields a one-shot iterator; materialise it so that it can be
    # traversed once per summed column.
    members = list(members)
    adults = sum(trip[2] for trip in members)
    children = sum(trip[3] for trip in members)
    newlist.append([start, end, adults, children])

# Print the condensed list
for row in newlist:
    print(row)

输出：

['1', '2', 6, 2]
['1', '4', 6, 1]
['1', '8', 24, 6]

Answer 2

对于需要大量合并的大型列表，下面的实现比您的代码快得多：处理 tripInfo_csv*500000 只需 2 秒，而您的代码需要 1 分钟。我们使用 dict，借助其常数时间的键查找获得近似线性的复杂度。依我之见，它也更优雅。请注意 tg 是一个生成器，因此创建它时不会占用大量时间或内存。

def newGetTrips(trips):
    """Merge trips that share the same start and end points.

    Each trip is a sequence ``(start, end, adults, children)``. Non-digit
    start/end labels are translated through ``dicPoints``. The input is not
    modified.

    Returns a list of ``[start, end, adults, children]`` lists, one per
    distinct (start, end) pair, with the counts summed.
    """

    def convert(label):
        return label if label.isdigit() else dicPoints[label]

    # tg is a generator: trips are converted lazily, one at a time.
    tg = ((convert(a), convert(b), c, d) for a, b, c, d in trips)
    okt = {}
    for a, b, c, d in tg:
        # a trick to get (0,0) as default if (a,b) is not a key of the dictionary yet
        t = okt.get((a, b), (0, 0))
        okt[(a, b)] = (t[0] + c, t[1] + d)
    # items() is available on both Python 2 and 3; iteritems() is Python 2 only.
    return [[a, b, c, d] for (a, b), (c, d) in okt.items()]

此外，您的代码有一个副作用：它会修改传入的 trips 列表，而这个函数会保持其不变。另外，您的代码有一个错误：对于每个（起点，终点）组合，第一次出现的那一项会被累加两次（整个列表的第一项除外）。我没能找到原因，但在运行示例时，使用 getTrips 我得到了：

[['1', '2', 6, 2], ['1', '8', 28, 8], ['1', '4', 12, 2]]

而使用 newGetTrips 我得到：

[['1', '8', 24, 6], ['1', '2', 6, 2], ['1', '4', 6, 1]]

Answer 3

看看这是否有帮助

trips = [['1','2',6,2], ['a','h',4,2], ['1','2',6,1], ['1','8',18,3], ['a','h',2,1]]

# To get the equivalent value
def x(n):
    """Return the point number (1-8) for a label '1'-'8' or 'a'-'h'."""
    if '1' <= n <= '8':
        return int(n)
    # 'a' must map to 1 (not 0), so shift the alphabet offset by one.
    return ord(n) - ord('a') + 1

# To group lists with similar start and end points
from collections import defaultdict


groups = defaultdict(list)

for trip in trips:
    # Grouping based on the normalised (numeric) start and end point, so
    # that e.g. ['a', 'h', ...] and ['1', '8', ...] land in the same group.
    groups[(x(trip[0]), x(trip[1]))].append(trip)

result = []
for (start, end), group in groups.items():
    # Take start/end from the group key rather than from the first trip, so
    # the output always carries numeric points even if that trip used letters.
    adults = sum(trip[2] for trip in group)
    children = sum(trip[3] for trip in group)
    result.append([start, end, adults, children])

print(result)

Answer 4

假设起点和终点在0到n之间。

那么，结果 'okTrips' 最多有 n^2 个元素。于是，您的第二个函数中循环的复杂度为 O(n^2)。如果不在意空间复杂度，可以把复杂度降低到 O(n)。

首先，创建一个包含 n 个列表的 dict，使得第 k 个子列表包含以 'k' 为起点的行程。

当您搜索是否存在具有相同起点和终点的不同行程时，您只需搜索相应的子列表而不是搜索所有元素。

这个想法来自稀疏矩阵存储技术。我无法检查以下代码的验证。

代码如下，

dicPoints = {'a':'1','b':'2','c':'3', 'd':'4', 'e':'5', 'f':'6', 'g':'7', 'h':'8'}

def getTrips(trips):
    """Merge trips that share start and end points, bucketed by start point.

    Each trip is a sequence ``[start, end, adults, children]``. Non-digit
    start/end labels are translated through ``dicPoints``. Merged trips are
    kept in one list per start point, so a new trip is only compared against
    trips that begin at the same point.

    The input trips are not modified, and no state is kept between calls.

    Returns a list of ``[start, end, adults, children]`` lists, one per
    distinct (start, end) pair, with the counts summed.
    """
    # Temp maps a start point to the merged trips starting there. It is local
    # so that repeated calls do not accumulate into each other.
    Temp = {}
    for trip in trips:
        start = trip[0] if trip[0].isdigit() else dicPoints[trip[0]]
        end = trip[1] if trip[1].isdigit() else dicPoints[trip[1]]

        bucket = Temp.setdefault(start, [])
        for stop in bucket:
            if stop[1] == end:
                stop[2] += trip[2]
                stop[3] += trip[3]
                break
        else:
            # No trip with this end point in the bucket yet. Appending after
            # the loop avoids revisiting (and double-counting) the new entry.
            bucket.append([start, end, trip[2], trip[3]])

    okTrips = []
    for key in Temp:
        okTrips.extend(Temp[key])
    return okTrips

列表的列表：替换并累加子列表项

4 个答案: