如何对元组列表中相同的项进行统计

时间:2018-03-02 08:50:55

标签: python-3.x list tuples calculated-columns

我有一个这样的元组列表:

# Sample data: one (grade, quantity) tuple per matching spreadsheet row.
tup_list = [('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 21.325), ('UL00628', 6.675), ('UL00628', 22.5), ('UL00628', 5.5), ('UL00628', 15.525), ('UL00628', 12.475), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00428', 28.0), ('UL00428', 28.0), ('UL00428', 28.0), ('UL00428', 28.0), ('UL00428', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00428', 28.0), ('UL00428', 28.0), ('UL00428', 28.0), ('UL00428', 28.0), ('UL00428', 28.0), ('UL00428', 28.0), ('UL00428', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00628', 28.0), ('UL00428-OGA', 28.0)]

我想按相同的项目(如 UL00628、UL00428)对这些元组中的数量进行汇总。应该使用什么样的迭代方式?

顺便说一下,tup_list来自excel文件。代码如下:

# Collect one (grade, quantity) tuple for every sheet row whose load date
# falls inside a user-supplied date range.
# NOTE(review): `xlrd` must already be imported; the import is not part of
# this snippet.
load_all = dict()
file_name ='***.xls'
# Raw string: in a normal literal '\*' is an invalid escape sequence and
# triggers a warning on recent Python versions. The resulting characters are
# unchanged.
wb = xlrd.open_workbook(r'd:\**%s' % file_name)
table = wb.sheet_by_name('***')
date_start_month= int(input('Pls enter the date of start month:'))
date_start_day= int(input('Pls enter the date of start day:'))
date_end_month= int(input('Pls enter the date of end month:'))
date_end_day = int(input('Pls enter the date of end day:'))
count = 0
tup_list = list()
tup = tuple()
nrows = table.nrows

# Compare (month, day) pairs instead of testing month and day separately.
# The separate tests matched nothing for a range that crosses a month
# boundary (e.g. 1/25 .. 2/5, where range(25, 6) is empty). For a range inside
# a single month the result is the same as before. The year is still ignored.
range_start = (date_start_month, date_start_day)
range_end = (date_end_month, date_end_day)

# Only process the sheet when the header cell of column 11 has the expected label.
if table.cell(1, 11).value == '****':
    for num in range(2,nrows):
        date_of_load = table.cell(num,11).value #this is a date value
        # xldate_as_tuple returns (year, month, day, hour, minute, second).
        year,month,day,hous,minute,second = xlrd.xldate.xldate_as_tuple(date_of_load,0)
        if not range_start <= (month, day) <= range_end:
            continue
        grade_name = table.cell(num,3).value #grade
        grade_num = table.cell(num,5).value #quantity
        tup = (grade_name, grade_num)
        tup_list.append(tup)
        count +=1

^^^^^^^^^ 这是原始数据

NO  Grade   quantity    Loadday
9   UL00628 28.0000     2018/2/7
10  UL00628 28.0000     2018/2/7
11  UL00628 28.0000     2018/2/7
12  EVA-OGC 28.0000     2018/2/7
13  EVA-OGC 28.0000     2018/2/7
14  UL00628 28.0000     2018/2/8
15  UL00628 28.0000     2018/2/8
16  UL00628 28.0000     2018/2/19
17  UL00628 28.0000     2018/2/19
18  UL00628 28.0000     2018/2/19
19  UL00628 28.0000     2018/2/19
20  UL00628 28.0000     2018/2/19
21  UL00628 28.0000     2018/2/19
22  UL00628 28.0000     2018/2/19
23  UL00628 28.0000     2018/2/19
24  UL00628 28.0000     2018/2/20
25  UL00628 28.0000     2018/2/20
26  UL00628 28.0000     2018/2/20
27  UL00628 28.0000     2018/2/20
28  UL00628 28.0000     2018/2/20

我需要先筛选出装载日期符合条件的行,然后取出等级和数量,并按等级汇总数量。

1 个答案:

答案 0 :(得分:0)

对于数据处理,特别是涉及Excel或CSV文件时,我会使用pandas,而不是直接使用xlrd(或openpyxl)这类底层库

除此之外,一旦你有了元组列表,就可以使用defaultdict

from collections import defaultdict
# Group the quantities by grade: each grade maps to the list of its quantities.
results = defaultdict(list)
for pair in tup_list:
    name, amount = pair
    results[name].append(amount)

其他问题

我会做的其他改变是

  • 将程序的不同部分放在单独的函数中
  • 确保在用户输入非int值时,输入收集逻辑也能正确处理
  • 将输入放在dict中,因此,如果有一天您从另一个脚本或程序的一部分获得所需数据的输入,则可以轻松地重复使用
  • 尽可能使用生成器而不是返回列表的函数
  • 使用with语句打开资源
  • 使用pathlib.Path处理文件和文件名
  • 使用if __name__ == "__main__"

测试

from collections import defaultdict
from pathlib import Path 
import xlrd

def get_int_inputs(questions):
    """Prompt for one integer per question and yield ``(key, answer)`` pairs.

    Args:
        questions: Mapping of result key to the prompt text shown to the user.

    Yields:
        ``(key, answer)`` exactly once per question, where ``answer`` is the
        integer the user entered.

    A reply that ``int()`` cannot parse makes the same prompt appear again.
    """
    for key, msg in questions.items():
        # Retry until the reply parses. The retry is decided by the parse
        # result and not by the truthiness of the answer, so that a valid
        # answer of 0 is accepted instead of triggering another prompt.
        while True:
            try:
                answer = int(input(msg))
            except ValueError:
                continue  # not an integer: ask the same question again
            else:
                break
        yield key, answer


def parse_file(filename, inputs):
    """Yield ``(grade_name, grade_num)`` for each row loaded inside the date range.

    Args:
        filename: Path of the workbook passed to ``xlrd.open_workbook``.
        inputs: Mapping with the integer keys ``'date_start_month'``,
            ``'date_start_day'``, ``'date_end_month'`` and ``'date_end_day'``.

    Yields:
        The values of columns 3 and 5 of every data row (row index 2 onwards)
        whose date in column 11 lies between the start and end month/day,
        both ends included. Nothing is yielded when the header cell (1, 11)
        is not ``'****'``.

    The range is tested on ``(month, day)`` pairs so that it may span a month
    boundary; the year is ignored.
    """
    range_start = (inputs['date_start_month'], inputs['date_start_day'])
    range_end = (inputs['date_end_month'], inputs['date_end_day'])
    with xlrd.open_workbook(filename) as wb:
        table = wb.sheet_by_name('***')
        if table.cell(1, 11).value != '****':
            return
        # `table.nrows` and the per-row date cell were previously referenced
        # through undefined names (`nrows`, `date_of_load`).
        for num in range(2, table.nrows):
            date_of_load = table.cell(num, 11).value
            _year, month, day, *_ = xlrd.xldate.xldate_as_tuple(date_of_load, 0)
            # Inclusive at both ends: the earlier `start < x <= end` test
            # dropped rows that fall on the start month or start day.
            if not range_start <= (month, day) <= range_end:
                continue
            grade_name = table.cell(num, 3).value
            grade_num = table.cell(num, 5).value
            yield grade_name, grade_num


def aggregate(quantities):
    """Return a dict mapping each grade name to the sum of its quantities.

    Args:
        quantities: Iterable of ``(grade_name, grade_num)`` pairs.

    Returns:
        ``{grade_name: total}`` with keys in order of first appearance.
    """
    grouped = {}
    for name, amount in quantities:
        grouped.setdefault(name, []).append(amount)

    totals = {}
    for name, amounts in grouped.items():
        totals[name] = sum(amounts)
    return totals


if __name__ == '__main__':
    # Prompt texts, keyed by the name under which each answer is stored.
    wanted_input = dict(
        date_start_month='Pls enter the date of start month:',
        date_start_day='Pls enter the date of start day:',
        date_end_month='Pls enter the date of end month:',
        date_end_day='Pls enter the date of end day:',
    )
    inputs = dict(get_int_inputs(wanted_input))

    # Read the matching rows lazily and total the quantities per grade.
    filename = Path('D:/') / '***.xls'
    quantities = parse_file(filename, inputs)
    result = aggregate(quantities)

由于没有样本数据,我无法测试这段代码,因此其中可能有不少错误

pandas

另一种方法是使用pandas进行数据处理

然后你会得到类似的东西

from pathlib import Path 
import pandas as pd

def parse_data(df, inputs, date_column=None, grade_column=None,
               quantity_column=None):
    """Return the summed quantity per grade for rows inside the date range.

    Args:
        df: DataFrame holding the sheet data. The label of column 11 must be
            ``'****'``.
        inputs: Mapping with the integer keys ``'date_start_month'``,
            ``'date_start_day'``, ``'date_end_month'`` and ``'date_end_day'``.
        date_column: Label of the load-date column; defaults to the column at
            position 11.
        grade_column: Label of the grade column; defaults to the column at
            position 3.
        quantity_column: Label of the quantity column; defaults to the column
            at position 5.

    Returns:
        A Series indexed by grade with the summed quantities, or ``None`` when
        the label of column 11 is not ``'****'``.

    The range is inclusive at both ends and is tested on month and day
    together, so it may span a month boundary; the year is ignored.
    """
    if df.columns[11] != '****':  # index might be different, depending on whether there is an index-col and 0- or 1-based indexing
        return None

    # NOTE(review): the positional defaults assume the frame keeps the sheet's
    # column layout (date 11, grade 3, quantity 5) - pass labels otherwise.
    if date_column is None:
        date_column = df.columns[11]
    if grade_column is None:
        grade_column = df.columns[3]
    if quantity_column is None:
        quantity_column = df.columns[5]

    # to_datetime leaves a datetime column unchanged and converts anything else.
    dates = pd.to_datetime(df[date_column])

    # Encode (month, day) as month * 100 + day so one comparison covers both.
    # `between` is inclusive by default; the earlier `inclusive=False` with
    # `end + 1` excluded the start month/day and is rejected by pandas 2.x.
    month_day = dates.dt.month * 100 + dates.dt.day
    date_correct = month_day.between(
        inputs['date_start_month'] * 100 + inputs['date_start_day'],
        inputs['date_end_month'] * 100 + inputs['date_end_day'],
    )
    return df[date_correct].groupby(grade_column)[quantity_column].sum()

if __name__ == '__main__':
    # Prompt texts, keyed by the name under which each answer is stored.
    wanted_input = {
        'date_start_month': 'Pls enter the date of start month:',
        'date_start_day': 'Pls enter the date of start day:',
        'date_end_month': 'Pls enter the date of end month:',
        'date_end_day': 'Pls enter the date of end day:',
    }
    # NOTE(review): get_int_inputs is not defined in this snippet; it has to
    # be defined or imported alongside parse_data.
    inputs = dict(get_int_inputs(wanted_input))

    filename = Path('D:/' , '***.xls')
    # '***' is the (anonymised) sheet name used by the xlrd version; an empty
    # sheet name does not identify any sheet.
    df = pd.read_excel(filename, sheet_name='***', header=0)
    # parse_data needs the date range as well; it was previously called
    # without `inputs`, which raises a TypeError.
    result = parse_data(df, inputs)