I have a big problem with my large xls file. When my application appends a new stats record (a new row at the end of the file), it takes a very long time (about a minute). If I replace it with an empty xls file, it works fast (1-2 seconds). So I would like to optimize this if possible.
This is what I use:
import xlwt
from xlrd import open_workbook
from xlutils.copy import copy
from lockfile import LockFile  # assuming the `lockfile` package

def add_stats_record():
    # Add record
    lock = LockFile(STATS_FILE)
    with lock:
        # Open for read
        rb = open_workbook(STATS_FILE, formatting_info=True)
        sheet_records = rb.sheet_by_index(0)
        # record_id
        START_ROW = sheet_records.nrows
        try:
            record_id = int(sheet_records.cell(START_ROW - 1, 0).value) + 1
        except (IndexError, ValueError):
            record_id = 1
        # Open for write
        wb = copy(rb)
        sheet_records = wb.get_sheet(0)
        # Set normal style
        style_normal = xlwt.XFStyle()
        normal_font = xlwt.Font()
        style_normal.font = normal_font
        # Prepare some data here
        ........................
        # then:
        for i, col in enumerate(SHEET_RECORDS_COLS):
            sheet_records.write(START_ROW, i, possible_values.get(col[0], ''),
                                style_normal)
        wb.save(STATS_FILE)
Do you see anything here that could be improved? Or can you give me a better idea/example of how to do this?
Answer 0 (score: 3):
Probably not the answer you want to hear, but there is almost nothing left to optimize. Here is your code with each step timed:
import xlwt, xlrd
from xlutils.copy import copy as copy
from time import time

def add_stats_record():
    # Open for read
    start_time = time()
    rb = xlrd.open_workbook(STATS_FILE, formatting_info=True)
    sheet_records_original = rb.sheet_by_index(0)
    print('Elapsed time for opening: %.2f' % (time()-start_time))
    # Record_id
    start_time = time()
    START_ROW = sheet_records_original.nrows
    SHEET_RECORDS_COLS = sheet_records_original.ncols
    try:
        record_id = int(sheet_records_original.cell(START_ROW - 1, 0).value) + 1
    except (IndexError, ValueError):
        record_id = 1
    print('Elapsed time for record ID: %.2f' % (time()-start_time))
    # Open for write
    start_time = time()
    wb = copy(rb)
    sheet_records = wb.get_sheet(0)
    print('Elapsed time for write: %.2f' % (time()-start_time))
    # Set normal style
    style_normal = xlwt.XFStyle()
    normal_font = xlwt.Font()
    style_normal.font = normal_font
    # Read all the data and get some stats
    start_time = time()
    max_col = {}
    for col_idx in range(0, 16):
        max_value = 0
        for row_idx in range(START_ROW):
            if sheet_records_original.cell(row_idx, col_idx).value:
                val = float(sheet_records_original.cell(row_idx, col_idx).value)
                if val > max_value:
                    max_value = val
                    max_col[col_idx] = str(row_idx) + ';' + str(col_idx)
    text_cells = [[0 for x in range(15)] for y in range(START_ROW)]
    for col_idx in range(16, 31):
        max_value = 0
        for row_idx in range(START_ROW):
            if sheet_records_original.cell(row_idx, col_idx).value:
                val = str(sheet_records_original.cell(row_idx, col_idx).value).replace('text', '').count(str(col_idx))
                if val > max_value:
                    max_value = val
                    max_col[col_idx] = str(row_idx) + ';' + str(col_idx)
    print('Elapsed time for reading data/stats: %.2f' % (time()-start_time))
    # Write the stats row
    start_time = time()
    for i in range(SHEET_RECORDS_COLS):
        sheet_records.write(START_ROW, i, max_col.get(i, ''), style_normal)
    wb.save(STATS_FILE)
    print('Elapsed time for writing: %.2f' % (time()-start_time))

if __name__ == '__main__':
    STATS_FILE = 'output.xls'
    start_time2 = time()
    add_stats_record()
    print('Total time: %.2f' % (time() - start_time2))
Elapsed time for opening: 2.43
Elapsed time for record ID: 0.00
Elapsed time for write: 7.62
Elapsed time for reading data/stats: 2.35
Elapsed time for writing: 3.33
Total time: 15.75
From these results it is clear that there is hardly any room for improvement in your code. Opening/copying/writing make up the bulk of the time, yet these are nothing but plain calls into xlrd/xlwt. Using on_demand=True in open_workbook does not help either.
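For completeness, this is what the on_demand variant looks like (a minimal sketch; on_demand only defers parsing of worksheets until they are requested, and since the single stats sheet is requested immediately anyway, nothing is gained here):

from xlrd import open_workbook

# on_demand=True defers parsing of each worksheet until it is requested
rb = open_workbook('output.xls', formatting_info=True, on_demand=True)
sheet = rb.sheet_by_index(0)  # the stats sheet still gets parsed here
rb.unload_sheet(0)            # releases the parsed sheet again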
Switching to openpyxl does not improve performance either:
from openpyxl import load_workbook
from time import time

# Load workbook
start_time = time()
wb = load_workbook('output.xlsx')
print('Elapsed time for loading workbook: %.2f' % (time()-start_time))
# Read all data
start_time = time()
ws = wb.active
cell_range1 = ws['A1':'P20001']
cell_range2 = ws['Q1':'AF20001']
print('Elapsed time for reading workbook: %.2f' % (time()-start_time))
# Save to a new workbook
start_time = time()
wb.save("output_tmp.xlsx")
print('Elapsed time for saving workbook: %.2f' % (time()-start_time))
Elapsed time for loading workbook: 22.35
Elapsed time for reading workbook: 0.00
Elapsed time for saving workbook: 21.11
Ubuntu 14.04 (virtual machine) / Python 2.7 64-bit / regular hard disk (similar results on native Windows 10; Python 3 performed worse at loading but better at writing).
The random test data was generated with pandas and NumPy:
import pandas as pd
import numpy as np

STATS_FILE = 'output.xls'
# Just random numbers
df = pd.DataFrame(np.random.rand(20000, 30), columns=range(0, 30))
# Convert half the columns to text
for i in range(15, 30):
    df[i] = 'text' + df[i].astype(str)
writer = pd.ExcelWriter(STATS_FILE)
df.to_excel(writer, 'Sheet1')
writer.save()
After fiddling with multiprocessing, I found a slightly improved solution. Since the copy operation is the most time-consuming one, and sharing the workbook performs even worse, a different approach is taken: both processes read the original workbook. One reads the data, computes the statistics and writes them to a file (tmp.txt); the other copies the workbook, waits for the stats file to appear, and then writes the stats row into the newly copied workbook.
Difference: 12% less total time (n=3 for both scripts). Not great, but apart from not using Excel files at all (see the CSV sketch below) I can't think of another way.
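To illustrate that last point: if the consumers of the stats could live with a different format, appending a record to a plain CSV log is effectively instant, because only the end of the file is touched instead of the whole workbook being rewritten. A minimal sketch (Python 3; the file name stats.csv and the example row are made up):

import csv

# Hypothetical stats row; in the real application these would be the
# values currently written cell by cell with sheet_records.write().
stats_row = [1, 0.42, 'text0.99']

# Mode 'a' appends at the end of the file, so the cost stays constant
# no matter how many rows the log already contains.
with open('stats.csv', 'a', newline='') as f:
    csv.writer(f).writerow(stats_row)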
xls_copy.py
def xls_copy(STATS_FILE, START_ROW, style_normal):
    from xlutils.copy import copy as copy
    from time import sleep, time
    from os import stat
    from xlrd import open_workbook

    print('started 2nd process')
    start_time = time()
    rb = open_workbook(STATS_FILE, formatting_info=True)
    wb = copy(rb)
    sheet_records = wb.get_sheet(0)
    print('2: Elapsed time for xls_copy: %.2f' % (time()-start_time))
    # Wait (with a timeout) until the 1st process has written the stats
    counter = 0
    filesize = stat('tmp.txt').st_size
    while filesize == 0 and counter < 10**5:
        sleep(0.01)
        filesize = stat('tmp.txt').st_size
        counter += 1
    with open('tmp.txt', 'r') as f:
        for line in f.readlines():
            # Each line is "col_idx;row;col" as written by the 1st process
            cells = line.strip().split(';')
            sheet_records.write(START_ROW, int(cells[0]), ';'.join(cells[1:]),
                                style_normal)
    start_time = time()
    wb.save('tmp_' + STATS_FILE)
    print('2: Elapsed time for writing: %.2f' % (time()-start_time))
xlsx_multi.py
from xls_copy import xls_copy
import xlwt, xlrd
from time import time
from multiprocessing import Process

def add_stats_record():
    # Open for read
    start_time = time()
    rb = xlrd.open_workbook(STATS_FILE, formatting_info=True)
    sheet_records_original = rb.sheet_by_index(0)
    print('Elapsed time for opening: %.2f' % (time()-start_time))
    # Record_id
    start_time = time()
    START_ROW = sheet_records_original.nrows
    # Create an empty tmp.txt so the 2nd process can poll its size
    f = open('tmp.txt', 'w')
    f.close()
    # Set normal style
    style_normal = xlwt.XFStyle()
    normal_font = xlwt.Font()
    style_normal.font = normal_font
    # Start 2nd process
    p = Process(target=xls_copy, args=(STATS_FILE, START_ROW, style_normal,))
    p.start()
    print('continuing with 1st process')
    SHEET_RECORDS_COLS = sheet_records_original.ncols
    try:
        record_id = int(sheet_records_original.cell(START_ROW - 1, 0).value) + 1
    except (IndexError, ValueError):
        record_id = 1
    print('Elapsed time for record ID: %.2f' % (time()-start_time))
    # Read all the data and get some stats
    start_time = time()
    max_col = {}
    for col_idx in range(0, 16):
        max_value = 0
        for row_idx in range(START_ROW):
            if sheet_records_original.cell(row_idx, col_idx).value:
                val = float(sheet_records_original.cell(row_idx, col_idx).value)
                if val > max_value:
                    max_value = val
                    max_col[col_idx] = str(row_idx) + ';' + str(col_idx)
    text_cells = [[0 for x in range(15)] for y in range(START_ROW)]
    for col_idx in range(16, 31):
        max_value = 0
        for row_idx in range(START_ROW):
            if sheet_records_original.cell(row_idx, col_idx).value:
                val = str(sheet_records_original.cell(row_idx, col_idx).value).replace('text', '').count(str(col_idx))
                if val > max_value:
                    max_value = val
                    max_col[col_idx] = str(row_idx) + ';' + str(col_idx)
    # Write statistics to a temp file for the 2nd process
    with open('tmp.txt', 'w') as f:
        for k in max_col:
            f.write(str(k) + ';' + max_col[k] + '\n')
    print('Elapsed time for reading data/stats: %.2f' % (time()-start_time))
    p.join()

if __name__ == '__main__':
    STATS_FILE = 'output.xls'
    start_time2 = time()
    add_stats_record()
    print('Total time: %.2f' % (time() - start_time2))