I have a big problem with my large xls file. When my application appends a new stats record (a new row at the end of the file), it takes a very long time (about a minute). If I replace it with an empty xls file, it works fast (1-2 seconds). So I would like to optimize this if possible.
This is what I use:
import xlwt
from xlrd import open_workbook
from xlutils.copy import copy
from lockfile import LockFile  # assuming the `lockfile` package

def add_stats_record():
    # Add record
    lock = LockFile(STATS_FILE)
    with lock:
        # Open for read
        rb = open_workbook(STATS_FILE, formatting_info=True)
        sheet_records = rb.sheet_by_index(0)
        # record_id
        START_ROW = sheet_records.nrows
        try:
            record_id = int(sheet_records.cell(START_ROW - 1, 0).value) + 1
        except (IndexError, ValueError):
            record_id = 1
        # Open for write
        wb = copy(rb)
        sheet_records = wb.get_sheet(0)
        # Set normal style
        style_normal = xlwt.XFStyle()
        normal_font = xlwt.Font()
        style_normal.font = normal_font
        # Prepare some data here
        ........................
        # then:
        for i, col in enumerate(SHEET_RECORDS_COLS):
            sheet_records.write(START_ROW, i, possible_values.get(col[0], ''),
                                style_normal)
        wb.save(STATS_FILE)
Do you see anything here that could be improved? Or can you give me a better idea/example of how to do this?
Answer 0 (score: 3):
Probably not the answer you want to hear, but there is almost nothing left to optimize. Here is your code with each step timed:
import xlwt, xlrd
from xlutils.copy import copy as copy
from time import time

def add_stats_record():
    # Open for read
    start_time = time()
    rb = xlrd.open_workbook(STATS_FILE, formatting_info=True)
    sheet_records_original = rb.sheet_by_index(0)
    print('Elapsed time for opening: %.2f' % (time()-start_time))
    # Record_id
    start_time = time()
    START_ROW = sheet_records_original.nrows
    SHEET_RECORDS_COLS = sheet_records_original.ncols
    try:
        record_id = int(sheet_records_original.cell(START_ROW - 1, 0).value) + 1
    except (IndexError, ValueError):
        record_id = 1
    print('Elapsed time for record ID: %.2f' % (time()-start_time))
    # Open for write
    start_time = time()
    wb = copy(rb)
    sheet_records = wb.get_sheet(0)
    print('Elapsed time for write: %.2f' % (time()-start_time))
    # Set normal style
    style_normal = xlwt.XFStyle()
    normal_font = xlwt.Font()
    style_normal.font = normal_font
    # Read all the data and get some stats
    start_time = time()
    max_col = {}
    for col_idx in range(0, 16):
        max_value = 0
        for row_idx in range(START_ROW):
            if sheet_records_original.cell(row_idx, col_idx).value:
                val = float(sheet_records_original.cell(row_idx, col_idx).value)
                if val > max_value:
                    max_value = val
                    max_col[col_idx] = str(row_idx) + ';' + str(col_idx)
    text_cells = [[0 for x in range(15)] for y in range(START_ROW)]
    for col_idx in range(16, 31):
        max_value = 0
        for row_idx in range(START_ROW):
            if sheet_records_original.cell(row_idx, col_idx).value:
                val = str(sheet_records_original.cell(row_idx, col_idx).value).replace('text', '').count(str(col_idx))
                if val > max_value:
                    max_value = val
                    max_col[col_idx] = str(row_idx) + ';' + str(col_idx)
    print('Elapsed time for reading data/stats: %.2f' % (time()-start_time))
    # Write the stats row
    start_time = time()
    for i in range(SHEET_RECORDS_COLS):
        sheet_records.write(START_ROW, i, max_col.get(i, ''), style_normal)
    wb.save(STATS_FILE)
    print('Elapsed time for writing: %.2f' % (time()-start_time))

if __name__ == '__main__':
    STATS_FILE = 'output.xls'
    start_time2 = time()
    add_stats_record()
    print('Total time: %.2f' % (time() - start_time2))
Elapsed time for opening: 2.43
Elapsed time for record ID: 0.00
Elapsed time for write: 7.62
Elapsed time for reading data/stats: 2.35
Elapsed time for writing: 3.33
Total time: 15.75
From these results it is clear that there is hardly any room for improvement in your code. Opening/copying/writing make up the bulk of the time, yet these are nothing but plain calls into xlrd/xlwt. Using on_demand=True in open_workbook does not help either.
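For completeness, this is what the on_demand variant looks like (a minimal sketch; on_demand only defers parsing of worksheets until they are requested, and since the single stats sheet is requested immediately anyway, nothing is gained here):

from xlrd import open_workbook

# on_demand=True defers parsing of each worksheet until it is requested
rb = open_workbook('output.xls', formatting_info=True, on_demand=True)
sheet = rb.sheet_by_index(0)  # the stats sheet still gets parsed here
rb.unload_sheet(0)            # releases the parsed sheet again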
Switching to openpyxl does not improve performance either:
from openpyxl import load_workbook
from time import time

# Load workbook
start_time = time()
wb = load_workbook('output.xlsx')
print('Elapsed time for loading workbook: %.2f' % (time()-start_time))
# Read all data
start_time = time()
ws = wb.active
cell_range1 = ws['A1':'P20001']
cell_range2 = ws['Q1':'AF20001']
print('Elapsed time for reading workbook: %.2f' % (time()-start_time))
# Save to a new workbook
start_time = time()
wb.save("output_tmp.xlsx")
print('Elapsed time for saving workbook: %.2f' % (time()-start_time))
Elapsed time for loading workbook: 22.35
Elapsed time for reading workbook: 0.00
Elapsed time for saving workbook: 21.11
Ubuntu 14.04 (virtual machine) / Python 2.7 64-bit / regular hard disk (similar results on native Windows 10; Python 3 performed worse at loading but better at writing).
The random test data was generated with pandas and NumPy:
import pandas as pd
import numpy as np

STATS_FILE = 'output.xls'
# Just random numbers
df = pd.DataFrame(np.random.rand(20000, 30), columns=range(0, 30))
# Convert half the columns to text
for i in range(15, 30):
    df[i] = 'text' + df[i].astype(str)
writer = pd.ExcelWriter(STATS_FILE)
df.to_excel(writer, 'Sheet1')
writer.save()
After fiddling with multiprocessing, I found a slightly improved solution. Since the copy operation is the most time-consuming one, and sharing the workbook performs even worse, a different approach is taken: both processes read the original workbook. One reads the data, computes the statistics and writes them to a file (tmp.txt); the other copies the workbook, waits for the stats file to appear, and then writes the stats row into the newly copied workbook.
Difference: 12% less total time (n=3 for both scripts). Not great, but apart from not using Excel files at all (see the CSV sketch below) I can't think of another way.
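To illustrate that last point: if the consumers of the stats could live with a different format, appending a record to a plain CSV log is effectively instant, because only the end of the file is touched instead of the whole workbook being rewritten. A minimal sketch (Python 3; the file name stats.csv and the example row are made up):

import csv

# Hypothetical stats row; in the real application these would be the
# values currently written cell by cell with sheet_records.write().
stats_row = [1, 0.42, 'text0.99']

# Mode 'a' appends at the end of the file, so the cost stays constant
# no matter how many rows the log already contains.
with open('stats.csv', 'a', newline='') as f:
    csv.writer(f).writerow(stats_row)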
xls_copy.py
def xls_copy(STATS_FILE, START_ROW, style_normal):
    from xlutils.copy import copy as copy
    from time import sleep, time
    from os import stat
    from xlrd import open_workbook

    print('started 2nd process')
    start_time = time()
    rb = open_workbook(STATS_FILE, formatting_info=True)
    wb = copy(rb)
    sheet_records = wb.get_sheet(0)
    print('2: Elapsed time for xls_copy: %.2f' % (time()-start_time))
    # Wait (with a timeout) until the 1st process has written the stats
    counter = 0
    filesize = stat('tmp.txt').st_size
    while filesize == 0 and counter < 10**5:
        sleep(0.01)
        filesize = stat('tmp.txt').st_size
        counter += 1
    with open('tmp.txt', 'r') as f:
        for line in f.readlines():
            # Each line is "col_idx;row;col" as written by the 1st process
            cells = line.strip().split(';')
            sheet_records.write(START_ROW, int(cells[0]), ';'.join(cells[1:]),
                                style_normal)
    start_time = time()
    wb.save('tmp_' + STATS_FILE)
    print('2: Elapsed time for writing: %.2f' % (time()-start_time))
xlsx_multi.py
from xls_copy import xls_copy
import xlwt, xlrd
from time import time
from multiprocessing import Process

def add_stats_record():
    # Open for read
    start_time = time()
    rb = xlrd.open_workbook(STATS_FILE, formatting_info=True)
    sheet_records_original = rb.sheet_by_index(0)
    print('Elapsed time for opening: %.2f' % (time()-start_time))
    # Record_id
    start_time = time()
    START_ROW = sheet_records_original.nrows
    # Create an empty tmp.txt so the 2nd process can poll its size
    f = open('tmp.txt', 'w')
    f.close()
    # Set normal style
    style_normal = xlwt.XFStyle()
    normal_font = xlwt.Font()
    style_normal.font = normal_font
    # Start 2nd process
    p = Process(target=xls_copy, args=(STATS_FILE, START_ROW, style_normal,))
    p.start()
    print('continuing with 1st process')
    SHEET_RECORDS_COLS = sheet_records_original.ncols
    try:
        record_id = int(sheet_records_original.cell(START_ROW - 1, 0).value) + 1
    except (IndexError, ValueError):
        record_id = 1
    print('Elapsed time for record ID: %.2f' % (time()-start_time))
    # Read all the data and get some stats
    start_time = time()
    max_col = {}
    for col_idx in range(0, 16):
        max_value = 0
        for row_idx in range(START_ROW):
            if sheet_records_original.cell(row_idx, col_idx).value:
                val = float(sheet_records_original.cell(row_idx, col_idx).value)
                if val > max_value:
                    max_value = val
                    max_col[col_idx] = str(row_idx) + ';' + str(col_idx)
    text_cells = [[0 for x in range(15)] for y in range(START_ROW)]
    for col_idx in range(16, 31):
        max_value = 0
        for row_idx in range(START_ROW):
            if sheet_records_original.cell(row_idx, col_idx).value:
                val = str(sheet_records_original.cell(row_idx, col_idx).value).replace('text', '').count(str(col_idx))
                if val > max_value:
                    max_value = val
                    max_col[col_idx] = str(row_idx) + ';' + str(col_idx)
    # Write statistics to a temp file for the 2nd process
    with open('tmp.txt', 'w') as f:
        for k in max_col:
            f.write(str(k) + ';' + max_col[k] + '\n')
    print('Elapsed time for reading data/stats: %.2f' % (time()-start_time))
    p.join()

if __name__ == '__main__':
    STATS_FILE = 'output.xls'
    start_time2 = time()
    add_stats_record()
    print('Total time: %.2f' % (time() - start_time2))