python,web开发和烧瓶新手。我有一个小应用程序接受excel输入,提取数据并将其写出oracle。我的堆栈包括flask-sqlalchemy,cx_oracle。当我上传一个大约10 K行的小excel文件时,它工作得很好。当我将大量文件扔到一百万行时,它就会崩溃,我得到:
sqlalchemy.exc.operationalerror:(cx_oracle.operationalerror) ora-03114:未连接到oracle
这是我的观点功能:
@theapp.route('/upload', methods=('GET', 'POST'))
def upload():
start_time = time.time()
form = UploadForm()
if form.validate_on_submit():
file_name = secure_filename(form.upload.data.filename)
form.upload.data.save(os.path.join(theapp.config['UPLOAD_FOLDER'], file_name))
input_file = os.path.join(theapp.config['UPLOAD_FOLDER'], file_name)
wb = load_workbook(input_file, data_only = True, read_only=True, use_iterators=True)
sheet = wb.worksheets[0]
allrows = sheet.iter_rows() #this is a generator
headerobject = next(allrows) # row object for headerobject
# Column names from excel file
headerlist = []
for c in headerobject:
if c.value is not None: # If user uses delete button to delete contents of header cell, ignore it
if isinstance(c.value, str) is False: # if not a string
return bad_request("Column names have to be strings without spaces. Value in cell {} is not a string".format(c.coordinate))
if not c.value.strip(): # Empty string in header
return bad_request("Cell {} has line space(s). Delete the contents of the cell".format(c.coordinate))
headerlist.append(c.value.strip())
#print("Excel {}".format(headerlist))
duplicate_elements = [k for k,v in Counter(headerlist).items() if v >1]
if duplicate_elements: # A falsy. If there are duplicate elements
return bad_request("Column(s) {} are duplicates".format(duplicate_elements))
tableinfo = form.targettableinfo.data # tableinfo object in the databse order
tablecolumns = []
for col in tableinfo.fields:
tablecolumns.append(col.name)
#print("Table columns {}".format(tablecolumns))
# Check if the columns match
if not set(headerlist) == set(tablecolumns):
foreign_columns = [x for x in headerlist if x not in tablecolumns]
if foreign_columns:
return bad_request("Column(s) {} in your excel file are not defined".format(foreign_columns))
unfulfilled_columns = [x for x in tablecolumns if x not in headerlist]
if unfulfilled_columns:
return bad_request("Column(s) {} are expected by the table but not found in your excel file.".format(unfulfilled_columns))
# Sort the tablecolumns object in the order of the headerlist
sortedtablecolumnsobject = sorted(tableinfo.fields, key=lambda x: headerlist.index(x.name))
# create a list of dicts to hold data to be written
output_data = []
for row in allrows:
row_dict = {}
for fobject,cobject in zip(sortedtablecolumnsobject,row):
######################################
##Validation & assignment
######################################
if fobject.type.name == "Text":
if cobject.value == None:
row_dict[fobject.name] = cobject.value
else:
row_dict[fobject.name] = removeNonAscii(str(cobject.value))
#row_dict[fobject.name] = cobject.value.encode('utf-8')
#print(str(cobject.value))
elif fobject.type.name == "Number":
if dao.isnumber(cobject.value):
row_dict[fobject.name] = float(cobject.value)
#print(float(cobject.value))
elif cobject.value is None:
row_dict[fobject.name] = cobject.value
#print(cobject.value)
else:
return bad_request("Cell {} has value {}. A numeric value is expected".format(cobject.coordinate, cobject.value))
elif fobject.type.name == "Date":
if type(cobject.value) == datetime.datetime:
row_dict[fobject.name] = cobject.value
elif cobject.value is None:
row_dict[fobject.name] = cobject.value
else:
return bad_request("Cell {} does not contain a valid date".format(cobject.coordinate))
##############################################################
output_data.append(row_dict)
#print(output_data)
m = db.MetaData()
t = db.Table(tableinfo.table_name,m,autoload = True, autoload_with = db.engine)
db.engine.execute(t.insert(),output_data)
print("{} seconds" .format(time.time() - start_time))
return redirect(url_for('summary'))
return render_template('upload.html', title = 'Upload', form = form)
基本上,我创建了一个巨大的字典列表,并使用该输出列表在目标表上执行插入。就像我说的,它适用于较小的文件。对于较大的文件,我认为在尝试执行插入之前,与数据库的连接已经消失。我的堆栈跟踪显示错误源自执行插入的行。 有没有办法使数据库连接保持活动状态,即使请求耗时太长?我已经在flask-sqlalchemy中尝试了SQLALCHEMY_POOL_RECYCLE和SQLALCHEMY_POOL_TIMEOUT配置,但它没有任何意义。如何在长时间请求期间保持数据库会话/连接处于活动状态?感谢