我正在处理一个脚本,它是我的队友编写的,用来在他的机器上本地构建SQLite3数据库。我做了一些修改,以便我们可以在Django应用程序中使用它,用用户上传的新数据更新数据库。该应用程序允许用户上传一个包含多个格式良好的csv文件的zip文件,并将csv中的信息添加到数据库中。以下是代码的相关部分:
update_db.py
import glob, sqlite3, pandas, timeit, re
# Default location of the SQLite database file updated by upload_files().
_DEFAULT_DB_PATH = '/path/to/my_db.db'

# Derived columns added to every team table the first time it is created.
_DERIVED_COLUMNS = (
    ('team_BASEDOWN', 'integer'),
    ('team_FIELDPOSITION', 'integer'),
    ('team_HEADCOACH', 'text'),
    ('team_OFFCOOR', 'text'),
    ('team_DEFFCOOR', 'text'),
)

# Single-pass replacement for the twelve sequential UPDATE statements.
# With sequential UPDATEs the LAST matching statement decides the final
# value; CASE picks the FIRST matching WHEN, so the branches are listed in
# reverse order to produce exactly the same result in one table scan.
# NOTE(review): as in the original sequence, `pff_FIELDPOSITION <= 5` also
# matches every negative value, so the 0 and `<= -11` branches never take
# effect -- confirm whether `<= 5` was meant to be bounded below.
_DERIVE_SQL = (
    'update {table} set '
    'team_BASEDOWN = case '
    'when pff_DOWN = 4 then 6 '
    'when pff_DOWN = 3 and pff_DISTANCE >= 7 then 5 '
    'when pff_DOWN = 3 and pff_DISTANCE >= 4 and pff_DISTANCE <= 6 then 4 '
    'when pff_DOWN = 3 and pff_DISTANCE = 3 then 3 '
    'when pff_DOWN = 3 and pff_DISTANCE <= 2 then 2 '
    'when pff_DOWN = 2 and pff_DISTANCE >= 7 then 1 '
    'when pff_DOWN = 1 or (pff_DOWN = 2 and pff_DISTANCE <= 6) then 0 '
    'else team_BASEDOWN end, '
    'team_FIELDPOSITION = case '
    'when pff_FIELDPOSITION <= 5 then 4 '
    'when pff_FIELDPOSITION >= 6 and pff_FIELDPOSITION <= 11 then 3 '
    'when pff_FIELDPOSITION >= 12 and pff_FIELDPOSITION <= 20 then 2 '
    'when pff_FIELDPOSITION <= -11 or '
    '(pff_FIELDPOSITION >= 20 and pff_FIELDPOSITION <= 50) then 1 '
    'when pff_FIELDPOSITION <= -1 and pff_FIELDPOSITION >= -10 then 0 '
    'else team_FIELDPOSITION end;'
)


def _read_csv_skipping_bad_lines(csv_file):
    """Read *csv_file* into a DataFrame, warning about and skipping bad rows."""
    import inspect

    # `error_bad_lines` was replaced by `on_bad_lines` in newer pandas
    # releases; pick whichever keyword the installed version accepts.
    if 'on_bad_lines' in inspect.signature(pandas.read_csv).parameters:
        return pandas.read_csv(csv_file, on_bad_lines='warn')
    return pandas.read_csv(csv_file, error_bad_lines=False)


def upload_files(csv_files, db_path=_DEFAULT_DB_PATH):
    """Append every ``.csv`` member of a zip archive to the SQLite database.

    For each member whose name ends in ``.csv`` the rows are appended to a
    table named after the second element of ``name.rsplit('/', 2)`` (the
    parent directory for members nested at least two levels deep), with every
    character other than word characters and ``+`` removed. Tables that did
    not exist before get the ``team_*`` derived columns added, and
    ``team_BASEDOWN`` / ``team_FIELDPOSITION`` are then recomputed for the
    whole table from the ``pff_*`` columns. Empty CSV members are reported on
    stdout and skipped.

    Args:
        csv_files: An open ``zipfile.ZipFile`` (anything offering
            ``namelist()`` and ``open(name)``).
        db_path: Path of the SQLite database file to update.

    Raises:
        IndexError: If a ``.csv`` member name contains no ``/``.
        sqlite3.Error: If a statement fails, e.g. a ``pff_*`` column is
            missing from the uploaded data.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # Read the real table names (row[0]) instead of regex-stripping the
        # tuple repr, so the set agrees with the names computed below.
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
        known_tables = {row[0] for row in cursor.fetchall()}

        for csv_filename in csv_files.namelist():
            if not csv_filename.endswith('.csv'):
                continue

            # extract team name from the member path, remove whitespace
            table_name = csv_filename.rsplit('/', 2)[1]
            table_name = re.sub(r'[^\w+]', '', table_name)
            # The sanitised name cannot contain a double quote, so quoting it
            # is safe and keeps names such as "49ers" valid in raw SQL.
            quoted = '"' + table_name + '"'

            with csv_files.open(csv_filename) as csv_file:
                try:
                    df = _read_csv_skipping_bad_lines(csv_file)
                except pandas.errors.EmptyDataError:
                    print(str(csv_file) + ' was empty; continuing...')
                    continue

            df.to_sql(table_name, conn, if_exists='append', index=False)

            if table_name not in known_tables:
                # add necessary columns
                for column, sql_type in _DERIVED_COLUMNS:
                    cursor.execute(
                        'alter table ' + quoted + ' add ' + column + ' '
                        + sql_type + ';'
                    )
                known_tables.add(table_name)

            # set basedown and fieldposition in a single table scan
            cursor.execute(_DERIVE_SQL.format(table=quoted))

        conn.commit()
    finally:
        # Always release the database handle, even when a CSV fails.
        conn.close()
views.py
from django.shortcuts import render
from django.db import connections
from django.db.utils import OperationalError
from django.http import HttpResponse
from django.template import loader
from django.conf import settings
from django.utils.encoding import smart_str
from webapp.update_db import upload_files
from threading import Thread
import numpy as np
import zipfile
def upload(request):
    """Handle the upload page: import a posted zip of CSVs, then render it.

    On a POST carrying a file field named ``myfile`` whose name ends in
    ``.zip``, the archive is opened and handed to ``upload_files``. The
    upload template is rendered for every request.
    """
    if request.method == 'POST':
        # .get() returns None instead of raising MultiValueDictKeyError when
        # the form was submitted without a 'myfile' part.
        myfile = request.FILES.get('myfile')
        if myfile and str(myfile.name).endswith('.zip'):
            # The context manager closes the archive once processing ends,
            # including when upload_files raises.
            with zipfile.ZipFile(myfile) as unzipped:
                upload_files(unzipped)
    return render(request, 'webapp/upload.html')
我的问题是,当我提交zip时,上传需要花费非常长的时间来处理(处理160MB的zip文件需要大约12个小时)。我觉得SQL查询还可以写得更高效,但他说,当他在本地运行时,构建整个数据库只需要大约45分钟(而整个数据库比我们预期的“更新”数据量大得多),所以我想知道运行应用程序的EC2实例是否存在异常。我检查了实例上的CPU利用率,在更新脚本运行期间平均稳定在20%左右(没有值得注意的峰值或谷值)。我不确定在本地运行与在EC2上运行之间会有什么不同,因此任何有关修改实例或脚本以提高性能的建议都将非常受欢迎。
答案 0 :(得分:0)
您可以用一条语句同时更新这两列,并覆盖所有情况:
-- Recompute both derived columns for every row of MyTable in one statement.
-- A CASE expression yields the result of the FIRST WHEN that matches; rows
-- matching no WHEN keep their current value through the ELSE branch.
-- NOTE(review): the separate UPDATE statements in the question are applied in
-- sequence, so there the LAST matching statement wins. For overlapping
-- ranges (pff_FIELDPOSITION = 20, and negative values that also satisfy
-- `<= 5`) this statement therefore gives different results than that
-- sequence -- confirm which outcome is intended.
UPDATE MyTable
SET team_BASEDOWN = CASE
WHEN pff_DOWN = 1 or (pff_DOWN = 2 and pff_DISTANCE <= 6) THEN 0
WHEN pff_DOWN = 2 and pff_DISTANCE >= 7 THEN 1
WHEN pff_DOWN = 3 and pff_DISTANCE <= 2 THEN 2
WHEN pff_DOWN = 3 and pff_DISTANCE = 3 THEN 3
WHEN pff_DOWN = 3 and pff_DISTANCE >= 4 and pff_DISTANCE <= 6 THEN 4
WHEN pff_DOWN = 3 and pff_DISTANCE >= 7 THEN 5
WHEN pff_DOWN = 4 THEN 6
-- no rule matched: leave the stored value untouched
ELSE team_BASEDOWN
END,
team_FIELDPOSITION = CASE
WHEN pff_FIELDPOSITION <= -1 and pff_FIELDPOSITION >= -10 THEN 0
WHEN pff_FIELDPOSITION <= -11 or (pff_FIELDPOSITION >= 20 and pff_FIELDPOSITION <= 50) THEN 1
WHEN pff_FIELDPOSITION >= 12 and pff_FIELDPOSITION <= 20 THEN 2
WHEN pff_FIELDPOSITION >= 6 and pff_FIELDPOSITION <= 11 THEN 3
-- only reached by values 0..5 here; negatives were handled by earlier WHENs
WHEN pff_FIELDPOSITION <= 5 THEN 4
-- no rule matched: leave the stored value untouched
ELSE team_FIELDPOSITION
END;