From experimentation, threading several hundred "small" SQL queries is much faster than running one larger query.
The code below works, but I'm wondering whether anyone has tips for optimizing it. When running it over roughly 500 queries that return about 10 million rows in total, CPU usage fluctuates between 90 and 100%.
# 1. Import standard modules.
# 2. FUNCTION - Establish multiple db connections.
# 3. FUNCTION - Execute multiple queries using multiple db connections from #2.
# 4. FUNCTION - Close db connections from #2.
# 5. FUNCTION - Use #2 to establish multiple db connections, #3 to execute multiple queries, #4 to close db connections.
import threading as th
import pyodbc
import pandas as pd
pyodbc.pooling = False
def connect(connection_string, num_queries):
    connections, threads = [], []
    def myfunc(i):
        connection = pyodbc.connect(connection_string)
        connections.append(connection)
    for i in range(num_queries):
        t = th.Thread(target=myfunc, args=(i,))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return connections
def concurrent(queries, connections):
    df, threads = [], []
    num_queries = len(queries)
    def myfunc(i):
        df.append(pd.read_sql_query(queries[i], connections[i]))
    for i in range(num_queries):
        t = th.Thread(target=myfunc, args=(i,))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return pd.concat(df)
def close(connections):
    threads = []
    def myfunc(i):
        i.close()
    for i in connections:
        t = th.Thread(target=myfunc, args=(i,))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    for i in reversed(connections):
        connections.remove(i)
def query(queries, connection_string):
    num_queries = len(queries)
    connections = connect(connection_string, num_queries)
    df = concurrent(queries, connections)
    close(connections)
    return df
if __name__ == "__main__":
    queries = ['SELECT * FROM TBL_1', 'SELECT * FROM TBL_2', ...]
    connection_string = 'DRIVER={SQL Server Native Client 11.0};SERVER=...'
    query(queries, connection_string)
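For comparison, here is a minimal sketch of the same per-query fan-out written with concurrent.futures.ThreadPoolExecutor, still opening one pyodbc connection per query. The max_workers cap and the query/connection-string placeholders are assumptions for illustration, not something I have benchmarked against the 500-query workload above.

# Sketch: same fan-out using a thread pool instead of hand-rolled threads.
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import pyodbc

pyodbc.pooling = False

def run_one(sql, connection_string):
    # Open a dedicated connection, run one query, and always close it.
    connection = pyodbc.connect(connection_string)
    try:
        return pd.read_sql_query(sql, connection)
    finally:
        connection.close()

def query_pool(queries, connection_string, max_workers=32):
    # max_workers is an assumed cap; tune it to what the driver/server can handle.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        frames = pool.map(lambda q: run_one(q, connection_string), queries)
        return pd.concat(list(frames))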