各位Python/Pandas高手好。我正在寻找一种用Python并行运行SQL查询、并返回多个Pandas数据帧(DataFrame)的方法。我有类似于下面的代码,它对MS SQL Server数据库依次(串行)运行4个SQL查询。其中两个查询的执行时间远长于取回结果所需的IO(网络)时间,因此我认为并行化可以使代码运行速度提高约2倍。有没有一种简单的方法可以并行执行这些查询?
理想情况下,我希望能够读取项目子目录中的所有 *.sql 文件,然后并行触发这些查询,并以易于使用的格式(列表?)返回四个数据帧,以便进行进一步的操作(索引、连接、聚合)。
提前致谢, 兰德尔
# imports
import ceODBC
import numpy as np
import pandas as pd
import pandas.io.sql as psql
from ConfigParser import ConfigParser
import os
import glob
# db connection string (kept under its own name so it is not shadowed by the
# live connection object created further down)
cnxn_string = 'DRIVER={SQL Server Native Client 11.0}; SERVER=<servername>; DATABASE=<dname>; Trusted_Connection=Yes'

# directories (also should be moved to config)
dataDir = os.getcwd() + '\\data\\'
sqlDir = os.getcwd() + '\\sql\\'


def _read_sql(name):
    """Return the full text of the query file *name* found in ``sqlDir``."""
    with open(sqlDir + name, 'r') as f:
        return f.read()


# read sql from external .sql files
q1sql = _read_sql('q1.sql')
q2sql = _read_sql('q2.sql')
q3sql = _read_sql('q3.sql')
q4sql = _read_sql('q4.sql')

# Connect to db, run the four queries one after another (serially) and load
# each result into its own dataframe.
# NOTE(review): frame_query belongs to older pandas releases; newer versions
# offer read_sql instead -- confirm against the installed pandas version.
cnxn = ceODBC.connect(cnxn_string)
try:
    df1 = psql.frame_query(q1sql, cnxn)
    df2 = psql.frame_query(q2sql, cnxn)
    df3 = psql.frame_query(q3sql, cnxn)
    df4 = psql.frame_query(q4sql, cnxn)
finally:
    # close the connection even when one of the queries fails
    cnxn.close()
答案 0 :(得分:2)
在N个线程中使用N个连接,然后等待(join)所有线程结束并获取结果。
# imports
import ceODBC
import numpy as np
import pandas as pd
import pandas.io.sql as psql
from ConfigParser import ConfigParser
import os
import glob
import threading
# db connection string
cnxn_string = 'DRIVER={SQL Server Native Client 11.0}; SERVER=<servername>; DATABASE=<dname>; Trusted_Connection=Yes'

# directories (also should be moved to config)
dataDir = os.getcwd() + '\\data\\'
sqlDir = os.getcwd() + '\\sql\\'

# Shared dict that collects the query results, plus the lock that guards it
# so that only one thread modifies it at a time.
responses = {}
responses_lock = threading.Lock()

# Upper bound on simultaneously open DB connections; worker threads take a
# slot from this semaphore before connecting.
maxconnections = 8
pool_sema = threading.BoundedSemaphore(value=maxconnections)
def task(fname):
    """Run the SQL stored in file *fname* and record the resulting dataframe.

    The query text is read from *fname*, executed on a fresh connection
    opened from ``cnxn_string``, and the result is stored in the global
    ``responses`` dict under the key *fname*.

    If connecting or querying raises, the exception propagates out of this
    function and no entry is added to ``responses``; the semaphore slot and
    the connection are still released.
    """
    with open(fname, 'r') as f:
        sql = f.read()

    # The semaphore limits how many connections are open on the DB at once.
    pool_sema.acquire()
    try:
        cnxn = ceODBC.connect(cnxn_string)
        try:
            df = psql.frame_query(sql, cnxn)
        finally:
            # close the connection even if the query failed
            cnxn.close()
    finally:
        # always hand the slot back, otherwise a failing query would
        # permanently reduce the number of usable connections
        pool_sema.release()

    # ensure that only one thread at a time modifies the global variable
    responses_lock.acquire()
    try:
        responses[fname] = df
    finally:
        responses_lock.release()
# Threads that have been started, kept so they can be joined afterwards.
pool = []

# Find the sql files and spawn one thread per file.
# The pattern is '*.sql' so that only files with the .sql extension match.
for fname in glob.glob(os.path.join(sqlDir, '*.sql')):
    # create a new thread that runs task(fname)
    thread = threading.Thread(target=task, args=(fname,))
    thread.daemon = True
    # remember the thread, then start it
    pool.append(thread)
    thread.start()

# wait until every thread has finished its task
for thread in pool:
    thread.join()

# the result of each execution is now stored in the responses dict,
# keyed by the path of the .sql file that produced it
每个文件都在单独的线程中执行。结果存储在一个变量中。
使用 with 语句的等效函数:
def task(fname):
    """Run the SQL stored in file *fname* and record the resulting dataframe.

    Same job as the explicit acquire/release variant, written with ``with``
    blocks: read the query text from *fname*, execute it on a fresh
    connection opened from ``cnxn_string``, and store the result in the
    global ``responses`` dict under the key *fname*.

    If connecting or querying raises, the exception propagates and nothing
    is stored for *fname*.
    """
    with open(fname, 'r') as f:
        sql = f.read()

    # The semaphore limits how many connections are open on the DB at once;
    # the with-block gives the slot back even when an exception is raised.
    with pool_sema:
        cnxn = ceODBC.connect(cnxn_string)
        try:
            df = psql.frame_query(sql, cnxn)
        finally:
            # close the connection even if the query failed
            cnxn.close()

    # ensure that only one thread at a time modifies the global variable
    with responses_lock:
        responses[fname] = df
multiprocessing.Pool 可以很方便地分发繁重的任务,但它本身会带来更多的IO操作。