我正在尝试查询s3存储桶中的数据集,在boto3函数的帮助下通过python脚本使用Athena查询。
我正在使用 start_query_execution()来运行我的查询。这是完美的执行,然后在我的python脚本中获得结果,以便我可以访问查询的结果我正在使用函数 get_query_results()。
现在如果我分别运行这两个函数(一个接一个脚本),我得到的数据是Athena查询的输出。我希望它们用一个脚本编写 - 比如,从s3获取数据并开始使用python代码操作查询输出。
由于查询本质上是asyn
,我使用池技术,等待执行Athena查询。但是,如果我运行以下代码,则状态显示正在运行查询。
我认为我正在做一些愚蠢的错误,好像我单独运行它我得到了所需的输出。简而言之,我想使用Athena查询s3中存在的数据,然后对python script
中的这个获取数据进行一些处理,因此这种方法。请帮忙
以下是示例代码
#!/usr/bin/env python3
import boto3
import time
from functools import partial
from multiprocessing.dummy import Pool
pool = Pool(processes=1)
# def async_function(name):
# time.sleep(1)
# return name
#
# def callback_function(name, age):
# print(name, age)
def run_query(query, database, s3_output):
client = boto3.client('athena')
response = client.start_query_execution(
QueryString=query,
QueryExecutionContext={
'Database': database
},
ResultConfiguration={
'OutputLocation': s3_output,
}
)
print('Execution ID: ' + response['QueryExecutionId'])
return response
def show_res(res, q):
client = boto3.client('athena')
print("Executing query: %s" % (q))
print('Execution ID: ' + res['QueryExecutionId'])
# response = client.stop_query_execution(
# QueryExecutionId=res['QueryExecutionId']
# )
response = client.get_query_results(
# QueryExecutionId='f3642735-d9d9-4246-ade4-7453eaed0717'
QueryExecutionId=res['QueryExecutionId']
)
print("Executing query: %s" % (q))
print('Execution ID: ' + res['QueryExecutionId'])
print('rRespone:'.join(str(x) for x in response['ResultSet']['Rows']));
return response
# for age, name in enumerate(['jack', 'jill', 'james']):
# new_callback_function = partial(callback_function, age=age)
# pool.apply_async(
# async_function,
# args=[name],
# callback=new_callback_function
# )
#Athena configuration
s3_input = 's3://dummy/'
s3_ouput = 's3://dummy/results/'
database = 'dummy'
table = 'dummy'
#Query definitions
query_1 = "SELECT * FROM %s.%s where sex = 'F';" % (database, table)
query_2 = "SELECT * FROM %s.%s where age > 30;" % (database, table)
#Execute all queries
queries = [ query_1 ]
for q in queries:
print("Executing query: %s" % (q))
new_callback_function = partial(show_res, q=q)
pool.apply_async(
run_query,
args=[q, database, s3_ouput],
callback=new_callback_function
)
pool.close()
pool.join()
答案 0 :(得分:0)
代替使用 apply_async 尝试:
pool = Pool(cores)
df = pd.concat(pool.map(func, [value_1,...,value_n]))
pool.close()
pool.join()
我写了我的代码,说它很好用,我希望您可以重用一些代码行。基本上,我在“相同”时间在Athena中运行多个查询(我并行化了名为endpoints的数组),并将每个结果存储在Pandas数据帧的一行中。另外,您可以获取每个查询的数据,并且添加了状态打印,然后可以查看每个查询的状态。请记住,Athena具有可以同时运行的查询限制。
import time
import boto3
import pandas as pd
from multiprocessing import Pool
class QueryAthena:
def __init__(self, endpoint, init_date, end_date):
self.s3_input = 's3://my_bucket/input'
self.s3_output = 's3://my_bucket/output'
self.database = 'datalake'
self.table = 'my_table'
self.endpoint = "'" + endpoint + "'"
self.init_date = "'" + init_date + "'"
self.end_date = "'" + end_date + "'"
self.year = self.init_date[1:5]
self.month = self.init_date[6:8]
self.day = self.init_date[9:11]
self.region_name = 'us-east-1'
self.aws_access_key_id = "my_id"
self.aws_secret_access_key = "my_key"
def load_conf(self, q):
self.client = boto3.client('athena',
region_name = self.region_name,
aws_access_key_id = self.aws_access_key_id,
aws_secret_access_key= self.aws_secret_access_key)
try:
response = self.client.start_query_execution(
QueryString = q,
QueryExecutionContext={
'Database': self.database
},
ResultConfiguration={
'OutputLocation': self.s3_output,
}
)
print('Execution ID: ' + response['QueryExecutionId'])
except Exception as e:
print(e)
return response
def query(self):
self.query = "SELECT count(*) as total_requests, SUM(CASE WHEN count_endpoints > 1 THEN 1 ELSE 0 END) as total_repeated, AVG(CASE WHEN count_endpoints > 1 THEN count_endpoints END) as TRAFFIC_QUALITY FROM (SELECT * from (SELECT domain, size, device_id, ip, array_join(array_agg(distinct endpoint), ',') as endpoints_all, count(distinct endpoint) as count_endpoints FROM %s.%s WHERE year=%s and month=%s and day=%s and ts between timestamp %s and timestamp %s and status = '2' GROUP BY domain, size, device_id, ip) l1 where endpoints_all LIKE '%%' || %s || '%%') l2;" % (self.database, self.table, self.year, self.month, self.day, self.init_date, self.end_date, self.endpoint)
def run_query(self):
self.query()
queries = [self.query]
for q in queries:
#print("Executing query: %s" % (q))
res = self.load_conf(q)
try:
query_status = None
while query_status == 'QUEUED' or query_status == 'RUNNING' or query_status is None:
query_status = self.client.get_query_execution(QueryExecutionId=res["QueryExecutionId"])['QueryExecution']['Status']['State']
print(query_status + " " + self.endpoint)
if query_status == 'FAILED' or query_status == 'CANCELLED':
raise Exception('Athena query with the string "{}" failed or was cancelled'.format(query_string))
time.sleep(20)
print("Query %s finished." % (self.endpoint))
response = self.client.get_query_results(QueryExecutionId=res['QueryExecutionId'])
df = self.results_to_df(response)
df = pd.DataFrame(df)
df["endpoint"] = str(self.endpoint)
try:
df["percentaje_repeated"] = str(int(df["total_repeated"].iloc[0]) * 100 / int(df["total_requests"].iloc[0]))
except Exception as e:
print(self.endpoint + " here")
df["date"] = str(self.init_date + "-" + self.end_date)
return df
except Exception as e:
print(e + " " + endpoint)
print(df["total_repeated"].iloc[0])
print(df["total_requests"].iloc[0])
def results_to_df(self, results):
columns = [
col['Label']
for col in results['ResultSet']['ResultSetMetadata']['ColumnInfo']
]
listed_results = []
for res in results['ResultSet']['Rows'][1:]:
values = []
for field in res['Data']:
try:
values.append(list(field.values())[0])
except:
values.append(list(' '))
listed_results.append(
dict(zip(columns, values))
)
return listed_results
def func(end):
qa = QueryAthena(end, "2018-10-09 00:00:00", "2018-10-09 05:59:59")
result = qa.run_query()
return result
endpoints = ["677SRI149821","V14509674","1426R"]
if __name__ == '__main__':
pool = Pool(15)
df = pd.concat(pool.map(func, endpoints))
pool.close()
pool.join()