使用AWS Athena-Boto3从PYTHON脚本获取数据时遇到问题

时间:2018-04-16 10:29:09

标签: python amazon-web-services asynchronous boto3 amazon-athena

我正在尝试查询s3存储桶中的数据集,在boto3函数的帮助下通过python脚本使用Athena查询。

我使用 start_query_execution() 来运行查询,它执行得很好。然后,为了在我的 python 脚本中获取并访问查询结果,我使用了 get_query_results() 函数。

现在如果我分别运行这两个函数(一个接一个脚本),我得到的数据是Athena查询的输出。我希望它们用一个脚本编写 - 比如,从s3获取数据并开始使用python代码操作查询输出。

由于查询本质上是异步(async)的,我使用了线程池技术来等待 Athena 查询执行完成。但是,如果我运行以下代码,查询状态一直显示为正在运行(RUNNING)。

我想我犯了某个低级错误,因为如果我分开单独运行这两个函数,是能得到所需输出的。简而言之,我想用 Athena 查询 s3 中存在的数据,然后在 python 脚本中对获取到的数据做进一步处理,因此采用了这种方法。请帮忙。

以下是示例代码

#!/usr/bin/env python3

import boto3
import time
from functools import partial
from multiprocessing.dummy import Pool
# Thread pool (multiprocessing.dummy = threads, not processes) used to
# submit the Athena query asynchronously; one worker is enough here.
pool = Pool(processes=1)

# def async_function(name):
#     time.sleep(1)
#     return name
#
# def callback_function(name, age):
#     print(name, age)

def run_query(query, database, s3_output):
    """Submit *query* to Athena and return the start_query_execution response.

    The call is asynchronous: the returned dict only carries the
    QueryExecutionId; results must be fetched separately once the
    query has actually finished.
    """
    athena = boto3.client('athena')
    execution = athena.start_query_execution(
        QueryString=query,
        QueryExecutionContext={'Database': database},
        ResultConfiguration={'OutputLocation': s3_output},
    )
    print('Execution ID: ' + execution['QueryExecutionId'])
    return execution
def show_res(res, q):
    """Callback for a completed start_query_execution call: wait for the
    query to finish, then fetch and print its result rows.

    Fixes two defects in the original: get_query_results was called
    while the query was still QUEUED/RUNNING (so the caller only ever
    saw a "running" state or an InvalidRequestException), and the row
    printout used ``'rRespone:'.join(...)``, which inserts the label
    between rows instead of printing it once.

    res -- response dict from start_query_execution (needs QueryExecutionId)
    q   -- the SQL text, echoed for logging

    Returns the get_query_results response, or None if the query
    failed or was cancelled.
    """
    client = boto3.client('athena')
    execution_id = res['QueryExecutionId']
    print("Executing query: %s" % (q))
    print('Execution ID: ' + execution_id)

    # Poll until Athena reports a terminal state; get_query_results is
    # only valid once the execution has finished.
    while True:
        state = client.get_query_execution(
            QueryExecutionId=execution_id
        )['QueryExecution']['Status']['State']
        if state not in ('QUEUED', 'RUNNING'):
            break
        time.sleep(1)

    if state != 'SUCCEEDED':
        print('Query %s ended in state %s' % (execution_id, state))
        return None

    response = client.get_query_results(QueryExecutionId=execution_id)
    for row in response['ResultSet']['Rows']:
        print('Response: ' + str(row))
    return response

# for age, name in enumerate(['jack', 'jill', 'james']):
#     new_callback_function = partial(callback_function, age=age)
#     pool.apply_async(
#         async_function,
#         args=[name],
#         callback=new_callback_function
#     )

# Athena configuration
s3_input = 's3://dummy/'
s3_ouput = 's3://dummy/results/'
database = 'dummy'
table = 'dummy'

# Query definitions
query_1 = "SELECT * FROM %s.%s where sex = 'F';" % (database, table)
query_2 = "SELECT * FROM %s.%s where age > 30;" % (database, table)

# Submit each query asynchronously; show_res fires as the callback once
# start_query_execution returns (query_2 is defined but not dispatched).
queries = [query_1]
for q in queries:
    print("Executing query: %s" % (q))
    pool.apply_async(
        run_query,
        args=[q, database, s3_ouput],
        callback=partial(show_res, q=q),
    )

pool.close()
pool.join()

1 个答案:

答案 0 :(得分:0)

代替使用 apply_async 尝试:

pool = Pool(cores)
df = pd.concat(pool.map(func, [value_1,...,value_n]))
pool.close()
pool.join()

下面是我写的代码,它运行得很好,希望其中的一些代码行你可以复用。基本上,我在"同一"时间在 Athena 中运行多个查询(我对名为 endpoints 的数组做了并行化),并将每个查询的结果存储为 Pandas 数据帧中的一行。另外,你可以获取每个查询的数据;我还添加了状态打印,这样就能看到每个查询的执行状态。请记住,Athena 对可以同时运行的查询数量有限制。

import time
import boto3
import pandas as pd 
from multiprocessing import Pool

class QueryAthena:
    """Run one Athena query for a single endpoint/date window and return
    the result as a pandas DataFrame (one instance per endpoint when
    fanning out with a Pool; entry point is run_query()).
    """

    def __init__(self, endpoint, init_date, end_date):
        # SECURITY NOTE(review): credentials are hard-coded below as in
        # the original answer; prefer the default boto3 credential chain
        # (env vars, ~/.aws/credentials, IAM role) in real code.
        self.s3_input = 's3://my_bucket/input'
        self.s3_output = 's3://my_bucket/output'
        self.database = 'datalake'
        self.table = 'my_table'
        # Values are pre-quoted for direct interpolation into the SQL text.
        self.endpoint = "'" + endpoint + "'"
        self.init_date = "'" + init_date + "'"
        self.end_date = "'" + end_date + "'"
        # Partition keys sliced out of the quoted start date,
        # e.g. "'2018-10-09 ...'" -> year '2018', month '10', day '09'.
        self.year = self.init_date[1:5]
        self.month = self.init_date[6:8]
        self.day = self.init_date[9:11]
        self.region_name = 'us-east-1'
        self.aws_access_key_id = "my_id"
        self.aws_secret_access_key = "my_key"

    def load_conf(self, q):
        """Create the Athena client and submit query text *q*.

        Returns the start_query_execution response.  Re-raises any
        submission error instead of swallowing it — the original printed
        the exception and then hit a NameError returning the unassigned
        ``response`` variable.
        """
        self.client = boto3.client(
            'athena',
            region_name=self.region_name,
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key)
        try:
            response = self.client.start_query_execution(
                QueryString=q,
                QueryExecutionContext={
                    'Database': self.database
                    },
                ResultConfiguration={
                    'OutputLocation': self.s3_output,
                    }
            )
        except Exception as e:
            print(e)
            raise
        print('Execution ID: ' + response['QueryExecutionId'])
        return response

    def query(self):
        """Build the SQL text, store it in self.query_string, and return it.

        The original assigned the string to ``self.query``, clobbering
        this very method on first call; the string now lives in a
        separate attribute.
        """
        self.query_string = "SELECT count(*) as total_requests, SUM(CASE WHEN count_endpoints > 1 THEN 1 ELSE 0 END) as total_repeated, AVG(CASE WHEN count_endpoints > 1 THEN count_endpoints END) as TRAFFIC_QUALITY FROM (SELECT * from (SELECT  domain, size, device_id, ip, array_join(array_agg(distinct endpoint), ',') as endpoints_all, count(distinct endpoint) as count_endpoints FROM %s.%s WHERE year=%s and month=%s and day=%s and ts between timestamp %s and timestamp %s and status = '2' GROUP BY domain, size, device_id, ip) l1 where endpoints_all LIKE '%%' || %s || '%%') l2;" % (self.database, self.table, self.year, self.month, self.day, self.init_date, self.end_date, self.endpoint)
        return self.query_string

    def run_query(self):
        """Submit the query, poll until it finishes, and return a one-row
        DataFrame with the metrics plus endpoint/date columns.

        Returns None when the query fails or is cancelled.
        """
        res = self.load_conf(self.query())
        try:
            while True:
                query_status = self.client.get_query_execution(
                    QueryExecutionId=res["QueryExecutionId"]
                )['QueryExecution']['Status']['State']
                print(query_status + " " + self.endpoint)
                if query_status == 'FAILED' or query_status == 'CANCELLED':
                    # Original referenced an undefined `query_string` here.
                    raise Exception(
                        'Athena query with the string "{}" failed or was cancelled'
                        .format(self.query_string))
                if query_status != 'QUEUED' and query_status != 'RUNNING':
                    break
                time.sleep(20)
            print("Query %s finished." % (self.endpoint))

            response = self.client.get_query_results(
                QueryExecutionId=res['QueryExecutionId'])
            df = pd.DataFrame(self.results_to_df(response))
            df["endpoint"] = str(self.endpoint)
            try:
                df["percentaje_repeated"] = str(
                    int(df["total_repeated"].iloc[0]) * 100
                    / int(df["total_requests"].iloc[0]))
            except Exception:
                # Empty result set or non-numeric cell; keep the row anyway.
                print(self.endpoint + " here")
            df["date"] = str(self.init_date + "-" + self.end_date)
            return df

        except Exception as e:
            # str(e): the original did `e + " " + endpoint`, which raises
            # TypeError and referenced an undefined bare `endpoint`.
            print(str(e) + " " + self.endpoint)
            return None

    def results_to_df(self, results):
        """Convert a get_query_results response into a list of row dicts.

        The first row of ResultSet.Rows holds the column headers and is
        skipped; empty fields (NULLs) become the placeholder [' '] that
        the original produced via its bare except.
        """
        columns = [
            col['Label']
            for col in results['ResultSet']['ResultSetMetadata']['ColumnInfo']
        ]

        listed_results = []
        for row in results['ResultSet']['Rows'][1:]:
            values = []
            for field in row['Data']:
                if field:
                    values.append(list(field.values())[0])
                else:
                    values.append(list(' '))
            listed_results.append(dict(zip(columns, values)))

        return listed_results

def func(end):
    """Pool.map worker: run the Athena metrics query for one endpoint
    and return its result DataFrame."""
    worker = QueryAthena(end, "2018-10-09 00:00:00", "2018-10-09 05:59:59")
    return worker.run_query()

# Endpoints to query; each becomes one independent, parallel Athena query.
endpoints = ["677SRI149821","V14509674","1426R"]

if __name__ == '__main__':
    # 15 worker processes; keep this within the account's Athena
    # concurrent-query limit or the extra queries will fail.
    pool = Pool(15)
    # One DataFrame row per endpoint, concatenated into a single frame.
    df = pd.concat(pool.map(func, endpoints))
    pool.close()
    pool.join()