我有一个脚本,它从AWS Athena中提取了25,000条记录,这些记录基本上是一个PrestoDB关系型SQL数据库。让我们说我正在为这些记录中的每一个生成请求,这意味着我必须向Athena发出25,000个请求,然后当数据返回时,我必须向我的Redis群集发出25,000个请求。
从节点到Athena的一次请求的理想数量是多少?
我问的原因是因为我尝试通过创建一个包含25,000个承诺的数组然后在其上调用Promise.all(promiseArray)
来实现此目的,但该应用程序只是永远挂起。
所以我决定一次触发1并使用递归来拼接第一个索引,然后在解析了promise之后将剩余的记录传递给调用函数。
这个问题是它需要永远。我花了大约一个小时休息回来,剩下23,000条记录。
我试图谷歌一次有多少请求Node和Athena可以处理,但我什么也没想到。我希望有人可能对此有所了解,并能够与我分享。
谢谢。
这是我的代码仅供参考:
作为旁注,我想做的不同之处在于,我可以一次发送一个请求,而不是一次发送4,5,6,7或8个,具体取决于它的执行速度。
此外,Node集群如何影响这样的性能?
exports.storeDomainTrends = () => {
return new Promise((resolve, reject)=>{
athenaClient.execute(`SELECT DISTINCT the_column from "the_db"."the_table"`,
(err, data) => {
var getAndStoreDomainData = (records) => {
if(records.length){
return new promise((resolve, reject) => {
var subrecords = records.splice(0, )[0]
athenaClient.execute(`
SELECT
field,
field,
field,
SUM(field) as field
FROM "the_db"."the_table"
WHERE the_field IN ('Month') AND the_field = '`+ record.domain_name +`'
GROUP BY the_field, the_field, the_field
`, (err, domainTrend) => {
if(err) {
console.log(err)
reject(err)
}
redisClient.set(('Some String' + domainTrend[0].domain_name), JSON.stringify(domainTrend))
resolve(domainTrend);
})
})
.then(res => {
getAndStoreDomainData(records);
})
}
}
getAndStoreDomainData(data);
})
})
}
答案 0 :(得分:1)
使用lib您的代码可能如下所示:
const Fail = function(reason){this.reason=reason;};
const isFail = x=>(x&&x.constructor)===Fail;
const distinctDomains = () =>
new Promise(
(resolve,reject)=>
athenaClient.execute(
`SELECT DISTINCT domain_name from "endpoint_dm"."bd_mb3_global_endpoints"`,
(err,data)=>
(err)
? reject(err)
: resolve(data)
)
);
const domainDetails = domain_name =>
new Promise(
(resolve,reject)=>
athenaClient.execute(
`SELECT
timeframe_end_date,
agg_type,
domain_name,
SUM(endpoint_count) as endpoint_count
FROM "endpoint_dm"."bd_mb3_global_endpoints"
WHERE agg_type IN ('Month') AND domain_name = '${domain_name}'
GROUP BY timeframe_end_date, agg_type, domain_name`,
(err, domainTrend) =>
(err)
? reject(err)
: resolve(domainTrend)
)
);
const redisSet = keyValue =>
new Promise(
(resolve,reject)=>
redisClient.set(
keyValue,
(err,res)=>
(err)
? reject(err)
: resolve(res)
)
);
const process = batchSize => limitFn => resolveValue => domains =>
Promise.all(
domains.slice(0,batchSize)
.map(//map domains to promises
domain=>
//maximum 5 active connections
limitFn(domainName=>domainDetails(domainName))(domain.domain_name)
.then(
domainTrend=>
//the redis client documentation makes no sense whatsoever
//https://redis.io/commands/set
//no mention of a callback
//https://github.com/NodeRedis/node_redis
//mentions a callback, since we need the return value
//and best to do it async we will use callback to promise
redisSet([
`Endpoint Profiles - Checkin Trend by Domain - Monthly - ${domainTrend[0].domain_name}`,
JSON.stringify(domainTrend)
])
)
.then(
redisReply=>{
//here is where things get unpredictable, set is documented as
// a synchronous function returning "OK" or a function that
// takes a callback but no mention of what that callback recieves
// as response, you should try with one or two records to
// finish this on reverse engineering because documentation
// fails 100% here and can not be relied uppon.
console.log("bad documentation of redis client... reply is:",redisReply);
(redisReply==="OK")
? domain
: Promise.reject(`Redis reply not OK:${redisReply}`)
}
)
.catch(//catch failed, save error and domain of failed item
e=>
new Fail([e,domain])
)
)
).then(
results=>{
console.log(`got ${batchSize} results`);
const left = domains.slice(batchSize);
if(left.length===0){//nothing left
return resolveValue.conat(results);
}
//recursively call process untill done
return process(batchSize)(limitFn)(resolveValue.concat(results))(left)
}
);
const max5 = lib.throttle(5);//max 5 active connections to athena
distinctDomains()//you may want to limit the results to 50 for testing
//you may want to limit batch size to 10 for testing
.then(process(1000)(max5)([]))//we have 25000 domains here
.then(
results=>{//have 25000 results
const successes = results.filter(x=>!isFail(x));
//array of failed items, a failed item has a .reason property
// that is an array of 2 items: [the error, domain]
const failed = results.filter(isFail);
}
)
你应该弄清楚redis客户端做了什么,我试图用文档来解决它,但也可以问我的金鱼。一旦您反向设计了客户端行为,最好尝试使用小批量大小来查看是否存在任何错误。您必须导入lib才能使用它,您可以找到它here。
答案 1 :(得分:1)
我能够采取Kevin B所说的找到更快捷的方式来查询数据。我所做的是更改查询,以便我可以从Athena获取所有域的趋势。我通过domain_name对它进行了排序,然后将其作为节点流发送,以便我可以在数据进入时将每个域名分离为其自己的JSON。
无论如何,这就是我最终的结果。
exports.storeDomainTrends = () => {
return new Promise((resolve, reject)=>{
var streamObj = athenaClient.execute(`
SELECT field,
field,
field,
SUM(field) AS field
FROM "db"."table"
WHERE field IN ('Month')
GROUP BY field, field, field
ORDER BY field desc`).toStream();
var data = [];
streamObj.on('data', (record)=>{
if (!data.length || record.field === data[0].field){
data.push(record)
} else if (data[0].field !== record.field){
redisClient.set(('Key'), JSON.stringify(data))
data = [record]
}
})
streamObj.on('end', resolve);
streamObj.on('error', reject);
})
.then()
}