Question

我有一个脚本，它从AWS Athena（本质上是一个PrestoDB关系型SQL数据库）中提取了25,000条记录。假设我要为其中每一条记录生成一个请求，这意味着我必须向Athena发出25,000个请求，然后在数据返回后，再向我的Redis集群发出25,000个请求。

从Node同时向Athena发出的请求，理想数量是多少？

我之所以这样问，是因为我曾尝试创建一个包含25,000个promise的数组，然后对其调用Promise.all(promiseArray)，但应用程序就一直挂起。

所以我决定一次只发出1个请求，并使用递归：每次用splice取出第一条记录，待promise解析之后，再把剩余的记录传给同一个函数。

这样做的问题是耗时太久。我休息了大约一个小时回来，还剩下23,000条记录。

我试着用谷歌搜索Node和Athena一次能处理多少个请求，但什么也没找到。我希望有人对此有所了解，并能与我分享。

谢谢。

这是我的代码仅供参考：

顺便说一下，我想改进的地方是：不再一次只发送一个请求，而是根据执行速度一次发送4、5、6、7或8个。

此外，Node集群如何影响这样的性能？

exports.storeDomainTrends = () => {
return new Promise((resolve, reject)=>{
    athenaClient.execute(`SELECT DISTINCT the_column from "the_db"."the_table"`,
    (err, data) =>  {
        var getAndStoreDomainData = (records) => {
            if(records.length){
                return new promise((resolve, reject) => {
                    var subrecords = records.splice(0, )[0]
                    athenaClient.execute(`
                    SELECT 
                    field,
                    field,
                    field,
                    SUM(field) as field
                    FROM "the_db"."the_table"
                    WHERE the_field IN ('Month') AND the_field = '`+ record.domain_name +`'
                    GROUP BY the_field, the_field, the_field
                    `, (err, domainTrend) => {

                        if(err) {
                            console.log(err)
                            reject(err)
                        }

                        redisClient.set(('Some String' + domainTrend[0].domain_name), JSON.stringify(domainTrend))
                        resolve(domainTrend);
                    })
                })
                .then(res => {
                    getAndStoreDomainData(records);
                })
            }
        }

        getAndStoreDomainData(data);

    })
})

}

Answer 1

使用lib您的代码可能如下所示：

/**
 * Marker object for a work item that failed.
 * @param {*} failureReason - Stored unchanged on the instance as `reason`.
 */
const Fail = function (failureReason) {
  this.reason = failureReason;
};

/**
 * Tells whether a value's `constructor` is `Fail`.
 * @param {*} candidate - Any value, including null/undefined.
 * @returns {boolean} true only when `candidate.constructor === Fail`.
 */
const isFail = (candidate) => {
  // Falsy values short-circuit here and therefore never compare equal to Fail.
  const ctor = candidate && candidate.constructor;
  return ctor === Fail;
};
/**
 * Runs the "distinct domain_name" query through athenaClient.
 * @returns {Promise<*>} Resolves with the `data` argument of the Athena
 *   callback; rejects with its `err` argument when that is truthy.
 */
const distinctDomains = () => {
  const query = `SELECT DISTINCT domain_name from "endpoint_dm"."bd_mb3_global_endpoints"`;
  return new Promise((resolve, reject) => {
    const onResult = (err, data) => {
      if (err) {
        reject(err);
        return;
      }
      resolve(data);
    };
    athenaClient.execute(query, onResult);
  });
};
/**
 * Queries the monthly endpoint-count rows for one domain.
 *
 * @param {string} domain_name - Domain to filter on. Embedded single quotes
 *   are doubled so the value cannot terminate the SQL string literal.
 * @returns {Promise<*>} Resolves with the rows passed to the Athena callback;
 *   rejects with the callback's error when it is truthy.
 */
const domainDetails = domain_name => {
  // SQL escapes a quote inside a string literal by doubling it.
  const domainLiteral = String(domain_name).replace(/'/g, "''");
  return new Promise(
    (resolve,reject)=>
      athenaClient.execute(
        `SELECT 
        timeframe_end_date,
        agg_type,
        domain_name,
        SUM(endpoint_count) as endpoint_count
        FROM "endpoint_dm"."bd_mb3_global_endpoints"
        WHERE agg_type IN ('Month') AND domain_name = '${domainLiteral}'
        GROUP BY timeframe_end_date, agg_type, domain_name`,
        (err, domainTrend) =>
            (err)
              ? reject(err)
              : resolve(domainTrend)
        )
  );
};
/**
 * Promise adapter around the callback form of redisClient.set.
 * @param {*} keyValue - Forwarded unchanged as the first argument of
 *   redisClient.set (the caller in this file passes a [key, value] array).
 * @returns {Promise<*>} Resolves with the callback's reply; rejects with the
 *   callback's error when it is truthy.
 */
const redisSet = (keyValue) =>
  new Promise((resolve, reject) => {
    const onReply = (err, res) => {
      if (err) {
        reject(err);
      } else {
        resolve(res);
      }
    };
    redisClient.set(keyValue, onReply);
  });
/**
 * Processes `domains` in consecutive batches of `batchSize`. For every domain
 * it fetches the trend via domainDetails (wrapped by `limitFn`), stores it via
 * redisSet, and records either the domain itself (success) or a Fail whose
 * `reason` is `[error, domain]`.
 *
 * Curried: process(batchSize)(limitFn)(resolveValue)(domains).
 *
 * @param {number} batchSize - Number of domains started together per batch.
 * @param {Function} limitFn - Takes a function and returns a wrapped function
 *   with the same call shape; the caller uses it to cap concurrent lookups.
 * @param {Array} resolveValue - Results accumulated from earlier batches.
 * @param {Array<{domain_name: string}>} domains - Domains still to process.
 * @returns {Promise<Array>} resolveValue followed by one entry per domain, in
 *   input order. Never rejects for a per-domain failure; those become Fail items.
 */
const process = batchSize => limitFn => resolveValue => domains =>
  Promise.all(
    domains.slice(0, batchSize)
    .map(//map domains to promises
      domain =>
        limitFn(domainName => domainDetails(domainName))(domain.domain_name)
        .then(
          domainTrend =>
            redisSet([
              `Endpoint Profiles - Checkin Trend by Domain - Monthly - ${domainTrend[0].domain_name}`,
              JSON.stringify(domainTrend)
            ])
        )
        .then(
          redisReply => {
            // Must return/throw here: a bare expression statement would make
            // every success resolve to undefined and hide non-OK replies.
            if (redisReply !== "OK") {
              throw new Error(`Redis reply not OK:${redisReply}`);
            }
            return domain;
          }
        )
        .catch(//catch failed, save error and domain of failed item
          e =>
            new Fail([e, domain])
        )
    )
  ).then(
    results => {
      // The final batch may be shorter than batchSize, so report the real count.
      console.log(`got ${results.length} results`);
      const accumulated = resolveValue.concat(results);
      const left = domains.slice(batchSize);
      if (left.length === 0) {//nothing left
        return accumulated;
      }
      //recursively call process until done
      return process(batchSize)(limitFn)(accumulated)(left);
    }
  );
// Driver: fetch all distinct domains, process them in batches of 1000 with the
// Athena lookups wrapped by a throttle of 5, then split the outcome into
// successes and failures.
const max5 = lib.throttle(5);//max 5 active connections to athena
distinctDomains()//you may want to limit the results to 50 for testing
//you may want to limit batch size to 10 for testing
.then(process(1000)(max5)([]))//we have 25000 domains here
.then(
  results=>{//one entry per domain
    const successes = results.filter(x=>!isFail(x));
    //array of failed items, a failed item has a .reason property
    //  that is an array of 2 items: [the error, domain]
    const failed = results.filter(isFail);
    // Hand both lists on so they are not computed and then thrown away.
    return { successes, failed };
  }
)
.catch(
  // Without this, a failure of the initial distinctDomains query would be an
  // unhandled promise rejection.
  err=>{
    console.error("storing domain trends failed:", err);
  }
)

你应该先弄清楚redis客户端的实际行为；我试过查阅它的文档，但那和去问我的金鱼没什么两样。一旦您通过反向工程弄清了客户端的行为，最好先用较小的批量大小试运行，看看是否存在任何错误。您必须导入lib才能使用它，您可以在这里（here）找到它。

Answer 2

我采纳了Kevin B的建议，找到了一种更快的查询数据的方式。我所做的是修改查询，以便一次从Athena获取所有域的趋势数据。我按domain_name对结果进行排序，然后以Node流的形式读取，这样就可以在数据到达时把每个域名的数据分离成各自的JSON。

无论如何，这就是我最终的结果。

exports.storeDomainTrends = () => {
return new Promise((resolve, reject)=>{
    var streamObj = athenaClient.execute(`
    SELECT field,
            field,
            field,
            SUM(field) AS field
    FROM "db"."table"
    WHERE field IN ('Month')
    GROUP BY  field, field, field
    ORDER BY  field desc`).toStream();

    var data = [];

    streamObj.on('data', (record)=>{
        if (!data.length || record.field === data[0].field){
            data.push(record)
        } else if (data[0].field !== record.field){
            redisClient.set(('Key'), JSON.stringify(data))
            data = [record]
        }
    })

    streamObj.on('end', resolve);

    streamObj.on('error', reject);

})
.then()

}

Node-Express可以立即启动多少个请求？

2 个答案: