Efficiently reading, processing and inserting data with pg-promise and pg-query-stream

Date: 2019-02-22 16:22:24

Tags: node.js stream pg-promise node-streams

I am doing the following:

  1. Query a large table with a GROUP BY query to summarise values.
  2. Run those records through a routine to add some additional data.
  3. Insert them into the database efficiently.

I am trying to do this with pg-query-stream, reading the data as a stream and counting the records into batches of, say, 1000 at a time; once the batch limit is reached, the records are inserted using pg-promise's pgp.helpers.insert.
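For reference, the kind of multi-row insert being generated looks roughly like the sketch below; the table and column names are placeholders, not my real schema.

const pgp = require('pg-promise')()

// Hypothetical ColumnSet describing the target table and its columns
const insertColumnSet = new pgp.helpers.ColumnSet(['account_id', 'total'], { table: 'account_totals' })

const records = [
  { account_id: 1, total: 100 },
  { account_id: 2, total: 250 }
]

// helpers.insert generates a single multi-row INSERT statement as text
let query = pgp.helpers.insert(records, insertColumnSet)

// An optional conflict clause can be appended as plain SQL
query += ' ON CONFLICT (account_id) DO NOTHING'

// db.none(query) would then run the insert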

My problem is that I cannot quite work out how to pause the stream correctly so that the insert completes before the stream continues, particularly in the on('end') handler.
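In rough terms, the behaviour I am after inside the data handler is something like the sketch below, where insertBatch is just a stand-in for the pg-promise insert, not working code:

stream.on('data', async (record) => {
  records.push(recordMapper(record))

  if (records.length === options.batchSize) {
    stream.pause()              // stop 'data' events while the insert runs
    await insertBatch(records)  // stand-in for db.none(pgp.helpers.insert(...))
    records = []
    stream.resume()             // carry on reading from the query stream
  }
})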

The code I have tried is below:

const { performance } = require('perf_hooks')
const QueryStream = require('pg-query-stream')

const batchInsertData = (tenant, stream, records, insertColumnSet, options = {}) => {
  stream.pause()
  const t0 = performance.now()
  let query = tenant.db.$config.pgp.helpers.insert(records, insertColumnSet)

  if (options.onConflictExpression) {
    query += options.onConflictExpression
  }

  tenant.db.none(query)
    .then(() => {
      let t1 = performance.now()
      console.log('Inserted ' + records.length + ' records done in ' + ((t1 - t0) / 1000) + ' (seconds).')
      stream.resume()
    })
    .catch(error => {
      throw error
    })
}

module.exports = (tenant, sql, columnSet, recordMapper, options = {}) => {
  try {
    return new Promise((resolve, reject) => {
      const query = new QueryStream(sql)

      // Set options as required
      options.batchSize = parseInt(options.batchSize) || 1000
      options.onConflictExpression = options.onConflictExpression || null

      let records = []
      let batchNumber = 1
      let recordCount = 0

      let t0 = performance.now()
      tenant.db.stream(query, (stream) => {
        stream.on('data', (record) => {
          const mappedRecord = recordMapper(record)
          records.push(mappedRecord)
          recordCount++

          if (records.length === options.batchSize) {
            batchInsertData(tenant, stream, records, columnSet, options)
            records = []
            console.log(`Batch ${batchNumber} done`)
            batchNumber++
          }
        })
        stream.on('end', () => {
        // If any records are left that are not part of a batch insert here.
          if (records.length !== 0) {
            batchInsertData(tenant, stream, records, columnSet, options)
            records = []
            console.log(`Batch ${batchNumber} done`)
            batchNumber++
            console.log('Total Records: ' + recordCount)
            let t1 = performance.now()
            console.log('Duration:', ((t1 - t0) / 1000) + ' (seconds).')
          } else {
            console.log('Total Records: ' + recordCount)
            let t1 = performance.now()
            console.log('Duration:', ((t1 - t0) / 1000) + ' (seconds).')
          }
        })
        stream.on('error', (error) => {
          throw error
        })
      })
        .then(data => {
          resolve()
        })
        .catch(error => {
          console.log('ERROR:', error)
          reject(error)
        })
    })
  } catch (err) {
    throw err
  }
}

I am not sure whether the approach I am attempting is the best one. I have tried several different variations based on the documentation I can find around pg-promise and streams, but no joy so far.

Any help or suggestions would be greatly appreciated.

Thanks

Paul

Attempt 2

Below is my second attempt, using getNextData and sequence as described on the Data Imports page. I am struggling to work out how to hook the stream into this so that only one batch of data is pulled at a time before inserting.

const { performance } = require('perf_hooks')
const QueryStream = require('pg-query-stream')

module.exports = (tenant, sql, columnSet, recordMapper, options = {}) => {

  try {
    // Set options as required
    options.batchSize = parseInt(options.batchSize) || 1000
    options.onConflictExpression = options.onConflictExpression || null

    const query = new QueryStream(sql)

    function getNextData(transaction, index) {
      return new Promise(async (resolve, reject) => {
        if (index < options.batchSize) {
          let count = 1
          await transaction.stream(query, async (stream) => {
            let records = []
            await tenant.db.$config.pgp.spex.stream.read.call(transaction, stream, function (streamIndex, streamData) {  
              stream.resume()
              count++
              console.log(count, streamIndex, streamData)        

              records.push(streamData[0])

              if (records.length === options.batchSize) {
                stream.pause()
                resolve(records)
              }
            }, {readChunks: true})

          })  
        }
        resolve(null)
      })
    }

    return tenant.db.tx('massive-insert', (transaction) => {
      return transaction.sequence((index) => {          
        return getNextData(transaction, index)
          .then((records) => {
            if (records > 0) {
              let query = tenant.db.$config.pgp.helpers.insert(records, columnSet)

              if (options.onConflictExpression) {
                query += options.onConflictExpression
              }

              const i0 = performance.now()
              return transaction.none(query)
                .then(() => {
                  let i1 = performance.now()
                  console.log('Inserted ' + records.length + ' records done in ' + ((i1 - i0) / 1000) + ' (seconds).')
                })
            }
          })
      })
    })
  } catch (err) {
    throw err
  }
}

1 Answer:

Answer 0 (score: 1)

I got this working with a slightly different approach, focused more on using streams directly, while still using pg-promise to handle the database side.

const BatchStream = require('batched-stream')
const { performance } = require('perf_hooks')
const { Transform, Writable } = require('stream')

module.exports = async (tenant, sql, columnSet, recordMapper, options = {}) => {

  try {
    // Set options as required
    options.batchSize = parseInt(options.batchSize) || 1000
    options.onConflictExpression = options.onConflictExpression || null

    const query = new tenant.lib.QueryStream(sql)

    const stream = tenant.db.client.query(query)

    return new Promise((resolve, reject) => {
      // We want to process this in batches
      const batch = new BatchStream({size : options.batchSize, objectMode: true, strictMode: false})

      // We use a write stream to insert the batch into the database
      let insertDatabase = new Writable({
        objectMode: true,
        write(records, encoding, callback) {
          (async () => {

            try {
              /*
                If we have a record mapper then do it here prior to inserting the records.
                This way is much quicker than doing it as a transform stream below by
                about 10 seconds for 100,000 records
              */
              if (recordMapper) {
                records = records.map(record => recordMapper(record))
              }

              let query = tenant.lib.pgp.helpers.insert(records, columnSet)

              if (options.onConflictExpression) {
                query += options.onConflictExpression
              }

              const i0 = performance.now()
              await tenant.db.none(query)
              const i1 = performance.now()
              console.log('Inserted ' + records.length + ' records in ' + ((i1 - i0) / 1000) + ' (seconds).')

            } catch(e) {
              return callback(e)
            }

            callback()
          })()
        }
      })

      // Process the stream
      const t0 = performance.now()
      stream
        // Break it down into batches
        .pipe(batch)
        // Insert those batches into the database
        .pipe(insertDatabase)
        // Once we get here we are done :)
        .on('finish', () => {
          const t1 = performance.now()
          console.log('Finished insert in ' + ((t1 - t0) / 1000) + ' (seconds).')
          resolve()
        })
        .on('error', (error) => {
          reject(error)
        })

    })
  } catch (err) {
    throw err
  }
}
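
For completeness, a call to this module might look roughly like the sketch below; the tenant object, table, SQL and mapper are placeholders for whatever your own setup provides.

// Assumed to run inside an async function, with `tenant` already set up
const streamingInsert = require('./streaming-insert') // this module

const columnSet = new tenant.lib.pgp.helpers.ColumnSet(
  ['account_id', 'total'],
  { table: 'account_totals' }
)

await streamingInsert(
  tenant,
  'SELECT account_id, SUM(amount) AS total FROM transactions GROUP BY account_id',
  columnSet,
  record => ({ ...record, imported_at: new Date() }), // optional per-record mapper
  { batchSize: 1000, onConflictExpression: ' ON CONFLICT (account_id) DO NOTHING' }
)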