How to get word timing information from Google Speech-to-Text in streaming mode

Time: 2018-11-30 11:57:09

Tags: google-api google-cloud-speech

I need to perform speech recognition on an audio stream, and Google's Speech-to-Text service looked promising. After some struggling with the documentation and the samples, I managed to capture microphone audio using the Node.js library provided by Google and sox on OS X.

Recognition works fine, but unfortunately I cannot find a way to get word timing data, which according to the documentation should just be a matter of setting the enableWordTimeOffsets option to true.

This only happens in streaming mode; the simple file-upload example gives the expected result.
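
To be clear about what I mean by word timing data: with enableWordTimeOffsets turned on, each alternative should carry a words array, and (going by what the non-streaming call returns) each entry looks roughly like this, with seconds coming back as a string and nanos as a number:

// Approximate shape of one WordInfo entry, for illustration only
const exampleWordInfo = {
  word: 'ciao',
  startTime: { seconds: '1', nanos: 200000000 },
  endTime: { seconds: '1', nanos: 800000000 }
}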

So, has anyone managed to get word timing data out of streaming recognition? Could this be a glitch in the JavaScript client library for the Google API?

Thanks

Davide

PS: in case it helps, the code of the streaming example (it's dirty, it's just a test) is here:

var cp = null
var spawn = require('child_process').spawn

function startRecording (options) {
  cp = null // Empty out possibly dead recording process

  var defaults = {
    sampleRate: 16000,
    channels: 1,
    compress: false,
    threshold: 0,
    thresholdStart: null,
    thresholdEnd: null,
    silence: '1.0',
    verbose: false,
    recordProgram: 'sox'
  }

  options = Object.assign({}, defaults, options) // don't mutate the defaults object

  // Capture audio stream
  var cmdOptions
  var cmd = 'sox';
  var cmdArgs = [
    '-q',                     // show no progress
    '-d',                     // use default recording device
    '-r', options.sampleRate, // sample rate
    '-c', options.channels,   // channels
    '-t', '.raw',             // output type: raw PCM
    // '-e', 'signed-integer',   // sample encoding
    '-b', '16',               // precision (bits)
    '-',                      // pipe
  ];

  // Spawn audio capture command
  cmdOptions = { encoding: 'binary' }
  if (options.device) {
    cmdOptions.env = Object.assign({}, process.env, { AUDIODEV: options.device })
  }

  cp = spawn(cmd, cmdArgs, cmdOptions)
  var rec = cp.stdout

  if (options.verbose) {
    console.log('Recording', options.channels, 'channels with sample rate',
        options.sampleRate + '...')
    console.time('End Recording')

    rec.on('data', function (data) {
      console.log('Recording %d bytes', data.length)
    })

    rec.on('end', function () {
      console.timeEnd('End Recording')
    })
  }

  return rec
}

function stopRecording () {
  if (!cp) {
    console.log('Please start a recording first')
    return false
  }

  cp.kill() // Exit the spawned process, exit gracefully
  return cp
}

setTimeout(()=>{stopRecording()}, 10000)

const encoding = 'LINEAR16';
const sampleRateHertz = 16000;
const languageCode = 'it-IT';

function microphoneStream() { // (encoding, sampleRateHertz, languageCode) {
  // [START micStreamRecognize]

  // Imports the Google Cloud client library
  const speech = require('@google-cloud/speech');

  const config = {
    encoding: encoding,
    sampleRateHertz: sampleRateHertz,
    languageCode: languageCode,
    enableWordTimeOffsets: true
  };

  const request = {
    config,
    interimResults: true, //Get interim results from stream
    enableWordTimeOffsets: true
  };

  // Creates a client
  const client = new speech.SpeechClient();

  // Create a recognize stream
  const recognizeStream = client
    .streamingRecognize(request)
    .on('error', console.error)
    .on('data', data =>
      //process.stdout.write(
        // data.results[0] && data.results[0].alternatives[0]
        //   ? `Transcription: ${data.results[0].alternatives[0].transcript}\n`
        //  : `\n\nReached transcription time limit, press Ctrl+C\n`
        console.log(data)
      //)
    );

  // Start recording and send the microphone input to the Speech API
  startRecording({
      sampleRateHertz: sampleRateHertz,
      threshold: 0.5, //silence threshold
      recordProgram: 'sox', // Try also "arecord" or "sox"
      silence: '5000.0', //seconds of silence before ending
    })
    .pipe(recognizeStream);

  console.log('Listening, press Ctrl+C to stop.');
  // [END micStreamRecognize]
}

microphoneStream();
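
Just to show what I am hoping to end up with, this is the kind of handler I would expect to use once the offsets arrive. It is an untested sketch: it reuses startRecording from the script above and assumes that final streaming results expose the same alternatives[0].words array of WordInfo objects as the non-streaming response does.

// Untested sketch: same streaming config as above, but the data handler
// digs into alternatives[0].words of every final result instead of
// dumping the whole response object.
const speech = require('@google-cloud/speech');
const client = new speech.SpeechClient();

const request = {
  config: {
    encoding: 'LINEAR16',
    sampleRateHertz: 16000,
    languageCode: 'it-IT',
    enableWordTimeOffsets: true
  },
  interimResults: true
};

const recognizeStream = client
  .streamingRecognize(request)
  .on('error', console.error)
  .on('data', data => {
    (data.results || []).forEach(result => {
      if (!result.isFinal || !result.alternatives.length) return;
      (result.alternatives[0].words || []).forEach(w => {
        // startTime/endTime are durations of the form { seconds, nanos }
        const start = Number(w.startTime.seconds) + w.startTime.nanos / 1e9;
        const end = Number(w.endTime.seconds) + w.endTime.nanos / 1e9;
        console.log(`${w.word}: ${start.toFixed(2)}s -> ${end.toFixed(2)}s`);
      });
    });
  });

startRecording({ sampleRateHertz: 16000 }).pipe(recognizeStream);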

And here is the non-streaming one:

// Imports the Google Cloud client library
const speech = require('@google-cloud/speech');
const fs = require('fs');

// Creates a client
const client = new speech.SpeechClient();

// The name of the audio file to transcribe
const fileName = './test.raw';

// Reads a local audio file and converts it to base64
const file = fs.readFileSync(fileName);
const audioBytes = file.toString('base64');

// The audio file's encoding, sample rate in hertz, and BCP-47 language code
const audio = {
  content: audioBytes,
};
const config = {
  encoding: 'LINEAR16',
  sampleRateHertz: 16000,
  languageCode: 'it-IT',
  enableAutomaticPunctuation: true,
  enableWordTimeOffsets: true
};
const request = {
  audio: audio,
  config: config,
};

// Detects speech in the audio file
client
  .recognize(request)
  .then(data => {
    console.log(data)
    const response = data[0];
    const transcription = response.results
      .map(result => result.alternatives[0].transcript)
      .join('\n');
    console.log(`Transcription: ${transcription}`);
  })
  .catch(err => {
    console.error('ERROR:', err);
  });

0 Answers:

There are no answers yet.