I want to build a speech-to-text application using the Google Speech engine and a Go HTTP server. Everything works as expected, but I am facing one issue and cannot find what I am doing wrong: when I start speaking, the Google Speech engine returns results for only 3-4 seconds; after that I have to start again, or the one-minute limit expires and I have to restart the stream. I am a beginner in Go and have already spent two days debugging this. Please help me.
Thanks in advance.
Go source: main.go and the vrecognize package
package main
import(
"fmt"
"log"
"net/http"
"strings"
"vrecognize"
gmux "github.com/gorilla/mux"
"github.com/gorilla/websocket"
speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
)
var upgrader = websocket.Upgrader{} // use default options
var voiceRecognize = new(vrecognize.VoiceRecognize)
var voiceStream vrecognize.VoiceStream
func main() {
mux := gmux.NewRouter().StrictSlash(true)
mux.HandleFunc("/echo", echo)
mux.PathPrefix("/").Handler(http.FileServer(http.Dir("./public")))
http.ListenAndServe(":8080", mux)
}
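// echo upgrades the HTTP request to a WebSocket and handles text commands (echo/start/stop) and binary audio frames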
func echo(w http.ResponseWriter, r *http.Request) {
conn, err := upgrader.Upgrade(w, r, nil)
if err != nil {
log.Print("upgrade:", err)
return
}
defer conn.Close()
voiceRecognize := voiceRecognize.NewRecongnize(1, 48000, "en_US")
var stream speechpb.Speech_StreamingRecognizeClient
var msgStr string
for {
mt, message, err := conn.ReadMessage()
if err != nil {
log.Println("read:", err)
return
}
// Message received as Text
if mt == websocket.TextMessage {
//Convert byte to string
msgStr = string(message)
//check whether the received message is an "echo" text command
isEcho := strings.HasPrefix(msgStr, "echo")
//if so, strip the "echo" prefix
if isEcho {
msgStr = msgStr[4:]
}
//on "start", initialize the audio stream
if msgStr == "start" {
stream = voiceStream.InitAudio(voiceRecognize.Encoding, voiceRecognize.SampleRateHertz, voiceRecognize.LanguageCode)
if stream == nil {
fmt.Println("initAudio failed!!!")
conn.WriteMessage(websocket.TextMessage, []byte("speechInitFail"))
break
} else {
//goroutine that listens for recognition results
go voiceStream.GetResults(&stream, conn, voiceRecognize)
}
} else if msgStr == "stop" { //on "stop", close the send side of the stream
if err := stream.CloseSend(); err != nil {
log.Printf("Could not close stream: %v", err)
break
}
} else if isEcho { //echo message
log.Printf("recv: %s", msgStr)
err = conn.WriteMessage(mt, []byte(msgStr))
if err != nil {
log.Println("write: ", err)
break
}
} else {
fmt.Println("no handling for: ", string(message))
}
} else if mt == websocket.BinaryMessage {
//Send voice to process
voiceStream.SendData(&stream, &message)
}
}
}
---------------------------
package vrecognize
import (
"io"
"log"
"cloud.google.com/go/speech/apiv1"
"github.com/gorilla/websocket"
"golang.org/x/net/context"
speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
)
//VoiceStream : voice-related configuration
type VoiceStream struct {
}
//GetResults : read stream results whenever they become available
func (vs VoiceStream) GetResults(stream *speechpb.Speech_StreamingRecognizeClient, conn *websocket.Conn, vr *VoiceRecognize) {
defer func() {
if r := recover(); r != nil {
log.Println("In getResults got panic: ", r)
*stream = vs.InitAudio(vr.Encoding, vr.SampleRateHertz, vr.LanguageCode)
}
}()
for {
resp, err := (*stream).Recv()
if err == io.EOF {
log.Printf("Receiver EOF: %v", err)
conn.WriteMessage(websocket.TextMessage, []byte("MinuteDone"))
return
}
if err != nil {
log.Printf("Cannot stream results: %v", err)
conn.WriteMessage(websocket.TextMessage, []byte("VoiceInterrupted"))
return
}
if err := resp.Error; err != nil {
log.Printf("Could not recognize: %v", err)
conn.WriteMessage(websocket.TextMessage, []byte("NotRecognize"))
}
for _, result := range resp.Results {
for _, altr := range result.GetAlternatives() {
msg := altr.GetTranscript()
log.Printf("Result: %+v\n", msg)
conn.WriteMessage(websocket.TextMessage, []byte(msg))
}
}
//log.Printf("Loop last...")
}
}
//InitAudio - initializes the streaming recognition client and sends the audio configuration
func (vs VoiceStream) InitAudio(audioEnco speechpb.RecognitionConfig_AudioEncoding, sampleRate int32, lang string) speechpb.Speech_StreamingRecognizeClient {
//log.Printf("Speech Init strat...")
ctx := context.Background()
client, err := speech.NewClient(ctx)
if err != nil {
log.Println(err)
return nil
}
stream, err2 := client.StreamingRecognize(ctx)
if err2 != nil {
log.Println(err2)
return nil
}
//Send the initial configuration request first
err = stream.Send(&speechpb.StreamingRecognizeRequest{
StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{
StreamingConfig: &speechpb.StreamingRecognitionConfig{
Config: &speechpb.RecognitionConfig{
Encoding: audioEnco,
SampleRateHertz: sampleRate,
LanguageCode: lang,
},
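// return interim (non-final) hypotheses while recognition is still in progress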
InterimResults: true,
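// the server detects a single spoken utterance and stops recognizing once it ends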
SingleUtterance: true,
},
},
})
if err != nil {
log.Println(err)
return nil
}
//log.Printf("Speech Init finished...")
return stream
}
//SendData sends an audio chunk to the Speech API
func (vs VoiceStream) SendData(stream *speechpb.Speech_StreamingRecognizeClient, message *[]byte) {
//log.Printf("SendData Called ...")
if err := (*stream).Send(&speechpb.StreamingRecognizeRequest{
StreamingRequest: &speechpb.StreamingRecognizeRequest_AudioContent{
AudioContent: *message,
},
}); err != nil {
log.Printf("Could not send audio: %v", err)
}
return
}
//VoiceRecognize holds the recognition configuration
type VoiceRecognize struct {
Encoding speechpb.RecognitionConfig_AudioEncoding
SampleRateHertz int32
LanguageCode string
}
//NewRecongnize : builds a VoiceRecognize from the given encoding, sample rate, and language code
func (vr VoiceRecognize) NewRecongnize(audioencoding, HertzRate int32,
language string) *VoiceRecognize {
var Encoding speechpb.RecognitionConfig_AudioEncoding
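// map the numeric selector to the corresponding Speech API encoding constant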
switch audioencoding {
case 0:
// Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT].
Encoding = speechpb.RecognitionConfig_ENCODING_UNSPECIFIED
case 1:
// Uncompressed 16-bit signed little-endian samples (Linear PCM).
Encoding = speechpb.RecognitionConfig_LINEAR16
case 2:
// [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
// Codec) is the recommended encoding because it is
// lossless--therefore recognition is not compromised--and
// requires only about half the bandwidth of `LINEAR16`.
Encoding = speechpb.RecognitionConfig_FLAC
case 3:
// 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
Encoding = speechpb.RecognitionConfig_MULAW
case 4:
// Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
Encoding = speechpb.RecognitionConfig_AMR
case 5:
// Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
Encoding = speechpb.RecognitionConfig_AMR_WB
case 6:
// Opus encoded audio frames in Ogg container
// ([OggOpus](https://wiki.xiph.org/OggOpus)).
// `sample_rate_hertz` must be 16000.
Encoding = speechpb.RecognitionConfig_OGG_OPUS
default:
// Uncompressed 16-bit signed little-endian samples (Linear PCM).
Encoding = speechpb.RecognitionConfig_LINEAR16
}
return &VoiceRecognize{Encoding, HertzRate, language}
}
And the index.html code:
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Golang WebSocket</title>
</head>
<body>
<form>
<input id="message" type="text" value="What are you doing">
<input onclick="wsConnect();" id="connectBtn" value="Connect"
type="button"/>
<input onclick="wsSendMessage();" id="echoBtn" value="Echo" type="button"/>
<br/>
<input onclick="startAudio();" id="startBtn" value="Start Audio" type="button"/>
</form>
<br/>
<h2>Log</h2>
<pre id="log"></pre>
<h2>Server Response</h2>
<pre id="serResp"></pre>
<script type="text/javascript">
var webSocket = null;
var audioStream = null;
var context = null;
function __log(e, data) {
if(e.type !="error"){
log.innerHTML += "\n" + e + " " + (data || '');
}
}
function serverResponse(e, data){
serResp.innerHTML = "\n" + e + " " + (data || '');
}
function wsConnect() {
webSocket = new WebSocket("ws://"+window.location.host+"/echo");
var message = document.getElementById("message");
webSocket.onopen = function (message) {
wsOpen(message);
};
webSocket.onmessage = function (message) {
wsGetMessage(message);
};
webSocket.onclose = function (message) {
wsClose(message);
};
webSocket.onerror = function (message) {
wsError(message);
};
}
function wsError(message) {
console.log(message);
__log(message);
}
function wsOpen(message) {
__log("Connected ...");
if(message.type === "open"){
document.getElementById("connectBtn").disabled = true;
document.getElementById("startBtn").disabled = false;
document.getElementById("echoBtn").disabled = false;
if(document.getElementById("stopBtn") != undefined){
document.getElementById("stopBtn").disabled = false;
}
}
}
function wsSendMessage() {
__log("From Browser: "+ message.value);
webSocket.send("echo"+message.value);
document.getElementById("message").value = "";
}
function wsCloseConnection() {
webSocket.close();
}
function wsGetMessage(message) {
console.log("Server: "+ message.data);
msg = message.data;
if(msg == "VoiceInterrupted" || msg == "MinuteDone" || msg ==
"NotRecognize" || msg == "speechInitFail"){
stopAudio();
startAudio();
}else{
serverResponse("Server: " + message.data);
}
}
function wsClose(message) {
__log("Disconnect ... ");
if(message.type === "close"){
document.getElementById("connectBtn").disabled = false;
document.getElementById("startBtn").disabled = true;
document.getElementById("echoBtn").disabled = true;
}
}
function wserror(message) {
__log("Error ..."+ message);
}
function sendStart() {
webSocket.send("start")
}
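// startAudio asks the server to start a recognition stream, requests microphone access, and begins capturing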
function startAudio() {
var session = {
audio: true,
video: false
};
sendStart();
var isMediaReady = true;
navigator.mediaDevices.getUserMedia(session).then(function (stream) {
initializeRecorder(stream);
}).catch(function (err) {
onError(err);
document.getElementById("startBtn").disabled = false;
});
document.getElementById("startBtn").disabled = true;
setTimeout(stopAudio,55*1000);
}
function onError(err) {
__log("Error while calling getUserMedia:" + err);
}
function initializeRecorder(stream) {
__log("initializeRecorder called...");
audioStream = stream;
var audioContext = window.AudioContext;
context = new audioContext();
var audioInput = context.createMediaStreamSource(stream);
var bufferSize = 1*4*1024;
// create a javascript node
var recorder = context.createScriptProcessor(bufferSize, 1, 1);
// specify the processing function
recorder.onaudioprocess = recorderProcess;
// connect stream to our recorder
audioInput.connect(recorder);
// connect our recorder to the previous destination
recorder.connect(context.destination);
}
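// convert Float32 samples from the Web Audio API to 16-bit signed little-endian PCM (LINEAR16)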
function convertFloat32ToInt16(buffer) {
var l = buffer.length;
var buf = new Int16Array(l);
while (l--) {
buf[l] = Math.min(1, buffer[l]) * 0x7FFF;
}
return buf.buffer;
}
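// send each captured audio buffer to the server as a binary WebSocket message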
function recorderProcess(e) {
var left = e.inputBuffer.getChannelData(0);
webSocket.send(convertFloat32ToInt16(left));
}
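// stopAudio closes the audio context, stops the microphone tracks, and tells the server to close the stream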
function stopAudio() {
__log("stop audio called...");
document.getElementById("startBtn").disabled = false;
context.close();
context = null;
webSocket.send("stop")
var audioTrack = audioStream.getAudioTracks();
var i = 0;
for (i = 0; i < audioTrack.length; i++) {
var track = audioTrack[i];
track.stop();
audioStream.removeTrack(track);
}
audioStream = null;
}
</script>
</body>
</html>