请帮我解决这个问题。我花了将近一周的时间尝试解决它,但始终没有成功。下面是我的日志,其中的识别结果与演示中的结果相同。这是我构建 RecognizeOptions 的代码:
/**
 * Builds the recognition request options for the given audio stream.
 *
 * <p>The request is configured with the MP3 content type and the
 * {@code en-US_BroadbandModel} model, and turns on timestamps, interim results,
 * smart formatting, speaker labels and word confidence.
 *
 * @param captureStream the audio stream handed to the builder's {@code audio(...)} option
 * @return the built {@link RecognizeOptions}
 */
private RecognizeOptions getRecognizeOptions(InputStream captureStream) {
    RecognizeOptions.Builder optionsBuilder = new RecognizeOptions.Builder()
            // What is sent and how it is decoded.
            .audio(captureStream)
            .contentType(HttpMediaType.AUDIO_MP3)
            .model("en-US_BroadbandModel")
            // Session behaviour.
            .interimResults(true)
            .inactivityTimeout(2000)
            // Extra detail requested in the results.
            .timestamps(true)
            .speakerLabels(true)
            .smartFormatting(true)
            .wordConfidence(true)
            .wordAlternativesThreshold(0.01f);
    return optionsBuilder.build();
}
这是我显示说话者和单词的代码
private class MicrophoneRecognizeDelegate extends BaseRecognizeCallback {
/**
 * Handles one transcription message from the service.
 *
 * <p>Prints the raw message, feeds it to a {@code RecoTokens} accumulator, shows
 * every speaker label contained in the message (one per line), and shows the
 * transcript of the first alternative of the first result.
 *
 * @param speechResults the message delivered by the recognize callback
 */
@Override
public void onTranscription(SpeechRecognitionResults speechResults) {
    System.out.println(speechResults);

    // NOTE(review): this accumulator lives only for the duration of one callback,
    // so words and speaker labels that arrive in different messages are never
    // combined. Hoist it into a field of the enclosing class to keep state
    // across messages.
    SpeakerLabelsDiarization.RecoTokens recoTokens = new SpeakerLabelsDiarization.RecoTokens();

    if (speechResults.getSpeakerLabels() != null) {
        recoTokens.add(speechResults);

        // Report every label in the message, not just the first one. For a
        // message with a single label the text is the same as before.
        StringBuilder speakers = new StringBuilder();
        for (int i = 0; i < speechResults.getSpeakerLabels().size(); i++) {
            if (speakers.length() > 0) {
                speakers.append('\n');
            }
            speakers.append("Speaker ")
                    .append(speechResults.getSpeakerLabels().get(i).getSpeaker().toString());
        }

        // An empty label list previously threw on get(0); now it is simply skipped.
        if (speakers.length() > 0) {
            String speaks = speakers.toString();
            showMicText(speaks);
            System.out.println(speaks);
        }
    }

    if (speechResults.getResults() != null
            && !speechResults.getResults().isEmpty()
            && speechResults.getResults().get(0).getAlternatives() != null
            && !speechResults.getResults().get(0).getAlternatives().isEmpty()) {
        String text = speechResults.getResults().get(0).getAlternatives().get(0).getTranscript();
        showMicText1(text);
    }
}
关于 SpeakerLabelsDiarization.RecoTokens recoTokens = new SpeakerLabelsDiarization.RecoTokens(); 这一行,这是我从 GitHub 上的 WatBot 示例中得到的代码:
package com.example.ezminute.activities;
import com.ibm.watson.developer_cloud.speech_to_text.v1.model.RecognizeOptions;
import com.ibm.watson.developer_cloud.speech_to_text.v1.model.SpeakerLabelsResult;
import com.ibm.watson.developer_cloud.speech_to_text.v1.model.SpeechRecognitionAlternative;
import com.ibm.watson.developer_cloud.speech_to_text.v1.model.SpeechRecognitionResult;
import com.ibm.watson.developer_cloud.speech_to_text.v1.model.SpeechRecognitionResults;
import com.ibm.watson.developer_cloud.speech_to_text.v1.model.SpeechTimestamp;
import com.ibm.watson.developer_cloud.speech_to_text.v1.websocket.BaseRecognizeCallback;
import com.ibm.watson.developer_cloud.util.GsonSingleton;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
/**
 * Combines word timestamps and speaker labels from speech recognition results
 * into per-speaker utterances.
 *
 * <p>Words and speaker labels are matched on their start time. Whenever a final
 * speaker label is added, the accumulated utterances are printed as JSON and the
 * tokens covered by that label are discarded.
 */
public class SpeakerLabelsDiarization {

    /**
     * Converts a {@code Float} time to the {@code Double} with the same decimal text.
     *
     * <p>A plain widening conversion turns {@code 2.43f} into
     * {@code 2.4300000667572021}, which is not equal to the {@code Double}
     * {@code 2.43} carried by a word timestamp. Going through the decimal string
     * keeps speaker-label times and word times comparable.
     *
     * @param value the time to convert; must not be {@code null}
     * @return the equivalent {@code Double}
     */
    private static Double toExactDouble(Float value) {
        return Double.valueOf(value.toString());
    }

    /**
     * One recognized token: a start/end time plus the word and/or the speaker
     * known for it so far.
     */
    public static class RecoToken {
        private Double startTime;
        private Double endTime;
        private Long speaker;
        private String word;
        // Initialised so that reading the flag can never unbox null.
        private Boolean spLabelIsFinal = Boolean.FALSE;

        /**
         * Instantiates a new reco token from a word timestamp.
         *
         * @param speechTimestamp the speech timestamp
         */
        RecoToken(SpeechTimestamp speechTimestamp) {
            startTime = speechTimestamp.getStartTime();
            endTime = speechTimestamp.getEndTime();
            word = speechTimestamp.getWord();
        }

        /**
         * Instantiates a new reco token from a speaker label.
         *
         * @param speakerLabel the speaker label
         */
        RecoToken(SpeakerLabelsResult speakerLabel) {
            startTime = toExactDouble(speakerLabel.getFrom());
            endTime = toExactDouble(speakerLabel.getTo());
            speaker = speakerLabel.getSpeaker();
        }

        /**
         * Updates the word of this token.
         *
         * @param speechTimestamp the speech timestamp
         */
        public void updateFrom(SpeechTimestamp speechTimestamp) {
            word = speechTimestamp.getWord();
        }

        /**
         * Updates the speaker of this token.
         *
         * @param speakerLabel the speaker label
         */
        public void updateFrom(SpeakerLabelsResult speakerLabel) {
            speaker = speakerLabel.getSpeaker();
        }
    }

    /**
     * A run of consecutive words attributed to the same speaker.
     */
    public static class Utterance {
        private Integer speaker;
        private String transcript;

        /**
         * Instantiates a new utterance.
         *
         * @param speaker the speaker
         * @param transcript the transcript
         */
        public Utterance(final Integer speaker, final String transcript) {
            this.speaker = speaker;
            this.transcript = transcript;
        }
    }

    /**
     * Accumulates {@link RecoToken}s keyed by start time, in insertion order.
     */
    public static class RecoTokens {
        private Map<Double, RecoToken> recoTokenMap;

        /**
         * Instantiates a new, empty token collection.
         */
        public RecoTokens() {
            recoTokenMap = new LinkedHashMap<Double, RecoToken>();
        }

        /**
         * Adds the word timestamps of every final result, then every speaker
         * label, contained in the given results.
         *
         * @param speechResults the speech results
         */
        public void add(SpeechRecognitionResults speechResults) {
            if (speechResults.getResults() != null) {
                for (SpeechRecognitionResult transcript : speechResults.getResults()) {
                    // Only final results contribute words; a missing flag counts as not final.
                    if (!Boolean.TRUE.equals(transcript.isFinalResults())) {
                        continue;
                    }
                    if (transcript.getAlternatives() == null || transcript.getAlternatives().isEmpty()) {
                        continue;
                    }
                    SpeechRecognitionAlternative speechAlternative = transcript.getAlternatives().get(0);
                    if (speechAlternative.getTimestamps() == null) {
                        continue;
                    }
                    for (SpeechTimestamp speechTimestamp : speechAlternative.getTimestamps()) {
                        add(speechTimestamp);
                    }
                }
            }
            if (speechResults.getSpeakerLabels() != null) {
                for (SpeakerLabelsResult speakerLabel : speechResults.getSpeakerLabels()) {
                    add(speakerLabel);
                }
            }
        }

        /**
         * Adds a word timestamp, creating the token for its start time or
         * updating the word of the existing one.
         *
         * @param speechTimestamp the speech timestamp
         */
        public void add(SpeechTimestamp speechTimestamp) {
            RecoToken recoToken = recoTokenMap.get(speechTimestamp.getStartTime());
            if (recoToken == null) {
                recoToken = new RecoToken(speechTimestamp);
                recoTokenMap.put(speechTimestamp.getStartTime(), recoToken);
            } else {
                recoToken.updateFrom(speechTimestamp);
            }
        }

        /**
         * Adds a speaker label, creating the token for its start time or updating
         * the speaker of the existing one. A final label triggers a report and
         * removal of the tokens it covers.
         *
         * @param speakerLabel the speaker label
         */
        public void add(SpeakerLabelsResult speakerLabel) {
            // The map is keyed by Double; looking it up with the raw Float would never match.
            Double key = toExactDouble(speakerLabel.getFrom());
            RecoToken recoToken = recoTokenMap.get(key);
            if (recoToken == null) {
                recoToken = new RecoToken(speakerLabel);
                recoTokenMap.put(key, recoToken);
            } else {
                recoToken.updateFrom(speakerLabel);
            }
            if (Boolean.TRUE.equals(speakerLabel.isFinalResults())) {
                markTokensBeforeAsFinal(speakerLabel.getFrom());
                report();
                cleanFinal();
            }
        }

        /**
         * Flags every stored token that starts at or before the given time as final.
         *
         * @param from the start time of the final speaker label
         */
        private void markTokensBeforeAsFinal(Float from) {
            double limit = toExactDouble(from);
            // Iterate the field itself; a fresh local map here would leave every token unmarked.
            for (RecoToken rt : recoTokenMap.values()) {
                if (rt.startTime != null && rt.startTime <= limit) {
                    rt.spLabelIsFinal = Boolean.TRUE;
                }
            }
        }

        /**
         * Groups the stored tokens into utterances, one per run of consecutive
         * tokens with the same speaker, and prints them as JSON to standard output.
         *
         * <p>Tokens that do not yet have both a speaker and a word are left out.
         */
        public void report() {
            List<Utterance> utterances = new ArrayList<Utterance>();
            Utterance currentUtterance = null;
            for (RecoToken rt : recoTokenMap.values()) {
                if (rt.speaker == null || rt.word == null) {
                    continue;
                }
                int speaker = Math.toIntExact(rt.speaker);
                // Start a new utterance on the first token and on every speaker change.
                if (currentUtterance == null || currentUtterance.speaker != speaker) {
                    currentUtterance = new Utterance(speaker, "");
                    utterances.add(currentUtterance);
                }
                currentUtterance.transcript = String.format("%s%s ", currentUtterance.transcript, rt.word);
            }
            String result = GsonSingleton.getGson().toJson(utterances);
            System.out.println(result);
        }

        /**
         * Removes every token flagged as final.
         */
        private void cleanFinal() {
            // removeIf avoids modifying the map while a for-each loop is iterating it.
            recoTokenMap.values().removeIf(rt -> Boolean.TRUE.equals(rt.spLabelIsFinal));
        }
    }

    // Not referenced anywhere in this class as shown.
    private static CountDownLatch lock = new CountDownLatch(1);
}
这是我的结果
{
2019-03-03 23:21:58.335 27985-28793/com.example.ezminute I/System.out: "speaker_labels": [
2019-03-03 23:21:58.335 27985-28793/com.example.ezminute I/System.out: {
2019-03-03 23:21:58.335 27985-28793/com.example.ezminute I/System.out: "confidence": 0.605,
2019-03-03 23:21:58.335 27985-28793/com.example.ezminute I/System.out: "final": false,
2019-03-03 23:21:58.335 27985-28793/com.example.ezminute I/System.out: "from": 2.43,
2019-03-03 23:21:58.335 27985-28793/com.example.ezminute I/System.out: "speaker": 0,
2019-03-03 23:21:58.335 27985-28793/com.example.ezminute I/System.out: "to": 2.93
2019-03-03 23:21:58.335 27985-28793/com.example.ezminute I/System.out: },
2019-03-03 23:21:58.335 27985-28793/com.example.ezminute I/System.out: {
2019-03-03 23:21:58.335 27985-28793/com.example.ezminute I/System.out: "confidence": 0.667,
2019-03-03 23:21:58.335 27985-28793/com.example.ezminute I/System.out: "final": false,
2019-03-03 23:21:58.335 27985-28793/com.example.ezminute I/System.out: "from": 3.3,
2019-03-03 23:21:58.336 27985-28793/com.example.ezminute I/System.out: "speaker": 1,
2019-03-03 23:21:58.336 27985-28793/com.example.ezminute I/System.out: "to": 3.82
2019-03-03 23:21:58.336 27985-28793/com.example.ezminute I/System.out: },
2019-03-03 23:21:58.336 27985-28793/com.example.ezminute I/System.out: {
2019-03-03 23:21:58.336 27985-28793/com.example.ezminute I/System.out: "confidence": 0.579,
2019-03-03 23:21:58.336 27985-28793/com.example.ezminute I/System.out: "final": false,
2019-03-03 23:21:58.336 27985-28793/com.example.ezminute I/System.out: "from": 4.15,
2019-03-03 23:21:58.336 27985-28793/com.example.ezminute I/System.out: "speaker": 0,
2019-03-03 23:21:58.336 27985-28793/com.example.ezminute I/System.out: "to": 4.69
2019-03-03 23:21:58.336 27985-28793/com.example.ezminute I/System.out: }
2019-03-03 23:21:58.336 27985-28793/com.example.ezminute I/System.out: ]
2019-03-03 23:21:58.336 27985-28793/com.example.ezminute I/System.out: }
2019-03-03 23:21:58.340 4354-4354/? D/io_stats: !@ 179,0 r 137002 5177089 w 48914 1018780 d 7995 317604 f 12673 12674 iot 105310 98590 th 51200 0 0 pt 0 inp 0 0 3948.151
2019-03-03 23:21:58.345 27985-28793/com.example.ezminute I/System.out: {
2019-03-03 23:21:58.345 27985-28793/com.example.ezminute I/System.out: "speaker_labels": [
2019-03-03 23:21:58.345 27985-28793/com.example.ezminute I/System.out: {
2019-03-03 23:21:58.346 27985-28793/com.example.ezminute I/System.out: "confidence": 0.579,
2019-03-03 23:21:58.346 27985-28793/com.example.ezminute I/System.out: "final": true,
2019-03-03 23:21:58.346 27985-28793/com.example.ezminute I/System.out: "from": 4.15,
2019-03-03 23:21:58.346 27985-28793/com.example.ezminute I/System.out: "speaker": 0,
2019-03-03 23:21:58.346 27985-28793/com.example.ezminute I/System.out: "to": 4.69
2019-03-03 23:21:58.346 27985-28793/com.example.ezminute I/System.out: }
2019-03-03 23:21:58.346 27985-28793/com.example.ezminute I/System.out: ]
2019-03-03 23:21:58.346 27985-28793/com.example.ezminute I/System.out: }
这是演示中的结果
{
"speaker_labels": [
{
"from": 2.43,
"to": 2.93,
"speaker": 0,
"confidence": 0.605,
"final": false
},
{
"from": 3.3,
"to": 3.82,
"speaker": 1,
"confidence": 0.667,
"final": false
},
{
"from": 4.15,
"to": 4.69,
"speaker": 0,
"confidence": 0.579,
"final": false
}
]
}
{
"speaker_labels": [
{
"from": 4.15,
"to": 4.69,
"speaker": 0,
"confidence": 0.579,
"final": true
}
]
}
这是我的输出
Speaker0: hello hi hello
这是演示中的输出
Speaker 0:
Hello.
Speaker 1:
Hi.
Speaker 0:
Hello.
请帮帮我,我恳求大家,请帮帮我 :( :( :( 我不知道该怎么办了。我已经尽了全力,但还是不够,我需要一切可能的帮助,请帮帮我 :( :( :(
答案 0 :(得分:1)
对于这么短的音频,我猜测 onTranscription
只被调用了一次。
我认为您的意图是利用SpeakerLabelsDiarization.RecoTokens来确定谁说什么以及何时说,但您并未使用它。您要做的只是实例化
SpeakerLabelsDiarization.RecoTokens recoTokens = new SpeakerLabelsDiarization.RecoTokens();
并添加STT响应
recoTokens.add(speechResults);
,但是当 onTranscription
结束时,recoTokens
就超出了作用域并被丢弃,这使得整个操作毫无意义。
您的输出来自
String speaks = "Speaker " + speechResults.getSpeakerLabels().get(0).getSpeaker().toString();
这直接来自您从 STT 服务获得的响应。 get(0)
只会取出数组中的第 0 个(即第 1 个)元素并打印它。由于没有进行迭代,所以只会打印这一个元素。
如果要查看所有说话人标签,则需要一个 for 循环来遍历响应中的所有说话人标签。
您输出的转录文本是完整的转录文本,并没有按说话人标签或时间进行拆分。
String text = speechResults.getResults().get(0).getAlternatives().get(0).getTranscript();
我认为您想使用您设置的recoTokens
,在进行简短检查后,您将使用
recoTokens.uttterances()