How to convert speech to text in iOS

Date: 2017-08-16 09:42:48

Tags: ios objective-c swift speech-recognition speech-to-text

As far as I know, Apple's native frameworks have no API for converting speech to text; we have to go with a third-party framework to do that, and those come with a lot of disadvantages, such as the way the user has to use the microphone to get speech converted to text.

I can find plenty of information about converting text to speech, but not the other way around.

I can't find any clear information about this, and most of what is out there is inconclusive.

It would be great if someone could shed some light on this!

2 Answers:

Answer 0 (score: 2):

For Objective-C, I wrote a speech recognizer class a while back that converts speech to text.

Step 1: Create the speech recognizer class

  1. Create a new Cocoa class and make it a subclass of NSObject.
  2. Name it ATSpeechRecognizer.
  3. In ATSpeechRecognizer.h:

    #import <Foundation/Foundation.h>
    #import <Speech/Speech.h>
    #import <AVFoundation/AVFoundation.h>
    
    typedef NS_ENUM(NSInteger, ATSpeechRecognizerState) {
        ATSpeechRecognizerStateRunning,
        ATSpeechRecognizerStateStopped
    };
    
    @protocol ATSpeechDelegate<NSObject>
    @required
    /*This method relays parsed text from Speech to the delegate responder class*/
    -(void)convertedSpeechToText:(NSString *) parsedText;
    /*This method relays change in Speech recognition ability to delegate responder class*/
    -(void) speechRecAvailabilityChanged:(BOOL) status;
    /*This method relays error messages to delegate responder class*/
    -(void) sendErrorInfoToViewController:(NSString *) errorMessage;
    @optional
    /*This method relays info regarding whether speech rec is running or stopped to the delegate responder class. State will be either ATSpeechRecognizerStateRunning or ATSpeechRecognizerStateStopped. You may or may not implement this method*/
    -(void) changeStateIndicator:(ATSpeechRecognizerState) state;
    @end
    
    @interface ATSpeechRecognizer : NSObject <SFSpeechRecognizerDelegate>
    
    + (ATSpeechRecognizer *)sharedObject;
    
    /*Delegate to communicate with requesting VCs*/
    @property (weak, nonatomic) id<ATSpeechDelegate> delegate;
    
    /*Class Methods*/
    -(void) toggleRecording;
    -(void) activateSpeechRecognizerWithLocaleIdentifier:(NSString *) localeIdentifier andBlock:(void (^)(BOOL isAuthorized))successBlock;
    @end
    

    In ATSpeechRecognizer.m:

    #import "ATSpeechRecognizer.h"
    
    @interface ATSpeechRecognizer ()
    
    /*This object handles the speech recognition requests. It provides an audio input to the speech recognizer.*/
    
    @property SFSpeechAudioBufferRecognitionRequest *speechAudioRecRequest;
    
    /*The recognition task that gives you the result of the recognition request. Keeping a reference to this object is handy, as it lets you cancel or stop the task. */
    
    @property SFSpeechRecognitionTask *speechRecogTask;
    
    /*This is your Speech recognizer*/
    @property SFSpeechRecognizer *speechRecognizer;
    
    /*This is your audio engine. It is responsible for providing your audio input.*/
    
    @property AVAudioEngine *audioEngine;
    
    @end
    
    @implementation ATSpeechRecognizer
    
    
    
    #pragma mark - Constants
    
    //Error Messages
    #define kErrorMessageAuthorize  @"You declined the permission to perform speech recognition. Please authorize the operation in your device settings."
    #define kErrorMessageRestricted @"Speech recognition is restricted on this device."
    #define kErrorMessageNotDetermined  @"Speech recognition isn't authorized yet"
    #define kErrorMessageAudioInputNotFound @"This device has no audio input node"
    #define kErrorMessageRequestFailed @"Unable to create an SFSpeechAudioBufferRecognitionRequest object"
    #define kErrorMessageAudioRecordingFailed   @"Unable to start Audio recording due to failure in Recording Engine"
    
    #pragma mark - Singleton methods
    
    + (ATSpeechRecognizer *)sharedObject {
        static ATSpeechRecognizer *sharedClass = nil;
        static dispatch_once_t onceToken;
        dispatch_once(&onceToken, ^{
            sharedClass = [[self alloc] init];
        });
        return sharedClass;
    }
    
    - (id)init {
        if (self = [super init]) {
    
        }
        return self;
    }
    
    #pragma mark - Recognition methods
    
    -(void) activateSpeechRecognizerWithLocaleIdentifier:(NSString *) localeIdentifier andBlock:(void (^)(BOOL isAuthorized))successBlock{
        //Pass the locale identifier of the language to recognize, e.g. @"en-US"
        if([localeIdentifier length]>0){
            NSLocale *locale = [[NSLocale alloc] initWithLocaleIdentifier:localeIdentifier];
            _speechRecognizer = [[SFSpeechRecognizer alloc] initWithLocale:locale];
            _speechRecognizer.delegate = self;
            _audioEngine = [[AVAudioEngine alloc] init];
            [self getSpeechRecognizerAuthenticationStatusWithSuccessBlock:^(BOOL isAuthorized) {
                successBlock(isAuthorized);
            }];
        }
        else{
            successBlock(NO);
        }
    
    }
    
    /*Microphone usage must be authorized in the Info.plist*/
    
    -(void) toggleRecording{
        if(_audioEngine.isRunning){
            [self stopAudioEngine];
        }
        else{
            [self startAudioEngine];
        }
    }
    
    
    #pragma mark - Internal Methods
    
    /*
     In case different buttons are used for recording and stopping, these methods should be called individually. Otherwise use -(void) toggleRecording.
     */
    
    -(void) startAudioEngine{
        if([self isDelegateValidForSelector:NSStringFromSelector(@selector(changeStateIndicator:))]){
            [_delegate changeStateIndicator:ATSpeechRecognizerStateRunning];
        }
    
        [self startRecordingSpeech];
    }
    
    -(void) stopAudioEngine{
        if([self isDelegateValidForSelector:NSStringFromSelector(@selector(changeStateIndicator:))]){
           [_delegate changeStateIndicator:ATSpeechRecognizerStateStopped];
        }
        [_audioEngine stop];
        [_speechAudioRecRequest endAudio];
        self.speechRecogTask = nil;
        self.speechAudioRecRequest = nil;
    }
    
    /*
     All the voice data is transmitted to Apple's backend for processing. Therefore, it is mandatory to get the user's authorization. Speech recognition must be authorized in the Info.plist.
     */
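    /*
     Note (added; not in the original answer): the standard Info.plist keys here are
     NSSpeechRecognitionUsageDescription (shown when speech recognition permission is
     requested) and NSMicrophoneUsageDescription (shown when microphone permission is
     requested). On iOS 10+, requesting a permission whose usage-description key is
     missing terminates the app.
     */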
    
    -(void) getSpeechRecognizerAuthenticationStatusWithSuccessBlock:(void (^)(BOOL isAuthorized))successBlock{
        [SFSpeechRecognizer requestAuthorization:^(SFSpeechRecognizerAuthorizationStatus status) {
    
            switch (status) {
                case SFSpeechRecognizerAuthorizationStatusAuthorized:
                    successBlock(YES);
                    break;
                case SFSpeechRecognizerAuthorizationStatusDenied:
                    [self sendErrorMessageToDelegate:kErrorMessageAuthorize];
                    successBlock(NO);
                    break;
                case SFSpeechRecognizerAuthorizationStatusRestricted:
                    [self sendErrorMessageToDelegate:kErrorMessageRestricted];
                    successBlock(NO);
                    break;
                case SFSpeechRecognizerAuthorizationStatusNotDetermined:
                    [self sendErrorMessageToDelegate:kErrorMessageNotDetermined];
                    successBlock(NO);
                    break;
                default:
                    break;
            }
        }];
    }
    
    -(void) startRecordingSpeech{
    
        /*
         Check if a task is already running. If so, cancel it and start anew.
         */
        if(_speechRecogTask!=nil){
            [_speechRecogTask cancel];
            _speechRecogTask = nil;
        }
    
        /*
         Prepare for the audio recording. Here we set the category of the session as recording, the mode as measurement, and activate it
         */
    
        AVAudioSession *audioSession = [AVAudioSession sharedInstance];
        NSError *sessionError = nil;
        //These calls report failures via NSError rather than exceptions, so check the error instead of using @try/@catch
        [audioSession setCategory:AVAudioSessionCategoryRecord error:&sessionError];
        [audioSession setMode:AVAudioSessionModeMeasurement error:&sessionError];
        [audioSession setActive:YES error:&sessionError];
        if(sessionError!=nil){
            [self sendErrorMessageToDelegate:sessionError.localizedDescription];
        }
    
    
        /*
         Instantiate the recognitionRequest. Here we create the SFSpeechAudioBufferRecognitionRequest object. Later, we use it to pass our audio data to Apple’s servers.
         */
        _speechAudioRecRequest = [[SFSpeechAudioBufferRecognitionRequest alloc] init];
        if(_speechAudioRecRequest==nil){
            [self sendErrorMessageToDelegate:kErrorMessageRequestFailed];
            return;
        }
    
    
        /*
         Check if the audioEngine (your device) has an audio input for recording.
         */
        if(_audioEngine.inputNode!=nil){
            AVAudioInputNode *inputNode = _audioEngine.inputNode;
    
            /*If true, partial (non-final) results for each utterance will be reported.
             Default is true*/
            _speechAudioRecRequest.shouldReportPartialResults = YES;
    
            /*Start the recognition by calling the recognitionTask method of our speechRecognizer. This method takes a result handler, which is called every time the recognition engine has received input, has refined its current recognition, or has been canceled or stopped. It delivers partial transcriptions as it goes and a final transcription when processing completes.*/
    
            _speechRecogTask = [_speechRecognizer recognitionTaskWithRequest:_speechAudioRecRequest resultHandler:^(SFSpeechRecognitionResult * _Nullable result, NSError * _Nullable error) {
    
                BOOL isFinal = NO;
                if(result!=nil){
                    if([self isDelegateValidForSelector:NSStringFromSelector(@selector(convertedSpeechToText:))]){
                        [_delegate convertedSpeechToText:[[result bestTranscription] formattedString]];
                    }
                    isFinal = [result isFinal]; //True if the hypotheses will not change; speech processing is complete.
    
                }
    
            //If an error occurred or recognition is complete, end the session.
                if(error!=nil || isFinal){
                    [_audioEngine stop];
                    [inputNode removeTapOnBus:0];
                    self.speechRecogTask = nil;
                    self.speechAudioRecRequest = nil;
                    if(error!=nil){
                        [self stopAudioEngine];
                        [self sendErrorMessageToDelegate:[NSString stringWithFormat:@"%li - %@",error.code, error.localizedDescription]];
    
                    }
                }
    
            }];
    
            /* Add an audio input to the recognitionRequest. Note that it is ok to add the audio input after starting the recognitionTask. The Speech Framework will start recognizing as soon as an audio input has been added.*/
    
            AVAudioFormat *recordingFormat = [inputNode outputFormatForBus:0];
            [inputNode installTapOnBus:0 bufferSize:1024 format:recordingFormat block:^(AVAudioPCMBuffer * _Nonnull buffer, AVAudioTime * _Nonnull when) {
                [self.speechAudioRecRequest appendAudioPCMBuffer:buffer];
            }];
    
            /*Prepare and start the audioEngine.*/
            [_audioEngine prepare];
            NSError *engineError = nil;
            if(![_audioEngine startAndReturnError:&engineError]){
                [self sendErrorMessageToDelegate:kErrorMessageAudioRecordingFailed];
            }
    
        }
        else{
            [self sendErrorMessageToDelegate:kErrorMessageAudioInputNotFound];
        }
    
    
    }
    
    -(BOOL) isDelegateValidForSelector:(NSString*)selectorName{
        if(_delegate!=nil && [_delegate respondsToSelector:NSSelectorFromString(selectorName)]){
            return YES;
        }
        return NO;
    }
    
    -(void) sendErrorMessageToDelegate:(NSString*) errorMessage{
        if([self isDelegateValidForSelector:NSStringFromSelector(@selector(sendErrorInfoToViewController:))]){
            [_delegate sendErrorInfoToViewController:errorMessage];
        }
    }
    
    #pragma mark - Speech Recognizer Delegate Methods
    
    -(void) speechRecognizer:(SFSpeechRecognizer *)speechRecognizer availabilityDidChange:(BOOL)available{
        if(!available){
            [self stopAudioEngine];
        }
        [_delegate speechRecAvailabilityChanged:available];
    }
    
    @end
    

    That's it. You can now use this class anywhere, in any project where you want to convert speech to text. If you are confused about how it works, be sure to read the comments in the code.

    Step 2: Set up the ATSpeechRecognizer class in your VC

    Import ATSpeechRecognizer in your View Controller and set up the delegate as follows:

    #import "ATSpeechRecognizer.h"
    @interface ViewController : UIViewController <ATSpeechDelegate>{
        BOOL isRecAllowed;
    }
    @end
    

    Set it up and run it from the VC's viewDidLoad using the following method:

    -(void) setUpSpeechRecognizerService{
        [ATSpeechRecognizer sharedObject].delegate = self;
        [[ATSpeechRecognizer sharedObject] activateSpeechRecognizerWithLocaleIdentifier:@"en-US" andBlock:^(BOOL isAuthorized) {
            isRecAllowed = isAuthorized; /*Is operation allowed or not?*/
        }];
    }
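
    For completeness, a minimal viewDidLoad wiring (a sketch; it only assumes the setup method above) could look like:

    - (void)viewDidLoad {
        [super viewDidLoad];
        [self setUpSpeechRecognizerService];
    }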
    

    Now implement the delegate methods:

    #pragma mark - Speech Recog Delegates
    
    -(void) convertedSpeechToText:(NSString *)parsedText{
        if(parsedText!=nil){
            _txtView.text = parsedText; //You got Text from voice. Use it as you want
        }
    
    }
    
    -(void) speechRecAvailabilityChanged:(BOOL)status{
        isRecAllowed = status; //Status of Conversion ability has changed. Use Status flag to allow/stop operations
    }
    
    -(void) changeStateIndicator:(ATSpeechRecognizerState) state{
        if(state==ATSpeechRecognizerStateStopped){
            //Speech Recognizer is Stopped
            _lblState.text = @"Stopped";
    
        }
        else{
            //Speech Recognizer is running
            _lblState.text = @"Running";
        }
        _txtView.text = @"";
    }
    
    -(void) sendErrorInfoToViewController:(NSString *)errorMessage{
        [self showPopUpForErrorMessage:errorMessage]; /*Some error occurred. Show it to the user*/
    }
    

    Start converting speech to text:

    - (IBAction)btnRecordTapped:(id)sender {
        if(!isRecAllowed){
            [self showPopUpForErrorMessage:@"Speech recognition is either not authorized or available for this device. Please authorize the operation or upgrade to latest iOS. If you have done all this, check your internet connectivity"];
        }
        else{
            [[ATSpeechRecognizer sharedObject] toggleRecording]; /*If speech Recognizer is running, it will turn it off. if it is off, it will set it on*/
    
            /*
             If you want to do it manually, call the startAudioEngine and stopAudioEngine methods explicitly instead of toggleRecording (you will need to expose them in ATSpeechRecognizer.h first).
             */
        }
    
    }
    

    That's it. All the further explanation you need is in the code comments. Let me know if you need anything clarified.
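
    The showPopUpForErrorMessage: helper used above is not shown in the original code; a minimal sketch using UIAlertController (the method name and wording are illustrative) could be:

    -(void) showPopUpForErrorMessage:(NSString *)errorMessage{
        UIAlertController *alert = [UIAlertController alertControllerWithTitle:@"Error"
                                                                       message:errorMessage
                                                                preferredStyle:UIAlertControllerStyleAlert];
        [alert addAction:[UIAlertAction actionWithTitle:@"OK" style:UIAlertActionStyleDefault handler:nil]];
        [self presentViewController:alert animated:YES completion:nil];
    }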

Answer 1 (score: 1):

Here is the complete code for the same, in Swift:

import UIKit
import Speech

public class ViewController: UIViewController, SFSpeechRecognizerDelegate {
    // MARK: Properties

    private let speechRecognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US"))!

    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?

    private var recognitionTask: SFSpeechRecognitionTask?

    private let audioEngine = AVAudioEngine()

    @IBOutlet var textView : UITextView!

    @IBOutlet var recordButton : UIButton!

    // MARK: UIViewController

    public override func viewDidLoad() {
        super.viewDidLoad()

        // Disable the record buttons until authorization has been granted.
        recordButton.isEnabled = false
    }

    override public func viewDidAppear(_ animated: Bool) {
        speechRecognizer.delegate = self

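        // Note (added; not in the original answer): on iOS 10+ the Info.plist must
        // contain NSSpeechRecognitionUsageDescription (for the authorization request
        // below) and NSMicrophoneUsageDescription (for microphone recording);
        // without them the app terminates when permission is requested.
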
        SFSpeechRecognizer.requestAuthorization { authStatus in
            /*
                The callback may not be called on the main thread. Add an
                operation to the main queue to update the record button's state.
            */
            OperationQueue.main.addOperation {
                switch authStatus {
                    case .authorized:
                        self.recordButton.isEnabled = true

                    case .denied:
                        self.recordButton.isEnabled = false
                        self.recordButton.setTitle("User denied access to speech recognition", for: .disabled)

                    case .restricted:
                        self.recordButton.isEnabled = false
                        self.recordButton.setTitle("Speech recognition restricted on this device", for: .disabled)

                    case .notDetermined:
                        self.recordButton.isEnabled = false
                        self.recordButton.setTitle("Speech recognition not yet authorized", for: .disabled)
                }
            }
        }
    }

    private func startRecording() throws {

        // Cancel the previous task if it's running.
        if let recognitionTask = recognitionTask {
            recognitionTask.cancel()
            self.recognitionTask = nil
        }

        let audioSession = AVAudioSession.sharedInstance()
        try audioSession.setCategory(AVAudioSessionCategoryRecord)
        try audioSession.setMode(AVAudioSessionModeMeasurement)
        try audioSession.setActive(true, with: .notifyOthersOnDeactivation)

        recognitionRequest = SFSpeechAudioBufferRecognitionRequest()

        guard let inputNode = audioEngine.inputNode else { fatalError("Audio engine has no input node") }
        guard let recognitionRequest = recognitionRequest else { fatalError("Unable to create an SFSpeechAudioBufferRecognitionRequest object") }

        // Configure request so that results are returned before audio recording is finished
        recognitionRequest.shouldReportPartialResults = true

        // A recognition task represents a speech recognition session.
        // We keep a reference to the task so that it can be cancelled.
        recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest) { result, error in
            var isFinal = false

            if let result = result {
                self.textView.text = result.bestTranscription.formattedString
                isFinal = result.isFinal
            }

            if error != nil || isFinal {
                self.audioEngine.stop()
                inputNode.removeTap(onBus: 0)

                self.recognitionRequest = nil
                self.recognitionTask = nil

                self.recordButton.isEnabled = true
                self.recordButton.setTitle("Start Recording", for: [])
            }
        }

        let recordingFormat = inputNode.outputFormat(forBus: 0)
        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { (buffer: AVAudioPCMBuffer, when: AVAudioTime) in
            self.recognitionRequest?.append(buffer)
        }

        audioEngine.prepare()

        try audioEngine.start()

        textView.text = "(Go ahead, I'm listening)"
    }

    // MARK: SFSpeechRecognizerDelegate

    public func speechRecognizer(_ speechRecognizer: SFSpeechRecognizer, availabilityDidChange available: Bool) {
        if available {
            recordButton.isEnabled = true
            recordButton.setTitle("Start Recording", for: [])
        } else {
            recordButton.isEnabled = false
            recordButton.setTitle("Recognition not available", for: .disabled)
        }
    }

    // MARK: Interface Builder actions

    @IBAction func recordButtonTapped() {
        if audioEngine.isRunning {
            audioEngine.stop()
            recognitionRequest?.endAudio()
            recordButton.isEnabled = false
            recordButton.setTitle("Stopping", for: .disabled)
        } else {
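            // Note (added): try! will crash if startRecording() throws (e.g. if the
            // audio session cannot be configured); production code should wrap this
            // in do/catch instead.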
            try! startRecording()
            recordButton.setTitle("Stop recording", for: [])
        }
    }
}