语音识别器在几分钟后失败

4
我正在开发一个iOS项目,使用了SFSpeechRecognizer,一开始它能够正常工作。我说出一些单词,它会做出响应。但是在一两分钟后,它就会失败。它不会给出任何已识别结果的反馈。
我想知道这是否与缓冲区有关,但我不知道该如何解决。
我基本上使用了SpeechRecognizer的演示程序来构建项目。不同之处在于,我将识别出的结果逐个存储在数组中。然后程序分析数组并对某些预先设置的单词(如“播放”或其他命令)做出响应。程序响应命令后,删除此元素。
废话少说,这是代码:
  1. The recognizer, you can see the supportedCommands array that filter some specific words for the program to respond. The other parts are similar to the demo at https://developer.apple.com/library/content/samplecode/SpeakToMe/Listings/SpeakToMe_ViewController_swift.html#//apple_ref/doc/uid/TP40017110-SpeakToMe_ViewController_swift-DontLinkElementID_6

    class SpeechRecognizer: NSObject, SFSpeechRecognizerDelegate {
    
        private var speechRecognizer: SFSpeechRecognizer!
        private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest!
        private var recognitionTask: SFSpeechRecognitionTask!
        private let audioEngine = AVAudioEngine()
        private let locale = Locale(identifier: "en-US")
    
        private var lastSavedString: String = ""
        private let supportedCommands = ["more", "play"]
    
        var speechInputQueue: [String] = [String]()
    
        func load() {
            print("load")
            prepareRecognizer(locale: locale)
    
            authorize()
        }
    
        func start() {
            print("start")
            if !audioEngine.isRunning {
                try! startRecording()
            }
        }
    
        func stop() {
            if audioEngine.isRunning {
                audioEngine.stop()
                recognitionRequest?.endAudio()
    
            }
        }
    
        private func authorize() {
            SFSpeechRecognizer.requestAuthorization { authStatus in
                OperationQueue.main.addOperation {
                    switch authStatus {
                    case .authorized:
                        print("Authorized!")
                    case .denied:
                        print("Unauthorized!")
                    case .restricted:
                        print("Unauthorized!")
                    case .notDetermined:
                        print("Unauthorized!")
                    }
                }
            }
        }
    
        private func prepareRecognizer(locale: Locale) {
            speechRecognizer = SFSpeechRecognizer(locale: locale)!
            speechRecognizer.delegate = self
        }
    
        private func startRecording() throws {
    
            // Cancel the previous task if it's running.
            if let recognitionTask = recognitionTask {
                recognitionTask.cancel()
                self.recognitionTask = nil
            }
    
            let audioSession = AVAudioSession.sharedInstance()
            try audioSession.setCategory(AVAudioSessionCategoryPlayAndRecord, with: .defaultToSpeaker)
            try audioSession.setMode(AVAudioSessionModeDefault)
            try audioSession.setActive(true, with: .notifyOthersOnDeactivation)
    
            recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
    
            let inputNode = audioEngine.inputNode
            guard let recognitionRequest = recognitionRequest else { fatalError("Unable to created a SFSpeechAudioBufferRecognitionRequest object") }
    
            // Configure request so that results are returned before audio recording is finished
            recognitionRequest.shouldReportPartialResults = true
    
            // A recognition task represents a speech recognition session.
            // We keep a reference to the task so that it can be cancelled.
            recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest) { result, error in
                var isFinal = false
    
                if let result = result {
    
                    let temp = result.bestTranscription.formattedString.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines).lowercased()
                    //print("temp", temp)
                    if temp != self.lastSavedString && temp.count > self.lastSavedString.count {
    
                        var tempSplit = temp.split(separator: " ")
                        var lastSplit = self.lastSavedString.split(separator: " ")
                        while lastSplit.count > 0 {
                            if String(tempSplit[0]) == String(lastSplit[0]) {
                                tempSplit.remove(at: 0)
                                lastSplit.remove(at: 0)
                            }
                            else {
                                break
                            }
                        }
    
                        for command in tempSplit {
                            if self.supportedCommands.contains(String(command)) {
                                self.speechInputQueue.append(String(command))
                            }
                        }
                        self.lastSavedString = temp
    
                    }
                    isFinal = result.isFinal
                }
    
                if error != nil || isFinal {
                    self.audioEngine.stop()
                    inputNode.removeTap(onBus: 0)
                    self.recognitionRequest = nil
                    self.recognitionTask = nil
                }
            }
    
            let recordingFormat = inputNode.outputFormat(forBus: 0)
            inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { (buffer: AVAudioPCMBuffer, when: AVAudioTime) in
                self.recognitionRequest?.append(buffer)
            }
    
            audioEngine.prepare()
    
            try audioEngine.start()
    
        }
    }
    
  2. How we use it:

        if self.speechRecognizer.speechInputQueue.count > 0 {
        if self.speechRecognizer.speechInputQueue[0] == "more" {
            print("temp", temp)
            print("content", content)
           // isSpeakingContent = true
            self.textToSpeech(text: content)
        }
        else if self.speechRecognizer.speechInputQueue[0] == "play" {
            print("try to play")
            let soundURL = URL(fileURLWithPath: Bundle.main.path(forResource: "cascade", ofType: "wav")!)
    
            do {
                audioPlayer = try AVAudioPlayer(contentsOf: soundURL)
            }
            catch {
                print(error)
            }
            audioPlayer.prepareToPlay()
            audioPlayer.play()
        }
        else {
            self.textToSpeech(text: "unrecognized command")
        }
        self.speechRecognizer.speechInputQueue.remove(at: 0)
        print("after :", self.speechRecognizer.speechInputQueue)
    }
    

它会响应某些命令并播放一些音频。

缓冲区有问题吗?也许在一两分钟的识别后,缓冲区就已经满了?识别器随着时间的推移而失败。

1个回答

1

从音频文件中识别语音时怎么办?我能无限延长持续时间吗? - daniel
嗨@daniel,我也处于同样的情况。我试图找到一个解决方案和想法,我们可以将我们的mp3音频剪辑成许多1分钟的片段 - 对我来说,这听起来是一个好主意。 - undefined

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接