语音识别器在几分钟后失败

Question

语音识别器在几分钟后失败

ios · swift · speech-recognition · speech-to-text · sfspeechrecognizer

4

我正在开发一个iOS项目，使用了SFSpeechRecognizer，一开始它能够正常工作。我说出一些单词，它会做出响应。但是在一两分钟后，它就会失败。它不会给出任何已识别结果的反馈。

我想知道这是否与缓冲区有关，但我不知道该如何解决。

我基本上使用了SpeechRecognizer的演示程序来构建项目。不同之处在于，我将识别出的结果逐个存储在数组中。然后程序分析数组并对某些预先设置的单词（如“播放”或其他命令）做出响应。程序响应命令后，删除此元素。

废话少说，这是代码:

This is the recognizer. Note the supportedCommands array, which filters for the specific words the program should respond to. The other parts are similar to the demo at https://developer.apple.com/library/content/samplecode/SpeakToMe/Listings/SpeakToMe_ViewController_swift.html#//apple_ref/doc/uid/TP40017110-SpeakToMe_ViewController_swift-DontLinkElementID_6

/// Transcribes microphone audio with `SFSpeechRecognizer` and appends every newly
/// heard word that appears in `supportedCommands` to `speechInputQueue`.
///
/// Usage: call `load()` once, then `start()`. A recognition task ends on its own
/// when it reports an error or a final result (the WWDC 2016 Session 509 talk
/// mentions an audio limit of about one minute per task on iOS 10). When that
/// happens the audio engine is stopped, so `start()` must be called again to
/// keep listening; each call begins a fresh task with a clean transcript state.
class SpeechRecognizer: NSObject, SFSpeechRecognizerDelegate {

    private var speechRecognizer: SFSpeechRecognizer!
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest!
    private var recognitionTask: SFSpeechRecognitionTask!
    private let audioEngine = AVAudioEngine()
    private let locale = Locale(identifier: "en-US")

    /// Normalized transcript seen in the most recent callback of the current task.
    private var lastSavedString: String = ""
    private let supportedCommands = ["more", "play"]

    /// Incremented for every recording session so that late callbacks from a
    /// cancelled task cannot tear down the session that replaced it.
    private var sessionID = 0

    /// Recognized commands in the order they were heard; consumers remove entries
    /// once they have handled them.
    var speechInputQueue: [String] = []

    /// Creates the recognizer for the configured locale and asks for authorization.
    func load() {
        print("load")
        prepareRecognizer(locale: locale)

        authorize()
    }

    /// Starts a new recognition session unless the audio engine is already running.
    /// Failures are logged instead of crashing the app.
    func start() {
        print("start")
        guard !audioEngine.isRunning else { return }
        guard speechRecognizer != nil else {
            print("Speech recognizer is not available; call load() first")
            return
        }
        do {
            try startRecording()
        } catch {
            print("Unable to start recording:", error)
        }
    }

    /// Stops capturing audio and tells the current request that no more audio
    /// will arrive.
    func stop() {
        if audioEngine.isRunning {
            audioEngine.stop()
            recognitionRequest?.endAudio()
        }
    }

    /// Requests speech-recognition authorization and logs the outcome on the main queue.
    private func authorize() {
        SFSpeechRecognizer.requestAuthorization { authStatus in
            OperationQueue.main.addOperation {
                switch authStatus {
                case .authorized:
                    print("Authorized!")
                case .denied:
                    print("Unauthorized!")
                case .restricted:
                    print("Unauthorized!")
                case .notDetermined:
                    print("Unauthorized!")
                }
            }
        }
    }

    /// Creates the recognizer for `locale`. `SFSpeechRecognizer(locale:)` is a
    /// failable initializer, so failure is logged rather than force-unwrapped.
    private func prepareRecognizer(locale: Locale) {
        guard let recognizer = SFSpeechRecognizer(locale: locale) else {
            print("Speech recognition is not supported for locale \(locale.identifier)")
            return
        }
        recognizer.delegate = self
        speechRecognizer = recognizer
    }

    /// Configures the audio session, creates a new request and task, and starts
    /// feeding microphone buffers into the request.
    ///
    /// - Throws: Any error raised while configuring the audio session or starting
    ///   the audio engine.
    private func startRecording() throws {

        // Cancel the previous task if it's running.
        if let recognitionTask = recognitionTask {
            recognitionTask.cancel()
            self.recognitionTask = nil
        }

        // Each task reports its transcript from scratch, so the previous
        // transcript must not be used to diff the new one.
        sessionID += 1
        let session = sessionID
        lastSavedString = ""

        let audioSession = AVAudioSession.sharedInstance()
        try audioSession.setCategory(AVAudioSessionCategoryPlayAndRecord, with: .defaultToSpeaker)
        try audioSession.setMode(AVAudioSessionModeDefault)
        try audioSession.setActive(true, with: .notifyOthersOnDeactivation)

        let request = SFSpeechAudioBufferRecognitionRequest()
        // Configure request so that results are returned before audio recording is finished
        request.shouldReportPartialResults = true
        recognitionRequest = request

        let inputNode = audioEngine.inputNode

        // A recognition task represents a speech recognition session.
        // We keep a reference to the task so that it can be cancelled.
        // `self` is captured weakly because `self` owns the task that owns this closure.
        recognitionTask = speechRecognizer.recognitionTask(with: request) { [weak self] result, error in
            // Ignore callbacks that belong to a session that has been replaced.
            guard let strongSelf = self, strongSelf.sessionID == session else { return }

            var isFinal = false

            if let result = result {
                strongSelf.enqueueCommands(from: result.bestTranscription.formattedString)
                isFinal = result.isFinal
            }

            if error != nil || isFinal {
                strongSelf.audioEngine.stop()
                inputNode.removeTap(onBus: 0)
                strongSelf.recognitionRequest = nil
                strongSelf.recognitionTask = nil
            }
        }

        // A tap may still be installed if the previous session was replaced
        // before its handler ran; installing a second tap on the same bus fails.
        inputNode.removeTap(onBus: 0)

        let recordingFormat = inputNode.outputFormat(forBus: 0)
        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { [weak self] buffer, _ in
            self?.recognitionRequest?.append(buffer)
        }

        audioEngine.prepare()

        try audioEngine.start()

    }

    /// Normalizes `transcript`, and if it is longer than the previously seen
    /// transcript, queues every newly added word that is a supported command.
    private func enqueueCommands(from transcript: String) {
        let temp = transcript.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
        //print("temp", temp)
        guard temp.count > lastSavedString.count else { return }

        let fresh = newWords(in: temp, after: lastSavedString)
        speechInputQueue.append(contentsOf: fresh.filter { supportedCommands.contains($0) })
        lastSavedString = temp
    }

    /// Returns the words of `transcript` that remain after dropping the leading
    /// words it shares with `previous`. Bounds are checked on both sides, so a
    /// `previous` with more words than `transcript` cannot cause an out-of-range access.
    private func newWords(in transcript: String, after previous: String) -> [String] {
        let current = transcript.split(separator: " ").map(String.init)
        let earlier = previous.split(separator: " ").map(String.init)

        var shared = 0
        while shared < current.count && shared < earlier.count && current[shared] == earlier[shared] {
            shared += 1
        }
        return Array(current.dropFirst(shared))
    }
}

How we use it:

    // Handle the oldest queued voice command, if any, then drop it from the queue.
    if let command = self.speechRecognizer.speechInputQueue.first {
        switch command {
        case "more":
            print("temp", temp)
            print("content", content)
            // isSpeakingContent = true
            self.textToSpeech(text: content)
        case "play":
            print("try to play")
            if let soundURL = Bundle.main.url(forResource: "cascade", withExtension: "wav") {
                do {
                    // Only play when the player was actually created; otherwise a
                    // stale (or missing) player from an earlier pass would be used.
                    audioPlayer = try AVAudioPlayer(contentsOf: soundURL)
                    audioPlayer.prepareToPlay()
                    audioPlayer.play()
                } catch {
                    print(error)
                }
            } else {
                print("cascade.wav is missing from the main bundle")
            }
        default:
            self.textToSpeech(text: "unrecognized command")
        }
        self.speechRecognizer.speechInputQueue.remove(at: 0)
        print("after :", self.speechRecognizer.speechInputQueue)
    }

它会响应某些命令并播放一些音频。

缓冲区有问题吗？也许在一两分钟的识别后，缓冲区就已经满了？识别器随着时间的推移而失败。

- Jerry Chang

1个回答

网页内容由 Stack Overflow 提供，点击上面的“原文链接”可以查看英文原文。

- rob mayoff · Accepted Answer

来自WWDC 2016 Session 509:语音识别API:

在 iOS 10 中，语音识别一开始就设有严格的音频时长限制，大约为一分钟，与键盘听写的限制类似。