k2-fsa / sherpa-onnx

Speech-to-text, text-to-speech, speaker recognition, and VAD using next-gen Kaldi with onnxruntime, without an Internet connection. Supports embedded systems, Android, iOS, Raspberry Pi, RISC-V, x86_64 servers, websocket server/client, C/C++, Python, Kotlin, C#, Go, NodeJS, Java, Swift, Dart, JavaScript, Flutter, Object Pascal, Lazarus, and Rust.
https://k2-fsa.github.io/sherpa/onnx/index.html
Apache License 2.0

KWS with Swift Package Manager #1334

Closed: meirinberg closed this issue 4 days ago

meirinberg commented 1 week ago

Hello, I'm using sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01.tar.bz2 with https://github.com/microsoft/onnxruntime-swift-package-manager. I successfully encode the data, as I get non-zero buffers from encoderOutputValue.tensorData(). I then pass that data as the input to my decoder ORTValue(). Unfortunately, I always see "decoder output: {length = 0, bytes = 0x}" as my decoder output. I'm also a bit confused as to how the tokens, keywords, etc. are loaded when using the Swift package manager version. Any help is appreciated, thank you.

Update a few hours later: I realized the decoder was returning zero because the buffer shape was not created properly. The shapes should be: input tensor name: y, shape: [0, 2]; output tensor name: decoder_out, shape: [0, 320]. However, when I set the NSNumber at index zero of the shape to zero, the buffer was created with zero length, so the decoder output was always empty. I saw this with verbose output on: "2024-09-09 15:51:20.992418 [V:onnxruntime:, bfc_arena.cc:317 AllocateRawInternal] tried to allocate 0 bytes". I then realized that a 0 in the shape indicates a dynamic dimension. Unfortunately, setting index 0 of the shape to "encoderOutputValue.tensorTypeAndShapeInfo().shape[0].intValue" produces the following errors: "2024-09-09 16:23:56.360281 [E:onnxruntime:, sequential_executor.cc:516 ExecuteKernel] Non-zero status code returned while running Gather node. Name:'/decoder/embedding/Gather' Status Message: indices element out of data bounds, idx=4374855830894280704 must be within the inclusive range [-500,499]" and "For ort_value with index: 2, block in memory pattern size is: 20480 but the actual size is: 1280, fall back to default allocation behavior".

I also have other questions: how can I tell which keyword was spoken? When I say the keywords, the output values don't seem to deviate from regular noise. An example project for KWS using the Swift Package Manager would be super helpful.
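For context on the Gather error: the decoder's "y" input is an int64 matrix of previously decoded token IDs, while the code below feeds it the encoder's float output bytes reinterpreted as int64. Here is a minimal sketch of building a well-formed "y" tensor with the same OnnxRuntimeBindings API used in the code below; numStreams and the blank token ID 0 are illustrative placeholders, not values taken from the model:

import Foundation
import OnnxRuntimeBindings

func makeDecoderInput() throws -> ORTValue {
    // Sketch only: "y" has shape [numStreams, contextSize] and holds int64
    // token IDs. numStreams = 1 and blank ID 0 are placeholder assumptions.
    let numStreams = 1
    let contextSize = 2
    var contextTokens = [Int64](repeating: 0, count: numStreams * contextSize)
    let yData = NSMutableData(bytes: &contextTokens,
                              length: contextTokens.count * MemoryLayout<Int64>.size)
    let yShape: [NSNumber] = [NSNumber(value: numStreams),
                              NSNumber(value: contextSize)]
    return try ORTValue(tensorData: yData, elementType: .int64, shape: yShape)
}

With a non-zero first dimension and real token IDs, the decoder should produce a decoder_out of shape [numStreams, 320] instead of an empty buffer.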


import OnnxRuntimeBindings
//import OnnxRuntimeExtensions

class KWSModelInference {
    var encoderSession: ORTSession?
    var decoderSession: ORTSession?
    var joinerSession: ORTSession?
    var env: ORTEnv?

    var tokens: [String] = []
    var keywords: [String] = []

    init() {
        do {
            // Initialize the ONNX Runtime environment
            env = try ORTEnv(loggingLevel: .verbose)
        } catch {
            print("Failed to create ONNX Runtime environment: \(error)")
            return
        }

        // Load tokens and keywords
//        loadTokens()
        loadKeywords()

        // Initialize the encoder session
        if let encoderModelPath = Bundle.main.path(forResource: "encoder-epoch-12-avg-2-chunk-16-left-64", ofType: "onnx") {
            do {
                let encoderOptions = try ORTSessionOptions()
                encoderSession = try ORTSession(env: env!, modelPath: encoderModelPath, sessionOptions: encoderOptions)
            } catch {
                print("Failed to create ONNX Runtime encoder session: \(error)")
            }
        } else {
            print("Encoder model file not found")
        }

        // Initialize the decoder session
        if let decoderModelPath = Bundle.main.path(forResource: "decoder-epoch-12-avg-2-chunk-16-left-64", ofType: "onnx") {
            do {
                let decoderOptions = try ORTSessionOptions()
                decoderSession = try ORTSession(env: env!, modelPath: decoderModelPath, sessionOptions: decoderOptions)
            } catch {
                print("Failed to create ONNX Runtime decoder session: \(error)")
            }
        } else {
            print("Decoder model file not found")
        }

        // Optionally initialize the joiner session if needed
        if let joinerModelPath = Bundle.main.path(forResource: "joiner-epoch-12-avg-2-chunk-16-left-64", ofType: "onnx") {
            do {
                let joinerOptions = try ORTSessionOptions()
                joinerSession = try ORTSession(env: env!, modelPath: joinerModelPath, sessionOptions: joinerOptions)
            } catch {
                print("Failed to create ONNX Runtime joiner session: \(error)")
            }
        } else {
            print("Joiner model file not found")
        }
    }

    func loadTokens() {
        if let tokensPath = Bundle.main.path(forResource: "tokens", ofType: "txt") {
            do {
                let tokensString = try String(contentsOfFile: tokensPath)
                tokens = tokensString.components(separatedBy: .newlines).filter { !$0.isEmpty }
            } catch {
                print("Failed to load tokens file: \(error)")
            }
        } else {
            print("Tokens file not found")
        }
    }

    func loadKeywords() {
        if let keywordsPath = Bundle.main.path(forResource: "keywords", ofType: "txt") {
            do {
                let keywordsString = try String(contentsOfFile: keywordsPath)
                keywords = keywordsString.components(separatedBy: .newlines).filter { !$0.isEmpty }
            } catch {
                print("Failed to load keywords file: \(error)")
            }
        } else {
            print("Keywords file not found")
        }
    }

    func prepareInputData(inputData: [Float], height: Int, width: Int, channels: Int) -> NSMutableData {
        let batchSize = 1 // Example batch size
        let expectedSize = batchSize * height * width * channels

        // Ensure data is properly sized
        var paddedData = inputData
        if paddedData.count < expectedSize {
            paddedData.append(contentsOf: [Float](repeating: 0, count: expectedSize - paddedData.count))
        } else if paddedData.count > expectedSize {
            paddedData = Array(paddedData.prefix(expectedSize))
        }

        return NSMutableData(bytes: paddedData, length: paddedData.count * MemoryLayout<Float>.size)
    }

    func runInference(inputData: [Float], height: Int, width: Int) {
        guard let encoderSession = encoderSession else {
            print("Encoder session is not initialized")
            return
        }

        guard let decoderSession = decoderSession else {
            print("Decoder session is not initialized")
            return
        }
//        print("Input to Inference: \(inputData)")

        do {
            // Prepare input tensor for 'x'
            let inputDataData = prepareInputData(inputData: inputData, height: height, width: width, channels: 1)
//            let xShape: [NSNumber] = [1, NSNumber(value: height), NSNumber(value: width)]
//            let inputDataValues: [Float] = Array(repeating: 0.0, count: 1 * 45 * 80) // Adjust values as needed
//            let inputData = NSMutableData(bytes: inputDataValues, length: inputDataValues.count * MemoryLayout<Float>.size)
            let xShape: [NSNumber] = [NSNumber(value: 1), NSNumber(value: height), NSNumber(value: width)]
            let xTensor = try ORTValue(tensorData: inputDataData, elementType: .float, shape: xShape)

            // Prepare other tensors with corrected shapes
            let embedStatesValues: [Float] = Array(repeating: 0.0, count: 128 * 3 * 19) // Adjust values as needed
            let embedStatesData = NSMutableData(bytes: embedStatesValues, length: embedStatesValues.count * MemoryLayout<Float>.size)
//            let embedStatesData = prepareInputData(inputData: inputData, height: 128, width: 3, channels: 19)
            let embedStatesShape: [NSNumber] = [NSNumber(value: 1), NSNumber(value: 128), NSNumber(value: 3), NSNumber(value: 19)]
            let embedStatesTensor = try ORTValue(tensorData: embedStatesData, elementType: .float, shape: embedStatesShape)

            let processedLensValues: [Int64] = [1] // Example value, adjust based on actual requirements
            let processedLensSize = processedLensValues.count
            let processedLensData = NSMutableData(bytes: processedLensValues, length: processedLensSize * MemoryLayout<Int64>.size)
            let processedLensShape: [NSNumber] = [NSNumber(value: processedLensSize)]
            let processedLensTensor = try ORTValue(tensorData: processedLensData, elementType: .int64, shape: processedLensShape)
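            // NOTE: processed_lens counts frames the encoder has already
            // consumed, so on the first chunk it is typically 0; the [1]
            // above is a guess and may need adjusting.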

            // Define input and output names
            let encoderInputNames = ["x", "embed_states", "processed_lens", "cached_key_0", "cached_nonlin_attn_0", "cached_val1_0", "cached_val2_0", "cached_conv1_0", "cached_conv2_0", "cached_key_1", "cached_nonlin_attn_1", "cached_val1_1", "cached_val2_1", "cached_conv1_1", "cached_conv2_1", "cached_key_2", "cached_nonlin_attn_2", "cached_val1_2", "cached_val2_2", "cached_conv1_2", "cached_conv2_2", "cached_key_3", "cached_nonlin_attn_3", "cached_val1_3", "cached_val2_3", "cached_conv1_3", "cached_conv2_3", "cached_key_4", "cached_nonlin_attn_4", "cached_val1_4", "cached_val2_4", "cached_conv1_4", "cached_conv2_4", "cached_key_5", "cached_nonlin_attn_5", "cached_val1_5", "cached_val2_5", "cached_conv1_5", "cached_conv2_5"] // Include the name of the missing input
            let encoderOutputNames: Set<String> = ["encoder_out", "new_embed_states", "new_processed_lens", "new_cached_key_0", "new_cached_nonlin_attn_0", "new_cached_val1_0", "new_cached_val2_0", "new_cached_conv1_0", "new_cached_conv2_0", "new_cached_key_1", "new_cached_nonlin_attn_1", "new_cached_val1_1", "new_cached_val2_1", "new_cached_conv1_1", "new_cached_conv2_1", "new_cached_key_2", "new_cached_nonlin_attn_2", "new_cached_val1_2", "new_cached_val2_2", "new_cached_conv1_2", "new_cached_conv2_2", "new_cached_key_3", "new_cached_nonlin_attn_3", "new_cached_val1_3", "new_cached_val2_3", "new_cached_conv1_3", "new_cached_conv2_3", "new_cached_key_4", "new_cached_nonlin_attn_4", "new_cached_val1_4", "new_cached_val2_4", "new_cached_conv1_4", "new_cached_conv2_4", "new_cached_key_5", "new_cached_nonlin_attn_5", "new_cached_val1_5", "new_cached_val2_5", "new_cached_conv1_5", "new_cached_conv2_5"] // Correct output names from encoder

            // Create a dictionary to store all tensor variables
            var tensorDict = [String: ORTValue]()

            // Function to create and add tensors to the dictionary
            func addTensorToDict(index: Int, prefix: String, shape: [NSNumber], count: Int) throws {
                let values: [Float] = Array(repeating: 0.0, count: count)
                let data = NSMutableData(bytes: values, length: values.count * MemoryLayout<Float>.size)
                let tensor = try ORTValue(tensorData: data, elementType: .float, shape: shape)
                tensorDict["\(prefix)_\(index)"] = tensor
            }

            // Handle tensors individually for each index
            do {
                // Index 0
                try addTensorToDict(index: 0, prefix: "cachedKey", shape: [NSNumber(value: 64), NSNumber(value: 1), NSNumber(value: 128)], count: 64 * 1 * 128)
                try addTensorToDict(index: 0, prefix: "cachedNonlinAttn", shape: [NSNumber(value: 1), NSNumber(value: 1), NSNumber(value: 64), NSNumber(value: 96)], count: 64 * 96)
                try addTensorToDict(index: 0, prefix: "cachedVal1", shape: [NSNumber(value: 64), NSNumber(value: 1), NSNumber(value: 48)], count: 64 * 1 * 48)
                try addTensorToDict(index: 0, prefix: "cachedVal2", shape: [NSNumber(value: 64), NSNumber(value: 1), NSNumber(value: 48)], count: 64 * 1 * 48)
                try addTensorToDict(index: 0, prefix: "cachedConv1", shape: [NSNumber(value: 1), NSNumber(value: 128), NSNumber(value: 15)], count: 128 * 15)
                try addTensorToDict(index: 0, prefix: "cachedConv2", shape: [NSNumber(value: 1), NSNumber(value: 128), NSNumber(value: 15)], count: 128 * 15)

                // Index 1
                try addTensorToDict(index: 1, prefix: "cachedKey", shape: [NSNumber(value: 32), NSNumber(value: 1), NSNumber(value: 128)], count: 32 * 1 * 128)
                try addTensorToDict(index: 1, prefix: "cachedNonlinAttn", shape: [NSNumber(value: 1), NSNumber(value: 1), NSNumber(value: 32), NSNumber(value: 96)], count: 32 * 96)
                try addTensorToDict(index: 1, prefix: "cachedVal1", shape: [NSNumber(value: 32), NSNumber(value: 1), NSNumber(value: 48)], count: 32 * 1 * 48)
                try addTensorToDict(index: 1, prefix: "cachedVal2", shape: [NSNumber(value: 32), NSNumber(value: 1), NSNumber(value: 48)], count: 32 * 1 * 48)
                try addTensorToDict(index: 1, prefix: "cachedConv1", shape: [NSNumber(value: 1), NSNumber(value: 128), NSNumber(value: 15)], count: 128 * 15)
                try addTensorToDict(index: 1, prefix: "cachedConv2", shape: [NSNumber(value: 1), NSNumber(value: 128), NSNumber(value: 15)], count: 128 * 15)

                // Index 2
                try addTensorToDict(index: 2, prefix: "cachedKey", shape: [NSNumber(value: 16), NSNumber(value: 1), NSNumber(value: 128)], count: 16 * 1 * 128)
                try addTensorToDict(index: 2, prefix: "cachedNonlinAttn", shape: [NSNumber(value: 1), NSNumber(value: 1), NSNumber(value: 16), NSNumber(value: 96)], count: 16 * 96)
                try addTensorToDict(index: 2, prefix: "cachedVal1", shape: [NSNumber(value: 16), NSNumber(value: 1), NSNumber(value: 48)], count: 16 * 1 * 48)
                try addTensorToDict(index: 2, prefix: "cachedVal2", shape: [NSNumber(value: 16), NSNumber(value: 1), NSNumber(value: 48)], count: 16 * 1 * 48)
                try addTensorToDict(index: 2, prefix: "cachedConv1", shape: [NSNumber(value: 1), NSNumber(value: 128), NSNumber(value: 7)], count: 128 * 7)
                try addTensorToDict(index: 2, prefix: "cachedConv2", shape: [NSNumber(value: 1), NSNumber(value: 128), NSNumber(value: 7)], count: 128 * 7)

                // Index 3
                try addTensorToDict(index: 3, prefix: "cachedKey", shape: [NSNumber(value: 8), NSNumber(value: 1), NSNumber(value: 256)], count: 8 * 1 * 256)
                try addTensorToDict(index: 3, prefix: "cachedNonlinAttn", shape: [NSNumber(value: 1), NSNumber(value: 1), NSNumber(value: 8), NSNumber(value: 96)], count: 8 * 96)
                try addTensorToDict(index: 3, prefix: "cachedVal1", shape: [NSNumber(value: 8), NSNumber(value: 1), NSNumber(value: 96)], count: 8 * 1 * 96)
                try addTensorToDict(index: 3, prefix: "cachedVal2", shape: [NSNumber(value: 8), NSNumber(value: 1), NSNumber(value: 96)], count: 8 * 1 * 96)
                try addTensorToDict(index: 3, prefix: "cachedConv1", shape: [NSNumber(value: 1), NSNumber(value: 128), NSNumber(value: 7)], count: 128 * 7)
                try addTensorToDict(index: 3, prefix: "cachedConv2", shape: [NSNumber(value: 1), NSNumber(value: 128), NSNumber(value: 7)], count: 128 * 7)

                // Index 4
                try addTensorToDict(index: 4, prefix: "cachedKey", shape: [NSNumber(value: 16), NSNumber(value: 1), NSNumber(value: 128)], count: 16 * 1 * 128)
                try addTensorToDict(index: 4, prefix: "cachedNonlinAttn", shape: [NSNumber(value: 1), NSNumber(value: 1), NSNumber(value: 16), NSNumber(value: 96)], count: 16 * 96)
                try addTensorToDict(index: 4, prefix: "cachedVal1", shape: [NSNumber(value: 16), NSNumber(value: 1), NSNumber(value: 48)], count: 16 * 1 * 48)
                try addTensorToDict(index: 4, prefix: "cachedVal2", shape: [NSNumber(value: 16), NSNumber(value: 1), NSNumber(value: 48)], count: 16 * 1 * 48)
                try addTensorToDict(index: 4, prefix: "cachedConv1", shape: [NSNumber(value: 1), NSNumber(value: 128), NSNumber(value: 7)], count: 128 * 7)
                try addTensorToDict(index: 4, prefix: "cachedConv2", shape: [NSNumber(value: 1), NSNumber(value: 128), NSNumber(value: 7)], count: 128 * 7)

                // Index 5
                try addTensorToDict(index: 5, prefix: "cachedKey", shape: [NSNumber(value: 32), NSNumber(value: 1), NSNumber(value: 128)], count: 32 * 1 * 128)
                try addTensorToDict(index: 5, prefix: "cachedNonlinAttn", shape: [NSNumber(value: 1), NSNumber(value: 1), NSNumber(value: 32), NSNumber(value: 96)], count: 32 * 96)
                try addTensorToDict(index: 5, prefix: "cachedVal1", shape: [NSNumber(value: 32), NSNumber(value: 1), NSNumber(value: 48)], count: 32 * 1 * 48)
                try addTensorToDict(index: 5, prefix: "cachedVal2", shape: [NSNumber(value: 32), NSNumber(value: 1), NSNumber(value: 48)], count: 32 * 1 * 48)
                try addTensorToDict(index: 5, prefix: "cachedConv1", shape: [NSNumber(value: 1), NSNumber(value: 128), NSNumber(value: 15)], count: 128 * 15)
                try addTensorToDict(index: 5, prefix: "cachedConv2", shape: [NSNumber(value: 1), NSNumber(value: 128), NSNumber(value: 15)], count: 128 * 15)

            } catch {
                print("Error creating tensors: \(error)")
            }

            // Create the inputs dictionary
            let encoderInputs = [
                encoderInputNames[0]: xTensor,
                encoderInputNames[1]: embedStatesTensor,
                encoderInputNames[2]: processedLensTensor,
                encoderInputNames[3]: tensorDict["cachedKey_0"]!,
                encoderInputNames[4]: tensorDict["cachedNonlinAttn_0"]!,
                encoderInputNames[5]: tensorDict["cachedVal1_0"]!,
                encoderInputNames[6]: tensorDict["cachedVal2_0"]!,
                encoderInputNames[7]: tensorDict["cachedConv1_0"]!,
                encoderInputNames[8]: tensorDict["cachedConv2_0"]!,

                // Additional indices (1 through 5)
                encoderInputNames[9]: tensorDict["cachedKey_1"]!,
                encoderInputNames[10]: tensorDict["cachedNonlinAttn_1"]!,
                encoderInputNames[11]: tensorDict["cachedVal1_1"]!,
                encoderInputNames[12]: tensorDict["cachedVal2_1"]!,
                encoderInputNames[13]: tensorDict["cachedConv1_1"]!,
                encoderInputNames[14]: tensorDict["cachedConv2_1"]!,

                encoderInputNames[15]: tensorDict["cachedKey_2"]!,
                encoderInputNames[16]: tensorDict["cachedNonlinAttn_2"]!,
                encoderInputNames[17]: tensorDict["cachedVal1_2"]!,
                encoderInputNames[18]: tensorDict["cachedVal2_2"]!,
                encoderInputNames[19]: tensorDict["cachedConv1_2"]!,
                encoderInputNames[20]: tensorDict["cachedConv2_2"]!,

                encoderInputNames[21]: tensorDict["cachedKey_3"]!,
                encoderInputNames[22]: tensorDict["cachedNonlinAttn_3"]!,
                encoderInputNames[23]: tensorDict["cachedVal1_3"]!,
                encoderInputNames[24]: tensorDict["cachedVal2_3"]!,
                encoderInputNames[25]: tensorDict["cachedConv1_3"]!,
                encoderInputNames[26]: tensorDict["cachedConv2_3"]!,

                encoderInputNames[27]: tensorDict["cachedKey_4"]!,
                encoderInputNames[28]: tensorDict["cachedNonlinAttn_4"]!,
                encoderInputNames[29]: tensorDict["cachedVal1_4"]!,
                encoderInputNames[30]: tensorDict["cachedVal2_4"]!,
                encoderInputNames[31]: tensorDict["cachedConv1_4"]!,
                encoderInputNames[32]: tensorDict["cachedConv2_4"]!,

                encoderInputNames[33]: tensorDict["cachedKey_5"]!,
                encoderInputNames[34]: tensorDict["cachedNonlinAttn_5"]!,
                encoderInputNames[35]: tensorDict["cachedVal1_5"]!,
                encoderInputNames[36]: tensorDict["cachedVal2_5"]!,
                encoderInputNames[37]: tensorDict["cachedConv1_5"]!,
                encoderInputNames[38]: tensorDict["cachedConv2_5"]!
            ]

            // Run the session with the inputs
            let encoderResults = try encoderSession.run(withInputs: encoderInputs, outputNames: encoderOutputNames, runOptions: nil)

            // After running the encoder session, look up "encoder_out" by name.
            // (Set ordering is unspecified, so encoderOutputNames.first could
            // return any of the 39 outputs, not necessarily "encoder_out".)
            guard let encoderOutputValue = encoderResults["encoder_out"] else {
                print("Failed to get encoder output value")
                return
            }

            do {

                // Convert the output from ORTValue to Data
                let encoderOutputData = try encoderOutputValue.tensorData()
//                print("encoder output: \(encoderOutputData)")
                let batchSize = try encoderOutputValue.tensorTypeAndShapeInfo().shape[0].intValue

                // Convert encoder output to ORTValue for the decoder
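                // NOTE (see the update above): the next lines reinterpret the
                // encoder's float output bytes as int64 indices, which is what
                // triggers the /decoder/embedding/Gather out-of-bounds error;
                // "y" should instead hold previously decoded token IDs with
                // shape [N, 2], as sketched earlier in this post.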
                let decoderShape = [NSNumber(value: batchSize), NSNumber(value: 2)]
                let decoderInputValue = try ORTValue(tensorData: encoderOutputData, elementType: .int64, shape: decoderShape)

                // Decoder input/output names
                let decoderInputNames = ["y"] // Input name for the decoder
                let decoderOutputNames: Set<String> = ["decoder_out"] // Output name from the decoder

                // Run the decoder session
                let decoderResults = try decoderSession.run(withInputs: [decoderInputNames[0]: decoderInputValue], outputNames: decoderOutputNames, runOptions: nil)
                guard let decoderOutputName = decoderOutputNames.first,
                      let decoderOutputValue = decoderResults[decoderOutputName] else {
                    print("Failed to get decoder output value")
                    return
                }

                let decoderOutputData = try decoderOutputValue.tensorData()
//                print("decoder output: \(decoderOutputData)")

                // Optionally, use the joiner model
                if let joinerSession = joinerSession {
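                    // NOTE: the same decoder_out bytes are fed to both joiner
                    // inputs below; "encoder_out" should come from the encoder's
                    // per-frame output, not a second copy of decoder_out.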
                    let joinerInputData = decoderOutputData
                    let joinerShape = [NSNumber(value: batchSize), NSNumber(value: 320)]
                    let joinerInputValue = try ORTValue(tensorData: joinerInputData, elementType: .float, shape: joinerShape)
                    let joinerInputValue2 = try ORTValue(tensorData: joinerInputData, elementType: .float, shape: joinerShape)
                    let joinerInputNames = ["encoder_out", "decoder_out"] // Correct input name for joiner
                    let joinerOutputNames: Set<String> = ["logit"] // Correct output name from joiner

                    let joinerResults = try joinerSession.run(withInputs: [joinerInputNames[0]: joinerInputValue, joinerInputNames[1]: joinerInputValue2], outputNames: joinerOutputNames, runOptions: nil)

                    // Extract the joiner output
                    guard let joinerOutputName = joinerOutputNames.first,
                          let joinerOutputValue = joinerResults[joinerOutputName] else {
                        print("Failed to get joiner output value")
                        return
                    }

                    // Extract data from ORTValue
                    let joinerOutputData = try joinerOutputValue.tensorData()

                    // Process the joiner output to detect the keyword
                    if isKeywordDetected(data: joinerOutputData) {
                        print("Keyword detected!")
                    } else {
                        print("Keyword not detected.")
                    }
                } else {
                    print("Joiner session is not initialized")
                }

                // Proceed with decoder and joiner as usual
            } catch {
                print("Failed to process: \(error)")
            }

        } catch {
            print("Failed to run inference: \(error)")
        }
    }

    func isKeywordDetected(data: NSMutableData) -> Bool {
        // Convert NSMutableData to Data
        let dataAsData = data as Data

        // Convert Data to array of floats
        let outputArray = dataAsData.withUnsafeBytes { (pointer: UnsafeRawBufferPointer) -> [Float] in
            let floatPointer = pointer.baseAddress!.assumingMemoryBound(to: Float.self)
            return Array(UnsafeBufferPointer(start: floatPointer, count: dataAsData.count / MemoryLayout<Float>.size))
        }

        // Process the output data to determine if a keyword is detected
        // For example, you might check if the output matches any keyword
        // This is a placeholder example and should be adapted to your specific use case

//        print("Output Array: \(outputArray)")

//        // Example check: If the first value is above a certain threshold, we detect the keyword
//        print(outputArray.first ?? 0.0)
//        if let firstValue = outputArray.first, firstValue > 0.5 {
//            return true

        // Check if the output matches any keyword (example logic)
        for keyword in keywords {
            // This example assumes that keyword matching is done by comparing output values with some threshold
            // You might need to adapt this part based on your actual keyword detection logic
            if outputArray.contains(where: { $0 > 0.5 }) { // Adjust the condition based on your actual model output
                print("Detected keyword: \(keyword)")
                return true
            }
        }

        return false
    }

}
csukuangfj commented 1 week ago

We provide Swift APIs for KWS; please have a look at https://github.com/k2-fsa/sherpa-onnx/blob/master/swift-api-examples/keyword-spotting-from-file.swift
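For reference, that example reduces to roughly the following sketch. The helper names (sherpaOnnxFeatureConfig, sherpaOnnxOnlineTransducerModelConfig, sherpaOnnxOnlineModelConfig, sherpaOnnxKeywordSpotterConfig, SherpaOnnxKeywordSpotterWrapper) come from SherpaOnnx.swift in the same swift-api-examples directory and should be verified against the current sources; the model paths use the gigaspeech package from the question:

import Foundation

// Condensed sketch of keyword-spotting-from-file.swift; verify helper
// signatures against swift-api-examples/SherpaOnnx.swift.
let dir = "./sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01"
let transducer = sherpaOnnxOnlineTransducerModelConfig(
    encoder: "\(dir)/encoder-epoch-12-avg-2-chunk-16-left-64.onnx",
    decoder: "\(dir)/decoder-epoch-12-avg-2-chunk-16-left-64.onnx",
    joiner: "\(dir)/joiner-epoch-12-avg-2-chunk-16-left-64.onnx")
let modelConfig = sherpaOnnxOnlineModelConfig(
    tokens: "\(dir)/tokens.txt", transducer: transducer)
var config = sherpaOnnxKeywordSpotterConfig(
    featConfig: sherpaOnnxFeatureConfig(sampleRate: 16000, featureDim: 80),
    modelConfig: modelConfig,
    keywordsFile: "\(dir)/keywords.txt")
let spotter = SherpaOnnxKeywordSpotterWrapper(config: &config)

// samples: [Float] of 16 kHz mono audio, e.g. read via AVAudioFile (omitted).
let samples: [Float] = []
spotter.acceptWaveform(samples: samples, sampleRate: 16000)
while spotter.isReady() {
    spotter.decode()
    let keyword = spotter.getResult().keyword
    if !keyword.isEmpty {
        // The result names the matched keyword, answering
        // "how can I tell which keyword was spoken?"
        print("Detected: \(keyword)")
    }
}

The wrapper drives feature extraction, the stateful encoder, and the transducer search internally, so none of the manually constructed cached_* tensors above are needed.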


We assume you can figure out how the C++ code works if you want to reinvent the wheel.