Async batch predictions

Should partially resolve https://github.com/argmaxinc/WhisperKit/issues/97

added audio file batch processing
added concurrentWorkerCount, which controls the number of concurrent tasks
added signposts for better tracing
removed Transcriber protocol, made WhisperKit an open class
added TranscribeTask class responsible for transcribing audio chunk to text and moved all the logic there
added tests

This is what signposts look like in Instruments (for processing 5 audio files):

Some benchmarks on my MacBook Air M1 (running in release mode with the tiny model, measured with `time swift run -c release whisperkit-cli transcribe [...]`):

running on 1 file, 40:26 length -- 99.30s user, 11.07s system, 1:24.12 total
running on 5 files, 40:26 length each -- 744.40s user, 111.61s system, 2:58.57 total
running on 1 file, 0:11 length -- 0.73s user, 0.11s system, 1.535 total
running on 5 files, 0:11 length each -- 2.55s user, 0.40s system, 1.780 total

using Alice.mp3 file provided by @ZachNagengast:

running on 1 file, 12:16 length -- 32.69s user, 3.71s system, 27.981 total
running on 5 files, 12:16 length each -- 263.17s user, 39.25s system, 1:00.95 total

API changes

Deprecations

`WhisperKit`

Deprecated

public func transcribe(
    audioPath: String,
    decodeOptions: DecodingOptions? = nil,
    callback: TranscriptionCallback = nil
) async throws -> TranscriptionResult?

use instead

public func transcribe(
    audioPath: String,
    decodeOptions: DecodingOptions? = nil,
    callback: TranscriptionCallback = nil
) async throws -> [TranscriptionResult]

Deprecated

public func transcribe(
    audioArray: [Float],
    decodeOptions: DecodingOptions? = nil,
    callback: TranscriptionCallback = nil
) async throws -> TranscriptionResult?

use instead

public func transcribe(
    audioArray: [Float],
    decodeOptions: DecodingOptions? = nil,
    callback: TranscriptionCallback = nil
) async throws -> [TranscriptionResult]

`TextDecoding`

Deprecated

func decodeText(
    from encoderOutput: MLMultiArray,
    using decoderInputs: DecodingInputs,
    sampler tokenSampler: TokenSampling,
    options decoderOptions: DecodingOptions,
    callback: ((TranscriptionProgress) -> Bool?)?
) async throws -> [DecodingResult]

use instead

func decodeText(
    from encoderOutput: MLMultiArray,
    using decoderInputs: DecodingInputs,
    sampler tokenSampler: TokenSampling,
    options decoderOptions: DecodingOptions,
    callback: ((TranscriptionProgress) -> Bool?)?
) async throws -> DecodingResult

Deprecated

func detectLanguage(
    from encoderOutput: MLMultiArray,
    using decoderInputs: DecodingInputs,
    sampler tokenSampler: TokenSampling,
    options: DecodingOptions,
    temperature: FloatType
) async throws -> [DecodingResult]

use instead

func detectLanguage(
    from encoderOutput: MLMultiArray,
    using decoderInputs: DecodingInputs,
    sampler tokenSampler: TokenSampling,
    options: DecodingOptions,
    temperature: FloatType
) async throws -> DecodingResult

Breaking changes

removed Transcriber protocol

`AudioProcessing`

static func loadAudio(fromPath audioFilePath: String) -> AVAudioPCMBuffer?

becomes

static func loadAudio(fromPath audioFilePath: String) throws -> AVAudioPCMBuffer

`AudioStreamTranscriber`

public init(
    audioProcessor: any AudioProcessing, 
    transcriber: any Transcriber, 
    decodingOptions: DecodingOptions, 
    requiredSegmentsForConfirmation: Int = 2, 
    silenceThreshold: Float = 0.3, 
    compressionCheckWindow: Int = 20, 
    useVAD: Bool = true, 
    stateChangeCallback: AudioStreamTranscriberCallback?
)

becomes

public init(
    audioEncoder: any AudioEncoding,
    featureExtractor: any FeatureExtracting,
    segmentSeeker: any SegmentSeeking,
    textDecoder: any TextDecoding,
    tokenizer: any WhisperTokenizer,
    audioProcessor: any AudioProcessing,
    decodingOptions: DecodingOptions,
    requiredSegmentsForConfirmation: Int = 2,
    silenceThreshold: Float = 0.3,
    compressionCheckWindow: Int = 20,
    useVAD: Bool = true,
    stateChangeCallback: AudioStreamTranscriberCallback?
)

`TextDecoding`

func prepareDecoderInputs(withPrompt initialPrompt: [Int]) -> DecodingInputs?

becomes

func prepareDecoderInputs(withPrompt initialPrompt: [Int]) throws -> DecodingInputs

argmaxinc / WhisperKit

Async batch predictions #107

API changes

Deprecations

`WhisperKit`

`TextDecoding`

Breaking changes

`AudioProcessing`

`AudioStreamTranscriber`

`TextDecoding`