After installing the packages required to run a transcription model, an assertion error is thrown when trying to use the model.
Expected behavior
The transcription model should run fine
How to reproduce the issue
Code for reproducing the problem
```py
from manim import *
from manim_voiceover import VoiceoverScene
from manim_voiceover.services.gtts import GTTSService
class BugScene(VoiceoverScene):
    """Minimal scene that reproduces the reported error."""

    def construct(self):
        # The failure is reported when the service is created with a
        # transcription model; "base" is the model name used in the report.
        self.set_speech_service(
            GTTSService(transcription_model="base")
        )
        # Per the traceback, the error is raised on entering this block.
        with self.voiceover("Voice") as trk:
            pass
```
Description of bug / unexpected behavior
After installing the packages required to run a transcription model, an assertion error is thrown when trying to use the model.
Expected behavior
The transcription model should run fine
How to reproduce the issue
Code for reproducing the problem
```py
from manim import *
from manim_voiceover import VoiceoverScene
from manim_voiceover.services.gtts import GTTSService

class BugScene(VoiceoverScene):
    def construct(self):
        self.set_speech_service(
            GTTSService(transcription_model="base")
        )
        with self.voiceover("Voice") as trk:
            pass
```
Additional media files
Images/GIFs
Logs
Terminal output
``` (venv) oz@Ozz:~/repos/GPU_Programming$ manim -pql manim_scripts/temp.py -v DEBUG Manim Community v0.18.1 Detected language: english 0%| | 0/0.96 [00:00, ?sec/s] ╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮ │ /home/oz/repos/GPU_Programming/venv/lib/python3.11/site-packages/manim/cli/render/commands.py:12 │ │ 0 in render │ │ │ │ 117 │ │ │ try: │ │ 118 │ │ │ │ with tempconfig({}): │ │ 119 │ │ │ │ │ scene = SceneClass() │ │ ❱ 120 │ │ │ │ │ scene.render() │ │ 121 │ │ │ except Exception: │ │ 122 │ │ │ │ error_console.print_exception() │ │ 123 │ │ │ │ sys.exit(1) │ │ │ │ /home/oz/repos/GPU_Programming/venv/lib/python3.11/site-packages/manim/scene/scene.py:229 in │ │ render │ │ │ │ 226 │ │ """ │ │ 227 │ │ self.setup() │ │ 228 │ │ try: │ │ ❱ 229 │ │ │ self.construct() │ │ 230 │ │ except EndSceneEarlyException: │ │ 231 │ │ │ pass │ │ 232 │ │ except RerunSceneException as e: │ │ │ │ /home/oz/repos/GPU_Programming/manim_scripts/temp.py:39 in construct │ │ │ │ 36 │ self.set_speech_service( │ │ 37 │ │ GTTSService(transcription_model="base") │ │ 38 │ │ ) │ │ ❱ 39 │ with self.voiceover("Voice") as trk: │ │ 40 │ pass │ │ 41 │ │ │ │ /usr/lib/python3.11/contextlib.py:137 in __enter__ │ │ │ │ 134 │ │ # they are only needed for recreation, which is not possible anymore │ │ 135 │ │ del self.args, self.kwds, self.func │ │ 136 │ │ try: │ │ ❱ 137 │ │ │ return next(self.gen) │ │ 138 │ │ except StopIteration: │ │ 139 │ │ │ raise RuntimeError("generator didn't yield") from None │ │ 140 │ │ │ │ /home/oz/repos/GPU_Programming/venv/lib/python3.11/site-packages/manim_voiceover/voiceover_scene │ │ .py:186 in voiceover │ │ │ │ 183 │ │ │ │ 184 │ │ try: │ │ 185 │ │ │ if text is not None: │ │ ❱ 186 │ │ │ │ yield self.add_voiceover_text(text, **kwargs) │ │ 187 │ │ │ elif ssml is not None: │ │ 188 │ │ │ │ yield self.add_voiceover_ssml(ssml, **kwargs) │ │ 189 │ │ finally: │ │ │ │ 
/home/oz/repos/GPU_Programming/venv/lib/python3.11/site-packages/manim_voiceover/voiceover_scene │ │ .py:69 in add_voiceover_text │ │ │ │ 66 │ │ │ │ "You need to call init_voiceover() before adding a voiceover." │ │ 67 │ │ │ ) │ │ 68 │ │ │ │ ❱ 69 │ │ dict_ = self.speech_service._wrap_generate_from_text(text, **kwargs) │ │ 70 │ │ tracker = VoiceoverTracker(self, dict_, self.speech_service.cache_dir) │ │ 71 │ │ self.add_sound(str(Path(self.speech_service.cache_dir) / dict_["final_audio"])) │ │ 72 │ │ self.current_tracker = tracker │ │ │ │ /home/oz/repos/GPU_Programming/venv/lib/python3.11/site-packages/manim_voiceover/services/base.p │ │ y:95 in _wrap_generate_from_text │ │ │ │ 92 │ │ │ │ 93 │ │ # Check whether word boundaries exist and if not run stt │ │ 94 │ │ if "word_boundaries" not in dict_ and self._whisper_model is not None: │ │ ❱ 95 │ │ │ transcription_result = self._whisper_model.transcribe( │ │ 96 │ │ │ │ str(Path(self.cache_dir) / original_audio), **self.transcription_kwargs │ │ 97 │ │ │ ) │ │ 98 │ │ │ logger.info("Transcription: " + transcription_result.text) │ │ │ │ /home/oz/repos/GPU_Programming/venv/lib/python3.11/site-packages/stable_whisper/whisper_word_lev │ │ el.py:575 in transcribe_stable │ │ │ │ 572 │ │ │ if word_timestamps: │ │ 573 │ │ │ │ if end_timestamp_pos > 0: │ │ 574 │ │ │ │ │ num_samples = min(round(end_timestamp_pos * N_SAMPLES_PER_TOKEN), nu │ │ ❱ 575 │ │ │ │ add_word_timestamps_stable( │ │ 576 │ │ │ │ │ segments=current_segments, │ │ 577 │ │ │ │ │ model=model, │ │ 578 │ │ │ │ │ tokenizer=tokenizer, │ │ │ │ /home/oz/repos/GPU_Programming/venv/lib/python3.11/site-packages/stable_whisper/timing.py:259 in │ │ add_word_timestamps_stable │ │ │ │ 256 │ │ │ │ │ ) │ │ 257 │ │ │ │ ) │ │ 258 │ │ │ ❱ 259 │ align() │ │ 260 │ if ( │ │ 261 │ │ │ gap_padding is not None and │ │ 262 │ │ │ any( │ │ │ │ /home/oz/repos/GPU_Programming/venv/lib/python3.11/site-packages/stable_whisper/timing.py:225 in │ │ align │ │ │ │ 222 │ │ text_tokens, token_split, 
seg_indices = split_word_tokens(segments, tokenizer, │ │ 223 │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ padding=gap_padding, s │ │ 224 │ │ │ │ ❱ 225 │ │ alignment = find_alignment_stable(model, tokenizer, text_tokens, mel, num_sample │ │ 226 │ │ │ │ │ │ │ │ │ │ **kwargs, │ │ 227 │ │ │ │ │ │ │ │ │ │ token_split=token_split, │ │ 228 │ │ │ │ │ │ │ │ │ │ audio_features=audio_features, │ │ │ │ /home/oz/repos/GPU_Programming/venv/lib/python3.11/site-packages/stable_whisper/timing.py:79 in │ │ find_alignment_stable │ │ │ │ 76 │ weights = (weights * qk_scale).softmax(dim=-1) │ │ 77 │ std, mean = torch.std_mean(weights, dim=-2, keepdim=True, unbiased=False) │ │ 78 │ weights = (weights - mean) / std │ │ ❱ 79 │ weights = median_filter(weights, medfilt_width) │ │ 80 │ │ │ 81 │ matrix = weights.mean(axis=0) │ │ 82 │ matrix = matrix[len(tokenizer.sot_sequence): -1] │ │ │ │ /home/oz/repos/GPU_Programming/venv/lib/python3.11/site-packages/whisper/timing.py:38 in │ │ median_filter │ │ │ │ 35 │ x = F.pad(x, (filter_width // 2, filter_width // 2, 0, 0), mode="reflect") │ │ 36 │ if x.is_cuda: │ │ 37 │ │ try: │ │ ❱ 38 │ │ │ from .triton_ops import median_filter_cuda │ │ 39 │ │ │ │ │ 40 │ │ │ result = median_filter_cuda(x, filter_width) │ │ 41 │ │ except (RuntimeError, subprocess.CalledProcessError): │ │ │ │ /home/oz/repos/GPU_Programming/venv/lib/python3.11/site-packages/whisper/triton_ops.py:7 in │ │
```
System specifications
System Details
- OS (with version, e.g., Windows 10 v2004 or macOS 10.15 (Catalina)): - RAM: - Python version (`python/py/python3 --version`): - Installed modules (provide output from `pip list`): ``` Debian 12 kernel 6.1.0-22-amd64 ram: 64 GB DDR5 Python 3.11.2 Pip: Package Version ------------------------ ----------- attrs 23.2.0 basedpyright 1.13.3 cattrs 23.2.3 certifi 2024.7.4 charset-normalizer 3.3.2 click 8.1.7 cloup 3.0.5 cmake 3.30.1 decorator 5.1.1 docstring-to-markdown 0.15 evdev 1.7.1 ffmpeg-python 0.2.0 filelock 3.15.4 fsspec 2024.6.1 future 1.0.0 glcontext 2.5.0 gTTS 2.5.1 huggingface-hub 0.24.1 idna 3.7 isosurfaces 0.1.2 jedi 0.19.1 jedi-language-server 0.41.4 Jinja2 3.1.4 lit 18.1.8 llvmlite 0.43.0 lsprotocol 2023.0.1 manim 0.18.1 manim-ml 0.0.24 manim-voiceover 0.3.6.post0 ManimPango 0.5.0 mapbox-earcut 1.0.1 markdown-it-py 3.0.0 MarkupSafe 2.1.5 mdurl 0.1.2 moderngl 5.10.0 moderngl-window 2.4.6 more-itertools 10.3.0 mpmath 1.3.0 multipledispatch 1.0.0 mutagen 1.47.0 networkx 3.3 nodejs-wheel-binaries 20.15.1 numba 0.60.0 numpy 1.26.4 nvidia-cublas-cu11 11.10.3.66 nvidia-cuda-cupti-cu11 11.7.101 nvidia-cuda-nvrtc-cu11 11.7.99 nvidia-cuda-runtime-cu11 11.7.99 nvidia-cudnn-cu11 8.5.0.96 nvidia-cufft-cu11 10.9.0.58 nvidia-curand-cu11 10.2.10.91 nvidia-cusolver-cu11 11.4.0.1 nvidia-cusparse-cu11 11.7.4.91 nvidia-nccl-cu11 2.14.3 nvidia-nvtx-cu11 11.7.91 openai-whisper 20230314 packaging 24.1 pandas 2.2.2 parso 0.8.4 pillow 10.4.0 pip 23.0.1 PyAudio 0.2.14 pycairo 1.26.1 pydub 0.25.1 pyglet 2.0.15 pygls 1.3.1 Pygments 2.18.0 pynput 1.7.7 pyrr 0.10.3 python-dateutil 2.9.0.post0 python-dotenv 0.21.1 python-slugify 8.0.4 python-xlib 0.33 pytz 2024.1 PyYAML 6.0.1 regex 2024.5.15 requests 2.32.3 rich 13.7.1 safetensors 0.4.3 scipy 1.14.0 screeninfo 0.8.1 setuptools 66.1.1 six 1.16.0 skia-pathops 0.8.0.post1 sox 1.5.0 srt 3.5.3 stable-ts 2.11.1 svgelements 1.9.6 sympy 1.13.1 text-unidecode 1.3 tiktoken 0.3.1 tokenizers 0.19.1 torch 2.0.1 torchaudio 2.0.2 tqdm 4.66.4 
transformers 4.43.1 triton 2.0.0 typing_extensions 4.12.2 tzdata 2024.1 urllib3 2.2.2 watchdog 4.0.1 wheel 0.43.0
```
LaTeX details
+ LaTeX distribution (e.g. TeX Live 2020):
+ Installed LaTeX packages:

FFMPEG
Output of `ffmpeg -version`:
```
PASTE HERE
```
Additional comments