Open eschmidbauer opened 4 weeks ago
Which model format do you need?
I'm looking to try CPU.
Download the models and set the path. Next, run the following python script to get the inference results.
import sys
import gc
import re
import time
import math
import torch
import torchaudio
import jieba
from pypinyin import lazy_pinyin, Style
import numpy as np
import onnxruntime
F5_project_path = "/Users/dake/Downloads/F5-TTS-main" # The F5-TTS Github project download path.
onnx_model_A = "/Users/dake/Downloads/F5_ONNX/F5_Preprocess.ort" # The exported onnx model path.
onnx_model_B = "/Users/dake/Downloads/F5_ONNX/F5_Transformer.ort" # The exported onnx model path.
onnx_model_C = "/Users/dake/Downloads/F5_ONNX/F5_Decode.ort" # The exported onnx model path.
reference_audio = "/Users/dake/Downloads/F5-TTS-main/tests/ref_audio/test_en_1_ref_short.wav" # The reference audio path.
generated_audio = "/Users/dake/Downloads/F5-TTS-main/tests/generated.wav" # The generated audio path.
ref_text = "Some call me nature, others call me mother nature." # The ASR result of reference audio.
gen_text = "Hello, How are you? I am a super hero on Earth." # The target TTS.
ORT_Accelerate_Providers = [] # If you have accelerate devices for : ['CUDAExecutionProvider', 'TensorrtExecutionProvider', 'CoreMLExecutionProvider', 'DmlExecutionProvider', 'OpenVINOExecutionProvider', 'ROCMExecutionProvider', 'MIGraphXExecutionProvider', 'AzureExecutionProvider']
# else keep empty.
RANDOM_SEED = 9527
NFE_STEP = 32
HOP_LENGTH = 256
SPEED = 1.0
SAMPLE_RATE = 24000
with open(f"{F5_project_path}/data/Emilia_ZH_EN_pinyin/vocab.txt", "r", encoding="utf-8") as f:
vocab_char_map = {}
for i, char in enumerate(f):
vocab_char_map[char[:-1]] = i
vocab_size = len(vocab_char_map)
def is_chinese_char(c):
cp = ord(c)
return (
0x4E00 <= cp <= 0x9FFF or # CJK Unified Ideographs
0x3400 <= cp <= 0x4DBF or # CJK Unified Ideographs Extension A
0x20000 <= cp <= 0x2A6DF or # CJK Unified Ideographs Extension B
0x2A700 <= cp <= 0x2B73F or # CJK Unified Ideographs Extension C
0x2B740 <= cp <= 0x2B81F or # CJK Unified Ideographs Extension D
0x2B820 <= cp <= 0x2CEAF or # CJK Unified Ideographs Extension E
0xF900 <= cp <= 0xFAFF or # CJK Compatibility Ideographs
0x2F800 <= cp <= 0x2FA1F # CJK Compatibility Ideographs Supplement
)
def convert_char_to_pinyin(text_list, polyphone=True):
final_text_list = []
merged_trans = str.maketrans({
'“': '"', '”': '"', '‘': "'", '’': "'",
';': ','
})
chinese_punctuations = set("。,、;:?!《》【】—…")
for text in text_list:
char_list = []
text = text.translate(merged_trans)
for seg in jieba.cut(text):
if seg.isascii():
if char_list and len(seg) > 1 and char_list[-1] not in " :'\"":
char_list.append(" ")
char_list.extend(seg)
elif polyphone and all(is_chinese_char(c) for c in seg):
pinyin_list = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
for c in pinyin_list:
if c not in chinese_punctuations:
char_list.append(" ")
char_list.append(c)
else:
for c in seg:
if c.isascii():
char_list.append(c)
elif c in chinese_punctuations:
char_list.append(c)
else:
char_list.append(" ")
pinyin = lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True)
char_list.extend(pinyin)
final_text_list.append(char_list)
return final_text_list
def list_str_to_idx(
text: list[str] | list[list[str]],
vocab_char_map: dict[str, int], # {char: idx}
padding_value=-1
):
get_idx = vocab_char_map.get
list_idx_tensors = [torch.tensor([get_idx(c, 0) for c in t], dtype=torch.int32) for t in text]
text = torch.nn.utils.rnn.pad_sequence(list_idx_tensors, padding_value=padding_value, batch_first=True)
return text
# ONNX Runtime settings
onnxruntime.set_seed(RANDOM_SEED)
session_opts = onnxruntime.SessionOptions()
session_opts.log_severity_level = 3         # Error level; adjustable.
session_opts.inter_op_num_threads = 0       # Number of threads for running independent nodes in parallel. Set 0 for auto.
session_opts.intra_op_num_threads = 0       # Number of threads for running the operators within a node. Set 0 for auto.
session_opts.enable_cpu_mem_arena = True    # True for execution speed; False for lower memory usage.
session_opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
session_opts.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
ort_session_A = onnxruntime.InferenceSession(onnx_model_A, sess_options=session_opts, providers=['CPUExecutionProvider'])
in_name_A = ort_session_A.get_inputs()
out_name_A = ort_session_A.get_outputs()
in_name_A0 = in_name_A[0].name
in_name_A1 = in_name_A[1].name
in_name_A2 = in_name_A[2].name
out_name_A0 = out_name_A[0].name
out_name_A1 = out_name_A[1].name
out_name_A2 = out_name_A[2].name
out_name_A3 = out_name_A[3].name
out_name_A4 = out_name_A[4].name
out_name_A5 = out_name_A[5].name
out_name_A6 = out_name_A[6].name
ort_session_B = onnxruntime.InferenceSession(onnx_model_B, sess_options=session_opts, providers=ORT_Accelerate_Providers + ['CPUExecutionProvider'])  # Use list concatenation; list.append() returns None.
in_name_B = ort_session_B.get_inputs()
out_name_B = ort_session_B.get_outputs()
in_name_B0 = in_name_B[0].name
in_name_B1 = in_name_B[1].name
in_name_B2 = in_name_B[2].name
in_name_B3 = in_name_B[3].name
in_name_B4 = in_name_B[4].name
in_name_B5 = in_name_B[5].name
in_name_B6 = in_name_B[6].name
out_name_B0 = out_name_B[0].name
ort_session_C = onnxruntime.InferenceSession(onnx_model_C, sess_options=session_opts, providers=['CPUExecutionProvider'])
in_name_C = ort_session_C.get_inputs()
out_name_C = ort_session_C.get_outputs()
in_name_C0 = in_name_C[0].name
in_name_C1 = in_name_C[1].name
out_name_C0 = out_name_C[0].name
# Run F5-TTS by ONNX Runtime
audio, sr = torchaudio.load(reference_audio)
if sr != SAMPLE_RATE:
    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
    audio = resampler(audio)
audio = audio.unsqueeze(0).numpy()
zh_pause_punc = r"。,、;:?!"
ref_text_len = len(ref_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, ref_text))
gen_text_len = len(gen_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, gen_text))
ref_audio_len = audio.shape[-1] // HOP_LENGTH + 1
max_duration = np.array(ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / SPEED), dtype=np.int64)
gen_text = convert_char_to_pinyin([ref_text + gen_text])
text_ids = list_str_to_idx(gen_text, vocab_char_map).numpy()
time_step = np.array(0, dtype=np.int32)
print("\n\nRun F5-TTS by ONNX Runtime.")
start_count = time.time()
noise, rope_cos, rope_sin, cat_mel_text, cat_mel_text_drop, qk_rotated_empty, ref_signal_len = ort_session_A.run(
    [out_name_A0, out_name_A1, out_name_A2, out_name_A3, out_name_A4, out_name_A5, out_name_A6],
    {
        in_name_A0: audio,
        in_name_A1: text_ids,
        in_name_A2: max_duration
    })
while time_step < NFE_STEP:
    print(f"NFE_STEP: {time_step}")
    noise = ort_session_B.run(
        [out_name_B0],
        {
            in_name_B0: noise,
            in_name_B1: rope_cos,
            in_name_B2: rope_sin,
            in_name_B3: cat_mel_text,
            in_name_B4: cat_mel_text_drop,
            in_name_B5: qk_rotated_empty,
            in_name_B6: time_step
        })[0]
    time_step += 1
generated_signal = ort_session_C.run(
    [out_name_C0],
    {
        in_name_C0: noise,
        in_name_C1: ref_signal_len
    })[0]
end_count = time.time()
# Save to audio
audio_tensor = torch.tensor(generated_signal).squeeze(0)
torchaudio.save(generated_audio, audio_tensor, SAMPLE_RATE)
print(f"\nAudio generation is complete.\n\nONNXRuntime Time Cost in Seconds:\n{end_count - start_count:.3f}")
Can you make the models public?
Try again~
The Inference script was edited. Make sure to copy the latest one. : )
getting an error with the script:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "onnxruntime/capi/onnxruntime_inference_collection.py", line 220, in run
return self._sess.run(output_names, input_feed, run_options)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
onnxruntime.capi.onnxruntime_pybind11_state.InvalidArgument: [ONNXRuntimeError] : 2 : INVALID_ARGUMENT : Got invalid dimensions for input: audio for the following indices
index: 1 Got: 2 Expected: 1
Please fix either the inputs/outputs or the model.
errr nevermind, the audio file is not formatted correctly ... sorry
I added this to handle >1 channel in the wav:
audio, sr = torchaudio.load(reference_audio)
if sr != SAMPLE_RATE:
    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)  # noqa
    audio = resampler(audio)
if audio.shape[0] > 1:
    audio = torch.mean(audio, dim=0, keepdim=True)
I'm just getting noise in generated.wav.
Do we need to use Vocos to generate the final wav?
The whole pipeline, including the STFT, Vocos, and ISTFT, was converted into ONNX operators, so running the three models directly should work. We use ONNXRuntime==1.19.2 and it works well.
The valid shape for audio is (1, 1, audio_len). Hence, make sure the dim setting in torch.mean() keeps that valid shape as input.
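For reference, a minimal sketch of the resample/downmix/reshape steps that keeps the valid (1, 1, audio_len) shape (variable names assumed to match the script above):

import torch
import torchaudio

audio, sr = torchaudio.load(reference_audio)              # (channels, audio_len)
if sr != SAMPLE_RATE:
    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)(audio)
if audio.shape[0] > 1:
    audio = torch.mean(audio, dim=0, keepdim=True)        # downmix to mono, keeps (1, audio_len)
audio = audio.unsqueeze(0).numpy()                        # (1, 1, audio_len), as the preprocess model expects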
thanks, ill check ONNXRuntime version
Could you post the GPU one as well? @DakeQQ
@rachelbeeson Hello~ The person in charge of F5-TTS-ONNX is currently traveling on business. If you are willing to wait, we will provide you with a download link for F5-TTS-ONNX-Float16 on November 14 (+8 time zone). If you cannot wait, you might also consider exporting the model yourself in Float32 format and then quantizing it to Float16 for GPU devices.
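For anyone trying that export-then-quantize route in the meantime, here is a sketch using the onnxconverter-common package; this is my own assumption of a workable path, not the official export flow, and the file names are placeholders:

# pip install onnx onnxconverter-common
import onnx
from onnxconverter_common import float16

model_fp32 = onnx.load("F5_Transformer.onnx")                                  # hypothetical Float32 export
model_fp16 = float16.convert_float_to_float16(model_fp32, keep_io_types=True)  # keep float32 inputs/outputs
onnx.save(model_fp16, "F5_Transformer_fp16.onnx")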
No problem, thank you for the hint. I did try as you say (export and then change precision) but I found running the exported model on ONNX was slower than running the original on torch for GPU, so I guess I wanted to see if the owner's export would give me a similar result or if maybe I messed up somewhere :)
I am also keen to try the ONNX for gpu!
@rachelbeeson @OrphBean Hi~ Please click the link to get the F5-TTS-ONNX-Float16 models. Then, follow the inference script to test the F5-TTS.
When I run this with the provided script on my 4090, it takes about 48 seconds to create a 7-second output, and the output is of much lower quality than the original weights. Is this the expected behavior?
Unfortunately, that's not the case. If you're using an RTX-4090, please try the following script. In the console window, press Ctrl+F and search for the keyword 'number' to check how many nodes are placed on the TensorRT or CUDA provider. If most nodes are placed on the GPU but you're still experiencing poor performance, it may be an issue with the ONNX Runtime.
import re
import sys
import time
import jieba
import numpy as np
import onnxruntime
import torch
import torchaudio
from pypinyin import lazy_pinyin, Style
F5_project_path = "/home/dake/Downloads/F5-TTS-main" # The F5-TTS Github project download path. URL: https://github.com/SWivid/F5-TTS
onnx_model_A = "/home/dake/Downloads/F5_Preprocess.onnx" # The exported onnx model path.
onnx_model_B = "/home/dake/Downloads/F5_Transformer.onnx" # The exported onnx model path.
onnx_model_C = "/home/dake/Downloads/F5_Decode.onnx" # The exported onnx model path.
reference_audio = "/home/dake/Downloads/F5-TTS-main/src/f5_tts/infer/examples/basic/basic_ref_en.wav" # The reference audio path.
generated_audio = "/home/dake/Downloads/F5-TTS-main/src/f5_tts/infer/examples/basic/generated.wav" # The generated audio path.
ref_text = "Some call me nature, others call me mother nature." # The ASR result of reference audio.
gen_text = "I am a super hero." # The target TTS.
providers = [
    ('TensorrtExecutionProvider', {
        'device_id': 0,                                    # Select which GPU to execute on
        'trt_max_workspace_size': 4 * 1024 * 1024 * 1024,  # GPU memory usage limit; default is 4 GB
        'trt_fp16_enable': True,                           # Enable FP16 precision for faster inference
    }),
    ('CUDAExecutionProvider', {
        'device_id': 0,
        'arena_extend_strategy': 'kNextPowerOfTwo',
        'gpu_mem_limit': 4 * 1024 * 1024 * 1024,
        'cudnn_conv_algo_search': 'EXHAUSTIVE',
        'do_copy_in_default_stream': True,
    }),
    'CPUExecutionProvider'
]
HOP_LENGTH = 256 # Number of samples between successive frames in the STFT
SAMPLE_RATE = 24000 # The generated audio sample rate
RANDOM_SEED = 9527 # Set seed to reproduce the generated audio
NFE_STEP = 32 # F5-TTS model setting
SPEED = 1.0 # Set for talking speed. Only works with dynamic_axes=True
with open(f"{F5_project_path}/data/Emilia_ZH_EN_pinyin/vocab.txt", "r", encoding="utf-8") as f:
vocab_char_map = {}
for i, char in enumerate(f):
vocab_char_map[char[:-1]] = i
vocab_size = len(vocab_char_map)
def is_chinese_char(c):
cp = ord(c)
return (
0x4E00 <= cp <= 0x9FFF or # CJK Unified Ideographs
0x3400 <= cp <= 0x4DBF or # CJK Unified Ideographs Extension A
0x20000 <= cp <= 0x2A6DF or # CJK Unified Ideographs Extension B
0x2A700 <= cp <= 0x2B73F or # CJK Unified Ideographs Extension C
0x2B740 <= cp <= 0x2B81F or # CJK Unified Ideographs Extension D
0x2B820 <= cp <= 0x2CEAF or # CJK Unified Ideographs Extension E
0xF900 <= cp <= 0xFAFF or # CJK Compatibility Ideographs
0x2F800 <= cp <= 0x2FA1F # CJK Compatibility Ideographs Supplement
)
def convert_char_to_pinyin(text_list, polyphone=True):
final_text_list = []
merged_trans = str.maketrans({
'“': '"', '”': '"', '‘': "'", '’': "'",
';': ','
})
chinese_punctuations = set("。,、;:?!《》【】—…")
for text in text_list:
char_list = []
text = text.translate(merged_trans)
for seg in jieba.cut(text):
if seg.isascii():
if char_list and len(seg) > 1 and char_list[-1] not in " :'\"":
char_list.append(" ")
char_list.extend(seg)
elif polyphone and all(is_chinese_char(c) for c in seg):
pinyin_list = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
for c in pinyin_list:
if c not in chinese_punctuations:
char_list.append(" ")
char_list.append(c)
else:
for c in seg:
if c.isascii():
char_list.append(c)
elif c in chinese_punctuations:
char_list.append(c)
else:
char_list.append(" ")
pinyin = lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True)
char_list.extend(pinyin)
final_text_list.append(char_list)
return final_text_list
def list_str_to_idx(
text: list[str] | list[list[str]],
vocab_char_map: dict[str, int], # {char: idx}
padding_value=-1
):
get_idx = vocab_char_map.get
list_idx_tensors = [torch.tensor([get_idx(c, 0) for c in t], dtype=torch.int32) for t in text]
text = torch.nn.utils.rnn.pad_sequence(list_idx_tensors, padding_value=padding_value, batch_first=True)
return text
# ONNX Runtime settings
onnxruntime.set_seed(RANDOM_SEED)
session_opts = onnxruntime.SessionOptions()
session_opts.log_severity_level = 0         # Verbose level so the node-placement messages are printed; adjustable.
session_opts.inter_op_num_threads = 0       # Number of threads for running independent nodes in parallel. Set 0 for auto.
session_opts.intra_op_num_threads = 0       # Number of threads for running the operators within a node. Set 0 for auto.
session_opts.enable_cpu_mem_arena = True    # True for execution speed; False for lower memory usage.
session_opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
session_opts.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
session_opts.add_session_config_entry("session.intra_op.allow_spinning", "1")
session_opts.add_session_config_entry("session.inter_op.allow_spinning", "1")
ort_session_A = onnxruntime.InferenceSession(onnx_model_A, sess_options=session_opts, providers=['CPUExecutionProvider'])
model_type = ort_session_A._inputs_meta[0].type
in_name_A = ort_session_A.get_inputs()
out_name_A = ort_session_A.get_outputs()
in_name_A0 = in_name_A[0].name
in_name_A1 = in_name_A[1].name
in_name_A2 = in_name_A[2].name
out_name_A0 = out_name_A[0].name
out_name_A1 = out_name_A[1].name
out_name_A2 = out_name_A[2].name
out_name_A3 = out_name_A[3].name
out_name_A4 = out_name_A[4].name
out_name_A5 = out_name_A[5].name
out_name_A6 = out_name_A[6].name
ort_session_B = onnxruntime.InferenceSession(onnx_model_B, sess_options=session_opts, providers=providers)
in_name_B = ort_session_B.get_inputs()
out_name_B = ort_session_B.get_outputs()
in_name_B0 = in_name_B[0].name
in_name_B1 = in_name_B[1].name
in_name_B2 = in_name_B[2].name
in_name_B3 = in_name_B[3].name
in_name_B4 = in_name_B[4].name
in_name_B5 = in_name_B[5].name
in_name_B6 = in_name_B[6].name
out_name_B0 = out_name_B[0].name
ort_session_C = onnxruntime.InferenceSession(onnx_model_C, sess_options=session_opts, providers=['CPUExecutionProvider'])
in_name_C = ort_session_C.get_inputs()
out_name_C = ort_session_C.get_outputs()
in_name_C0 = in_name_C[0].name
in_name_C1 = in_name_C[1].name
out_name_C0 = out_name_C[0].name
# Run F5-TTS by ONNX Runtime
audio, sr = torchaudio.load(reference_audio)
if sr != SAMPLE_RATE:
    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
    audio = resampler(audio)
audio = audio.unsqueeze(0).numpy()
if "float16" in model_type:
    audio = audio.astype(np.float16)
zh_pause_punc = r"。,、;:?!"
ref_text_len = len(ref_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, ref_text))
gen_text_len = len(gen_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, gen_text))
ref_audio_len = audio.shape[-1] // HOP_LENGTH + 1
max_duration = np.array(ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / SPEED), dtype=np.int64)
gen_text = convert_char_to_pinyin([ref_text + gen_text])
text_ids = list_str_to_idx(gen_text, vocab_char_map).numpy()
time_step = np.array(0, dtype=np.int32)
print("\n\nRun F5-TTS by ONNX Runtime.")
start_count = time.time()
noise, rope_cos, rope_sin, cat_mel_text, cat_mel_text_drop, qk_rotated_empty, ref_signal_len = ort_session_A.run(
    [out_name_A0, out_name_A1, out_name_A2, out_name_A3, out_name_A4, out_name_A5, out_name_A6],
    {
        in_name_A0: audio,
        in_name_A1: text_ids,
        in_name_A2: max_duration
    })
while time_step < NFE_STEP:
    print(f"NFE_STEP: {time_step}")
    noise = ort_session_B.run(
        [out_name_B0],
        {
            in_name_B0: noise,
            in_name_B1: rope_cos,
            in_name_B2: rope_sin,
            in_name_B3: cat_mel_text,
            in_name_B4: cat_mel_text_drop,
            in_name_B5: qk_rotated_empty,
            in_name_B6: time_step
        })[0]
    time_step += 1
generated_signal = ort_session_C.run(
    [out_name_C0],
    {
        in_name_C0: noise,
        in_name_C1: ref_signal_len
    })[0]
end_count = time.time()
# Save to audio
audio_tensor = torch.tensor(generated_signal, dtype=torch.float32).squeeze(0)
torchaudio.save(generated_audio, audio_tensor, SAMPLE_RATE)
print(f"\nAudio generation is complete.\n\nONNXRuntime Time Cost in Seconds:\n{end_count - start_count:.3f}")
Thanks for your reply!
There is no 'number' in my console output when running the above script.
Microsoft Windows [Version 10.0.26100.2314] (c) Microsoft Corporation. All rights reserved.
I:\F5_4\F5-TTS\venv\Scripts>activate
(venv) I:\F5_4\F5-TTS\venv\Scripts>cd I:\F5_4\F5-TTS\src\f5_tts\infer
(venv) I:\F5_4\F5-TTS\src\f5_tts\infer>python infer_gradio_onnx2.py
(venv) I:\F5_4\F5-TTS\src\f5_tts\infer>python infer_gradio_onnx2.py
2024-11-16 00:19:27.4830676 [I:onnxruntime:, inference_session.cc:583 onnxruntime::InferenceSession::TraceSessionOptions] Session Options { execution_mode:0 execution_order:DEFAULT enable_profiling:0 optimized_model_filepath: enable_mem_pattern:1 enable_mem_reuse:1 enable_cpu_mem_arena:1 profile_file_prefix:onnxruntimeprofile session_logid: session_log_severity_level:0 session_log_verbosity_level:0 max_num_graph_transformation_steps:10 graph_optimization_level:3 intra_op_param:OrtThreadPoolParams { thread_pool_size: 0 auto_set_affinity: 0 allow_spinning: 1 dynamic_blockbase: 0 stack_size: 0 affinity_str: set_denormal_as_zero: 0 } inter_op_param:OrtThreadPoolParams { thread_pool_size: 0 auto_set_affinity: 0 allow_spinning: 1 dynamic_blockbase: 0 stack_size: 0 affinity_str: set_denormal_as_zero: 0 } use_per_session_threads:1 thread_pool_allow_spinning:1 use_deterministic_compute:0 config_options: { session.intra_op.allow_spinning: 1 session.inter_op.allow_spinning: 1 } }
2024-11-16 00:19:27.5004520 [I:onnxruntime:, inference_session.cc:483 onnxruntime::InferenceSession::ConstructorCommon::
Run F5-TTS by ONNX Runtime. NFE_STEP: 0 NFE_STEP: 1 NFE_STEP: 2 NFE_STEP: 3 NFE_STEP: 4 NFE_STEP: 5 NFE_STEP: 6 NFE_STEP: 7 NFE_STEP: 8 NFE_STEP: 9 NFE_STEP: 10 NFE_STEP: 11 NFE_STEP: 12 NFE_STEP: 13 NFE_STEP: 14 NFE_STEP: 15 NFE_STEP: 16 NFE_STEP: 17 NFE_STEP: 18 NFE_STEP: 19 NFE_STEP: 20 NFE_STEP: 21 NFE_STEP: 22 NFE_STEP: 23 NFE_STEP: 24 NFE_STEP: 25 NFE_STEP: 26 NFE_STEP: 27 NFE_STEP: 28 NFE_STEP: 29 NFE_STEP: 30 NFE_STEP: 31
Audio generation is complete.
ONNXRuntime Time Cost in Seconds: 48.918
The key message is as follows:
2024-11-16 00:19:29.0229437 [V:onnxruntime:, session_state.cc:1151 onnxruntime::VerifyEachNodeIsAssignedToAnEp] All nodes placed on [CPUExecutionProvider]. Number of nodes: 4312
It shows that all F5_Transformer.onnx nodes are running on the CPU, and the performance of '48.918 seconds' seems normal for the CPU provider. Ensure that the latest onnxruntime-gpu==1.20.0 is installed correctly and has been verified to work; any remaining issues are unclear to me as well.
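A quick sanity check (a minimal sketch; run it in the same venv) to confirm that the GPU build of ONNX Runtime is the one being imported and that the CUDA/TensorRT providers are visible:

import onnxruntime

print(onnxruntime.__version__)                # expect 1.20.0 from the onnxruntime-gpu package
print(onnxruntime.get_available_providers())  # expect CUDAExecutionProvider / TensorrtExecutionProvider in the list
# After creating the transformer session, you can also confirm which providers it actually uses:
# print(ort_session_B.get_providers())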
Thanks again. I now have the correct onnxruntime-gpu 1.20.0; I deleted my previous install and then verified that I did in fact have the correct one.
It gives me a log flood that I exit after a minute or two.
I have double-checked the onnxruntime version and made sure I have the script exactly as you pasted it; I only changed the paths.
Any advice would be great. Thank you.
Set the log level to 3 to hide most messages.
session_opts.log_severity_level = 3
Regarding issues with running models using ONNXRuntime and NVIDIA GPU providers, professional assistance can be provided by @Bigfishering.
Hello @Bigfishering, we would like to know how to run the F5 model with ONNX Runtime GPU using the TensorrtExecutionProvider and CUDAExecutionProvider. Could you share a Python example? Thanks!
Would it be possible to share the onnx exports?