brailcom / speechd

Common high-level interface to speech synthesis
GNU General Public License v2.0
212 stars 62 forks source link

Add support for Mimic #19

Open LuccoJ opened 6 years ago

LuccoJ commented 6 years ago

Mimic is a compact speech synthesis engine developed as part of the Mycroft project, based on Flite, and largely compatible with it from the command line interface.

An issue already exists at Mycroft to add speech-dispatcher support for Mimic, but it doesn't seem to have been worked on. I thought it would be at least beneficial to raise it here for mutual awareness.

alexarnaud commented 6 years ago

Le 24/03/2018 à 18:50, Lorenzo J. Lucchini a écrit :

Mimic https://mycroft.ai/documentation/mimic/ is a compact speech synthesis engine developed as part of the Mycroft project, based on Flite, and largely compatible with it from the command line interface.

Are you aware if the C API is the same? What changes have been made to Mimic from Flite? If it's just a matter of naming, we could imagine copying the flite module and just renaming things.

Best regards, Alex.

LuccoJ commented 6 years ago

@alexarnaud I don't know, but I installed the headers for both Mimic and Flite on my system, and I see these differences:

$ diff <(rpm -ql flite-devel | grep "\.h$" | sed "s/flite/ttsmimic/" | sort) <(rpm -ql mimic-devel | grep "\.h$" | sort)
0a1
> /usr/include/ttsmimic/config.h
4a6
> /usr/include/ttsmimic/cst_cg.h
9a12
> /usr/include/ttsmimic/cst_ffeatures.h
11a15
> /usr/include/ttsmimic/cst_icu.h
16d19
< /usr/include/ttsmimic/cst_math.h
37,38c40,51
< /usr/include/ttsmimic/flite.h
< /usr/include/ttsmimic/flite_version.h
---
> /usr/include/ttsmimic/cst_wchar.h
> /usr/include/ttsmimic/lang/cmu_grapheme_lang.h
> /usr/include/ttsmimic/lang/cmu_grapheme_lex.h
> /usr/include/ttsmimic/lang/cmu_indic_lang.h
> /usr/include/ttsmimic/lang/cmu_indic_lex.h
> /usr/include/ttsmimic/lang/cmu_lex.h
> /usr/include/ttsmimic/lang/usenglish.h
> /usr/include/ttsmimic/lang/us_f0.h
> /usr/include/ttsmimic/lang/us_ffeatures.h
> /usr/include/ttsmimic/lang/us_regexes.h
> /usr/include/ttsmimic/lang/us_text.h
> /usr/include/ttsmimic/mimic.h

It looks like the vast majority of the files exist in both packages.

As to the content, /usr/include/ttsmimic/mimic.h and /usr/include/flite/flite.h look similar (though Mimic has more functions), but most functions have been renamed from flite_* to mimic_*, and a "float *dur" argument has been added to the standard Flite functions:

$ diff /usr/include/ttsmimic/mimic.h /usr/include/flite/flite.h 
40,41c40,41
< #ifndef _MIMIC_H__
< #define _MIMIC_H__
---
> #ifndef _FLITE_H__
> #define _FLITE_H__
45c45
< #endif                          /* __cplusplus */
---
> #endif /* __cplusplus */
47d46
< #include "config.h"
69,71d67
<     extern cst_val *mimic_voice_list;
<     extern cst_lang mimic_lang_list[20];
< 
73,74c69,83
<     int mimic_init();
<     int mimic_exit();
---
> int flite_init();
> 
> cst_wave *flite_text_to_wave(const char *text,cst_voice *voice);
> float flite_file_to_speech(const char *filename, 
>              cst_voice *voice,
>              const char *outtype);
> float flite_text_to_speech(const char *text, 
>              cst_voice *voice,
>              const char *outtype);
> float flite_phones_to_speech(const char *text, 
>                cst_voice *voice,
>                const char *outtype);
> float flite_tokens_to_speech(cst_utterance *u,
>                cst_voice *voice,
>                const char *outtype);
76,138c85,86
< /* General top level functions */
<     cst_voice *mimic_voice_select(const char *name);
<     cst_voice *mimic_voice_load(const char *voice_filename);
<     int mimic_voice_dump(cst_voice *voice, const char *voice_filename);
<     int mimic_file_to_speech(const char *filename, cst_voice *voice,
<                              const char *outtype, float *dur);
<     int mimic_text_to_speech(const char *text, cst_voice *voice,
<                              const char *outtype, float *dur);
<     int mimic_phones_to_speech(const char *text, cst_voice *voice,
<                                const char *outtype, float *dur);
<     int mimic_ssml_file_to_speech(const char *filename, cst_voice *voice,
<                                     const char *outtype, float *dur);
<     int mimic_ssml_text_to_speech(const char *text, cst_voice *voice,
<                                     const char *outtype, float *dur);
<     int mimic_voice_add_lex_addenda(cst_voice *v, const cst_string *lexfile);
< 
< /* Lower lever user functions */
<     cst_wave *mimic_text_to_wave(const char *text, cst_voice *voice);
<     cst_utterance *mimic_synth_text(const char *text, cst_voice *voice);
<     cst_utterance *mimic_synth_phones(const char *phones, cst_voice *voice);
< 
<     int mimic_ts_to_speech(cst_tokenstream *ts, cst_voice *voice,
<                            const char *outtype, float *dur);
<     cst_utterance *mimic_do_synth(cst_utterance *u,
<                                   cst_voice *voice, cst_uttfunc synth);
<     int mimic_process_output(cst_utterance *u,
<                              const char *outtype, int append, float *dur);
< 
< /* for voices with external voxdata */
<     int mimic_mmap_clunit_voxdata(const char *voxdir, cst_voice *voice);
<     int mimic_munmap_clunit_voxdata(cst_voice *voice);
< 
< /* mimic public export wrappers for features access */
<     int mimic_get_param_int(const cst_features *f, const char *name, int def);
<     float mimic_get_param_float(const cst_features *f, const char *name,
<                                 float def);
<     const char *mimic_get_param_string(const cst_features *f,
<                                        const char *name, const char *def);
<     const cst_val *mimic_get_param_val(const cst_features *f,
<                                        const char *name, cst_val *def);
<     void mimic_feat_set_int(cst_features *f, const char *name, int v);
<     void mimic_feat_set_float(cst_features *f, const char *name, float v);
<     void mimic_feat_set_string(cst_features *f, const char *name,
<                                const char *v);
<     void mimic_feat_set(cst_features *f, const char *name, const cst_val *v);
<     int mimic_feat_remove(cst_features *f, const char *name);
< 
<     const char *mimic_ffeature_string(const cst_item *item,
<                                       const char *featpath);
<     int mimic_ffeature_int(const cst_item *item, const char *featpath);
<     float mimic_ffeature_float(const cst_item *item, const char *featpath);
<     const cst_val *mimic_ffeature(const cst_item *item, const char *featpath);
<     cst_item *mimic_path_to_item(const cst_item *item, const char *featpath);
< 
< /* These functions are *not* thread-safe, they are designed to be called */
< /* before the initial synthesis occurs */
<     int mimic_add_voice(cst_voice *voice);
<     int mimic_add_lang(const char *langname,
<                        void (*lang_init) (cst_voice *vox),
<                        cst_lexicon *(*lex_init) ());
< /* These are init functions for generic grapheme based voices */
<     void utf8_grapheme_lang_init(cst_voice *v);
<     cst_lexicon *utf8_grapheme_lex_init(void);
---
> cst_utterance *flite_synth_text(const char *text,cst_voice *voice);
> cst_utterance *flite_synth_phones(const char *phones,cst_voice *voice);
141,142c89,92
< }                               /* extern "C" */
< #endif                          /* __cplusplus */
---
> }; /* extern "C" */
> #endif /* __cplusplus */
> 
>