Literal tengwa {} not recognized when using $ for end of regex

b-runo commented 2 years ago

In my Brazilian Portuguese mode, I have the following regexes in preprocess:

"/an$/": "ã",
"/en$/": "e͂",
"/in$/": "i͂",
"/on$/": "õ",
"/un$/": "u͂",

The idea is simple: whenever a word ends in a vowel followed by N (e.g. AN), that ending must be replaced with the corresponding nasalized vowel (e.g. Ã). When I enter the string brun, it is correctly replaced by bru͂, because the N is at the end of the string. But when I enter brun{right-curl}, the preprocess step does not recognize {right-curl} as part of the word, so $ still matches right after the N, and bru͂o is written in tengwar instead of bruno. Can you help me with this, Feanor's apprentice?

arnog commented 2 years ago

I have changed the Tecendil transcription engine to now consider substrings that include tengwar literal as being part of the word being processed for the purpose of preprocessing. This should result in the desired behavior. The tecendil.com website has been updated with that change.

b-runo commented 2 years ago

Sorry, but this breaks other rules and may affect other languages too (example: the entry brun, replaced by bru͂ in preprocess):

The map is no longer working for special characters: "_nzl_": "[tilde-above]{}", // I used this _hack_ for nasalized sounds before the change, but it has now stopped working

"ã": "[triple-dot-above][tilde-above]{}",
"e͂": "[acute][tilde-above]{}",
"i͂": "[dot-above][tilde-above]{}",
"õ": "[right-curl][tilde-above]{}",
"u͂": "[left-curl][tilde-above]{}",

This happens because some of the nasal sounds in our language do not exist in your character set (ã and õ work). If you make this ("_nzl_": "[tilde-above]{}") work again, I think that will resolve it. (I'm a programmer too, so you can speak more technically with me.) Look at what happens when I use the character ã (the entry bran, replaced by brã, and bran{right-curl}):

That works great, so if the special characters in the map go back to working as before, I think everything will work. Sorry for the inconvenience.

arnog commented 2 years ago

Could you share your mode file? I'm not sure I understand what you are saying.

b-runo commented 2 years ago

I'm using code like this in preprocess for nasal sounds:

"/([aeiouà-ú])nt/": "$1_nzl_t",
"/([aeiouà-ú])mp/": "$1_nzl_p",
"/([aeiouà-ú])nc/": "$1_nzl_c",
"/([aeiouà-ú])nch/": "$1_nzl_ch",
"/([aeiouà-ú])ns/": "$1_nzl_s",
"/([aeiouà-ú])nx/": "$1_nzl_x",
"/([aeiouà-ú])nd/": "$1_nzl_d",
"/([aeiouà-ú])mb/": "$1_nzl_b",
"/([aeiouà-ú])ng/": "$1_nzl_g",
"/([aeiouà-ú])m$/": "$1_nzl_",

and in map I replace the _nzl_ placeholder: "_nzl_": "[tilde-above]{}",

This was working very well for all vowels before the update, except when I used a literal tengwa as I described in the first post. Now the literal tengwa works, but the nasalized vowels (and the _nzl_ replacement in map) have stopped working.

b-runo commented 2 years ago

Here is my mode code:

{
  // Short name of the mode (Portuguese for "Phonemic Portuguese")
  "name": "Português Fonêmico",
  //"languageCode": "por", // ISO 639-3 code for Portuguese (disabled alternative)
  // NOTE(review): "pt-br" is a language+region tag (BCP 47 style, usually written "pt-BR"),
  // not a bare ISO 639-1 code (which would be "pt") — confirm which form the engine expects.
  "languageCode": "pt-br", // language + region tag

  // NOTE(review): the meaning of "rrule" is not visible in this file — confirm against the engine docs.
  "rrule": false,

  // Text rewrites applied to the input before the "map" lookup.
  // Keys wrapped in /.../ are regular expressions; other keys are literal substrings.
  // Values are the replacement text ($1, $2 refer to regex capture groups).
  // NOTE(review): several rules depend on earlier ones having run (e.g. "sc" -> "c" before
  // "ce" -> "sse") — this presumes the engine applies entries in declaration order; confirm.
  "preprocess": {

    // Disambiguate pronunciation: "sç"/"ç" -> "ss". E.g.: cresça, calça
    "sç": "ss",
    "ç": "ss",

    // Final vowel reduction: word-final "e" -> "i" and "o" -> "u" when preceded by a non-vowel
    "/([^aeiouà-ú])e$/": "$1i",
    "/([^aeiouà-ú])o$/": "$1u",
    // "l" after a vowel becomes "u" when followed by a character that is neither "h" nor a vowel,
    // or when it is at the end of the word
    "/([aeiouà-ú])l([^haeiouà-ú])/": "$1u$2",
    "/([aeiouà-ú])l$/": "$1u",

    // Nasal consonants "m/n" -------------------------------------------- The _nzl_ symbol is replaced in MAP by a tilde to mark the nasal sound
    // NOTE(review): per the maintainer's reply in this thread, "_" is not part of the default
    // word character set, so the _nzl_ placeholder stopped working after the engine change;
    // the suggested alternatives are "nzl", a single in-range character such as ŋ (U+014B),
    // or a custom "wordPattern".
    /*
    Words to check:
    banana
    tambem +++++++++
    cantam +++++++++
    album
    */
    // Vowel + nasal consonant + following consonant: the nasal consonant becomes _nzl_
    "/([aeiouà-ú])nt/": "$1_nzl_t",
    "/([aeiouà-ú])mp/": "$1_nzl_p",
    "/([aeiouà-ú])nc/": "$1_nzl_c",
    "/([aeiouà-ú])nch/": "$1_nzl_ch",
    "/([aeiouà-ú])ns/": "$1_nzl_s",
    "/([aeiouà-ú])nx/": "$1_nzl_x",
    "/([aeiouà-ú])nd/": "$1_nzl_d",
    "/([aeiouà-ú])mb/": "$1_nzl_b",
    "/([aeiouà-ú])ng/": "$1_nzl_g",
    // Vowel + word-final "m": the "m" becomes _nzl_
    "/([aeiouà-ú])m$/": "$1_nzl_",
    //"/([aeiouà-ú])n(^$)/": "$1_nzl_", // Check diphthong - i when E, e.g. mantem - o when A, e.g. montam
    //"/([aeiouà-ú])n$/": "$1_nzl_", // !!!!!!!!!!!!!!!! Error (e.g. Brun[o]): the engine does not recognize the literal letter [o], so it places the vowels U and O above the R. Also affects as(z)[a]

    // Word-final vowel + "n" -> the nasalized vowel (each has its own entry in "map")
    "/an$/": "ã",
    "/en$/": "e͂",
    "/in$/": "i͂",
    "/on$/": "õ",
    "/un$/": "u͂",

    // Disambiguate pronunciation: "sc" before e/i -> "c" (which the "ce"/"ci" rules below turn into "ss"). E.g.: crescer, crescido
    "/sc([ei])/": "c$1",

    // Disambiguate pronunciation: "ce" -> "sse", "ci" -> "ssi" (also with acute accent)
    "ce": "sse",
    "ci": "ssi",
    "cé": "ssé",
    "cí": "ssí",

    // Disambiguate pronunciation: "s" -> "z". S sounds like Z between two vowels. E.g.: crise, aviso, empresa, raposa, tesouro
    "/([aeiouà-ú])s([aeiouà-ú])/": "$1z$2",

    // Disambiguate pronunciation: "ch" -> "x". E.g.: chícara, choque, cheiro, churrasco
    "ch": "x",

    // Disambiguate pronunciation: "ge" -> "je", "gi" -> "ji". E.g.: gênio, girafa, gíria
    "/g([eéèêëiíìEÉÈÊËIÍÌ])/": "j$1",
    /*
    Earlier literal form of the rule above (disabled):
    "ge": "je",
    "gi": "ji",
    "gé": "jé",
    "gí": "jí",
    */

    // Disambiguate pronunciation: "gue" -> "ge", "gui" -> "gi"
    "gue": "ge",
    "gui": "gi",
    "gué": "gé",
    "guí": "gí",

    // Disambiguate pronunciation: "que" -> "ke", "qui" -> "ki". E.g.: queijo, quiabo, quero, quinto
    "que": "ke",
    "qui": "ki",
    "qué": "ké",
    "quí": "kí",

    // Retain "ü" in "qüe", "qüi", "güe", "güi" as "u"
    // (the "gü" variant is disabled; "gü" has its own entry in "map")
    //"/gü([eéèêëiíìEÉÈÊËIÍÌ])/": "gu$1",
    "/qü([eéèêëiíìEÉÈÊËIÍÌ])/": "qu$1",

    // ****** Disambiguate pronunciation of "r": when it is simple and when it is strong

    // Start of words: strong R. E.g.: rato, rua, riso
    "/^r/": "rr",

    // Between two vowels: simple R. E.g.: moradia, era, tira
    // (the replacement equals the match, so this rule leaves the text unchanged)
    "/([aeiouà-ú])r([aeiouà-ú])/": "$1r$2",

    // End of words: simple R. E.g.: mar, comer, abrir
    // NOTE(review): the pattern actually matches "r" after any character that is not a vowel
    // or "r" (no end-of-word anchor), and the replacement equals the match, so this is a no-op.
    "/([^AaEeIiOoUuÁáÉéÍíÓóÚúRr])r/": "$1r",

    // Strong R after L and N. E.g.: enredo, enroscado (do any words with "lr" exist?)
    // NOTE(review): the original comment says "followed by a vowel", but the pattern does not check what follows the "r".
    "/([lLnN])r/": "$1rr"

  },

  // Maps input sequences (after preprocessing) to tengwar output.
  // In the values, [name] entries are tehtar (diacritic marks) and {name} entries are tengwar;
  // an empty {} appears wherever a value consists only of tehtar.
  "map": {
    // 'Hack' for nasal sounds: the _nzl_ placeholder is produced by the preprocess rules
    "_nzl_": "[tilde-above]{}",

    // ----------------------------------------------------------------------------
    //
    // VOWELS
    // y (pronounced like "i")
    "y": "[dot-above]{}",

    "a": "[triple-dot-above]{}",
    "e": "[acute]{}",
    "i": "[dot-above]{}",
    "o": "[right-curl]{}",
    "u": "[left-curl]{}",

    // Nasalized
    "ã": "[triple-dot-above][tilde-above]{}",
    "e͂": "[acute][tilde-above]{}",
    "i͂": "[dot-above][tilde-above]{}",
    "õ": "[right-curl][tilde-above]{}",
    "u͂": "[left-curl][tilde-above]{}",

    // With intonation
    "é": "[double-dot-above]{}",
    "ó": "[breve]{}",

    // Accents that do not change the pronunciation of the vowel in the MTP
    // (^)
    "â": "[triple-dot-above]{}",
    "ê": "[acute]{}",
    "ô": "[right-curl]{}",
    // (´)
    "á": "[triple-dot-above]{}",
    "í": "[dot-above]{}",
    "ú": "[left-curl]{}",
    // (`)
    "à": "[triple-dot-above]{}",
    // (¨)
    "ä": "[triple-dot-above]{}",
    "ë": "[acute]{}",
    "ö": "[right-curl]{}",
    "ü": "[left-curl]{}",

    // ----------------------------------------------------------------------------
    // CONSONANTS
    //

    "b": "{umbar}",
    "c": "{calma}",
    "d": "{ando}",
    "f": "{formen}",
    "g": "{anga}",
    "gü": "{ungwe}",
    "j": "{anca}",
    "k": "{calma}",
    "l": "{lambe}",
    "lh": "{alda}",
    "m": "{malta}",
    "n": "{nuumen}",
    "nh": "{noldo}",
    "p": "{parma}",
    "q": "{quesse}",
    "qu": "{quesse}",
    "r": "{oore}",
    "rr": "{roomen}",

    // "s" variants: "^s" is the word-initial form and "s$" the word-final form
    "s": "{silme}",
    "ss": "{silme}",
    "^s": "{silme}",
    // Fixed: this entry was missing its trailing comma, which made the file unparseable
    "s$": "[upward-hook]",

    "t": "{tinco}",
    "v": "{ampa}",
    "x": "{harma}",
    "z": "{esse-nuquerna}"

  },

  "words": {}
}

arnog commented 2 years ago

Ah, OK. Yeah, that's not going to work because _ is not considered a valid character inside a word.

You have three options:

1. Use nzl instead of _nzl_. nzl is probably an unlikely character combination in Portuguese, so this might work.
2. Instead of _nzl_, use a Unicode codepoint in the range A-Za-zÀ-ÖØ-öø-ÿĀ-ſ*, which is the set of characters that are considered part of a word. For example, use ŋ (U+014B) instead of _nzl_.
3. Change the set of characters considered part of a word by specifying a value for the wordPattern property in your mode file. The default value is A-Za-zÀ-ÖØ-öø-ÿĀ-ſ*. You could either try to add _ as a valid character in a word, or add some other Unicode codepoint of your choosing, which you would then use instead of _nzl_.

b-runo commented 2 years ago

Thaaanks master, I didn't know about the range A-Za-zÀ-ÖØ-öø-ÿĀ-ſ*; using ŋ instead of \_nzl\_ works great.

arnog / tecendil-js

Literal tengwa {} not recognized when using $ for end of regex #44