Fix the issue where the Metaspace pre-tokenizer applies its pre-tokenization to every substring of a split input.
use regex::Regex;
use tokenizers::pre_tokenizers::metaspace::{Metaspace, PrependScheme};
use tokenizers::{PreTokenizedString, PreTokenizer, SplitDelimiterBehavior};
// Reproduction: split the input around `<s>` first, then run the Metaspace
// pre-tokenizer over the resulting pieces.
let pretok = Metaspace::new_with_prepend_scheme('▁', true, PrependScheme::Always);
let mut pretokenized = PreTokenizedString::from("Hey my friend <s>how▁are you");
let re_ref = Regex::new(r"(<s>)").unwrap();
// Isolate every `<s>` match into its own split before pre-tokenizing.
pretokenized
    .split(|_, sequence| sequence.split(&re_ref, SplitDelimiterBehavior::Isolated))
    .expect("AddedVocabulary bad split");
pretok.pre_tokenize(&mut pretokenized).unwrap();
Fix the issue where the Metaspace pre-tokenizer applies its pre-tokenization to every substring of a split input.
with legacy:
without legacy: