messense / jieba-rs

The Jieba Chinese Word Segmentation Implemented in Rust
MIT License
738 stars 46 forks source link

能否增加删除词条的方法 #114

Open i2534 opened 4 months ago

i2534 commented 4 months ago

由于项目需求，需要动态添加/删除词条。lib.rs 中已有 add_word，还需要 del_word。我自己先简单实现了一下，用起来还凑合：

/// Removes `word` from the dictionary.
///
/// Returns `true` if the word was found and removed, or `false` if the
/// dictionary did not contain it.
pub fn del_word(&mut self, word: &str) -> bool {
    // Guard clause: nothing to do when the word is not in the trie.
    let slot = match self.cedar.exact_match_search(word) {
        Some((word_id, _, _)) => word_id as usize,
        None => return false,
    };

    // Take the removed word's frequency out of the running total.
    let removed_freq = self.records[slot].freq;
    self.total -= removed_freq;

    // Overwrite the slot with an empty record instead of calling
    // `self.records.remove(slot)`: removing the element would shift every
    // later record, so the word ids that follow could then collide.
    self.records[slot] = Record::new(0, String::new());
    self.cedar.erase(word);
    true
}

测试代码如下

/// Checks that a word can be added, deleted, and added again, and that the
/// segmentation of the same sentence follows the dictionary contents each time.
#[test]
fn test_add_remove_word() {
    let sentence = "石墨烯是好东西";

    let mut jieba = Jieba::empty();
    jieba.add_word("东西", Some(1000), None);
    jieba.add_word("石墨烯", Some(1000), None);

    // Both words are in the dictionary, so each comes out as a single token.
    assert_eq!(
        jieba.cut(sentence, false),
        vec!["石墨烯", "是", "好", "东西"]
    );

    // After deletion the word is split into its individual characters.
    jieba.del_word("石墨烯");
    assert_eq!(
        jieba.cut(sentence, false),
        vec!["石", "墨", "烯", "是", "好", "东西"]
    );

    // Adding the word back restores the original segmentation.
    jieba.add_word("石墨烯", Some(1000), None);
    assert_eq!(
        jieba.cut(sentence, false),
        vec!["石墨烯", "是", "好", "东西"]
    );
}