palanceli / libgooglepinyin_old
1 stars 2 forks source link

词库生成 #2

Open palanceli opened 5 years ago

palanceli commented 5 years ago


palanceli commented 5 years ago




  1. libgooglepinyin/data/rawdict_utf16_65105_freq.txt包含了65105行数据:
    鼥 0.750684002197 1 ba
    釛 0.781224156844 1 ba
    軷 0.9691786136 1 ba
    釟 0.9691786136 1 ba
    蚆 1.15534975655 1 ba
    弝 1.52927012045 1 ba
  2. libgooglepinyin/data/valid_utf16.txt包含了16466个汉字:



// 加载数据,在内存中生成数据结构
> dictbuilder/pinyinime_dictbuilder.cpp:54 dict_trie->build_dict(...)
 > googlepinyin-static/dicttrie.cpp:113 dict_builder->build_dict(...)
  // 🏁1 将rawdict_utf16_65105_freq.txt中的内容读入DictBuilder::lemma_arr_
  > googlepinyin-static/dictbuilder.cpp:513 read_raw_dict(...) 
  // 将raw_spellings_按照字母排序,并拷贝到SpellingTable::spelling_buf_
  > googlepinyin-static/dictbuilder.cpp:524 spl_table_->arrange(...)
  > 🏁2 googlepinyin-static/dictbuilder.cpp:530 spl_trie.construct(...)
// 将内存中的词库写入文件
> dictbuilder/pinyinime_dictbuilder.cpp:67 dict_trie->save_dict(...)
palanceli commented 5 years ago

🏁1 将rawdict_utf16_65105_freq.txt中的内容读入`DictBuilder::lemma_arr_`


// googlepinyin-static/dictdef.h:150
struct LemmaEntry {
  LemmaIdType idx_by_py;
  LemmaIdType idx_by_hz;
  char16 hanzi_str[kMaxLemmaSize + 1];                  // 中文词串

  // The SingleCharItem id for each Hanzi.
  uint16 hanzi_scis_ids[kMaxLemmaSize];

  uint16 spl_idx_arr[kMaxLemmaSize + 1];
  char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1];   // [8][6+1] 拼音串,以char存储(由utf16转换而来)
  unsigned char hz_str_len;                             // 中文词串长度
  float freq;                                           // 词频



// googlepinyin-static/dictbuilder.cpp:513
size_t DictBuilder::read_raw_dict(const char* fn_raw,
                                  const char *fn_validhzs,
                                  size_t max_item) {
  ..., kReadBufLen * 10)                       // 打开rawdict_utf16_65105_freq.txt

  valid_hzs = read_valid_hanzis(fn_validhzs, &valid_hzs_num);       // 读取valid_utf16.txt 得到所有汉字

  for (size_t i = 0; i < max_item; i++) {                           // 读取rawdict所有行
    utf16_reader.readline(read_buf, kReadBufLen)

    token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);   // token1:中文词串
    size_t lemma_size = utf16_strlen(token);
    utf16_strcpy(lemma_arr_[i].hanzi_str, token);

    lemma_arr_[i].hz_str_len = token_size;                          // 中文词串长度

    token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);   // token2:词频
    lemma_arr_[i].freq = utf16_atof(token);

    token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);   // token3: GBK标志
    // 如果valid_utf16.txt没有内容,则只读取GBK=1的行,其余的抛弃
    // 如果valid_utf16.txt有内容,则只读取在该文件范围内的中文词串,其余的抛弃

    bool spelling_not_support = false;
    for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len; 
         hz_pos++) {                                                // token4: 拼音串
      token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
      utf16_strcpy_tochar(lemma_arr_[i].pinyin_str[hz_pos], token);
      // 所有拼音都转成大写,除了ZH/CH/SH转为Zh/Ch/Sh

      if (!spl_table_->put_spelling(lemma_arr_[i].pinyin_str[hz_pos],
                                    lemma_arr_[i].freq)) {
        spelling_not_support = true;


// googlepinyin-static/spellingtable.cpp:138
bool SpellingTable::put_spelling(const char* spelling_str, double freq) {

  total_freq_ += freq;                                      // 总词频累加

  size_t hash_pos = get_hash_pos(spelling_str);

  raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';

  if (strncmp(raw_spellings_[hash_pos].str, spelling_str,
              spelling_size_ - 1) == 0) {
    raw_spellings_[hash_pos].freq += freq;                  // 同音节词频累加
    return true;

  // 以下解决碰撞问题,基本策略还是让同音节词频累加


鼥 0.750684002197 1 ba
釛 0.781224156844 1 ba
軷 0.9691786136 1 ba


// googlepinyin-static/dictdef.h:150
struct LemmaEntry {
  LemmaIdType idx_by_py;
  LemmaIdType idx_by_hz;
  char16 hanzi_str[kMaxLemmaSize + 1];                  // 中文词串

  // The SingleCharItem id for each Hanzi.
  uint16 hanzi_scis_ids[kMaxLemmaSize];

  uint16 spl_idx_arr[kMaxLemmaSize + 1];
  char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1];   // [8][6+1] 拼音串,以char存储(由utf16转换而来)
  unsigned char hz_str_len;                             // 中文词串长度
  float freq;                                           // 词频

组织后的结果为: LemmaEntry


// googlepinyin-static/spellingtable.h:29
// One raw spelling record: a spelling string plus its accumulated frequency.
// SpellingTable::put_spelling looks a record up by hash position, compares
// `str` against the incoming spelling and adds the incoming frequency to `freq`.
typedef struct {
  // Spelling text as a C string; the extra +1 slot presumably holds the
  // terminating '\0' (TODO confirm against kMaxSpellingSize's definition).
  char str[kMaxSpellingSize + 1];
  // Sum of the frequencies of every lemma that used this spelling.
  double freq;
} RawSpelling, *PRawSpelling;

组织后的结果为: spl

0312birdzhang commented 5 years ago

请教一下用户词库到底是怎么用的? 试了很多方法都是打开词库出错。 我目前的解决方法是追加到原始的词库中:

1. 用深蓝词库转换将搜狗常用词组转换为谷歌拼音格式,更改词频,保存为test.txt,转换为utf-8 无bom格式
2. awk '$2 = $2 FS "0"' test.txt > test2.txt # 插入一列,因为转换出来的少一列
3. awk '{if($2 < 5000){$2 = $2/209337 + 209337/$2}else{$2 = $2/209337}; print}' test2.txt > test3.txt # 词频降低一些,很不靠谱的方法
4. iconv -f utf-8 -t utf-16 test3.txt > test4.txt #转换为utf16
5. cat ../data/rawdict_utf16_65105_freq.txt.origin test4.txt > ../data/rawdict_utf16_65105_freq.txt #追加到后面
6. ./pinyinime_dictbuilder ../data/rawdict_utf16_65105_freq.txt ../data/valid_utf16.txt  #生成词库


read succesfully, lemma num: 138900
spelling tree construct successfully.

------------STAT INFO-------------
[root is layer -1]
.. max_sonbuf_len per layer(from layer 0):
   413, 309, 57, 7, 0, 0, 0, 0, -, 
.. max_homobuf_len per layer:
   -, 312, 29, 4, 6, 0, 0, 0, 0, 
.. total_son_num per layer:
   413, 46280, 42836, 23106, 0, 0, 0, 0, -, 
.. total_node_hasson per layer:
   1, 399, 21417, 21905, 0, 0, 0, 0, 0, 
.. total_sonbuf_num per layer:
   1, 399, 21417, 21905, 0, 0, 0, 0, -, 
.. total_sonbuf_allnoson per layer:
   0, 5, 9253, 21905, 0, 0, 0, 0, -, 
.. total_node_in_sonbuf_allnoson per layer:
   0, 18, 11779, 23106, 0, 0, 0, 0, -, 
.. total_homo_num per layer:
   0, 17033, 75293, 22880, 23694, 0, 0, 0, 0, 
.. son buf allocation number with only 1 son: 34234
.. son buf allocation number with more than 1 son: 9488
.. total lemma node number: 112636
Build dictionary successfully.
Save dictionary successfully.
palanceli commented 5 years ago

用户词库部分还没有看到,稍后看到那里才能答复你 😸

0312birdzhang commented 5 years ago


weizhizhanghao commented 4 years ago

请教一下用户词库到底是怎么用的? 试了很多方法都是打开词库出错。 我目前的解决方法是追加到原始的词库中:

1. 用深蓝词库转换将搜狗常用词组转换为谷歌拼音格式,更改词频,保存为test.txt,转换为utf-8 无bom格式
2. awk '$2 = $2 FS "0"' test.txt > test2.txt # 插入一列,因为转换出来的少一列
3. awk '{if($2 < 5000){$2 = $2/209337 + 209337/$2}else{$2 = $2/209337}; print}' test2.txt > test3.txt # 词频降低一些,很不靠谱的方法
4. iconv -f utf-8 -t utf-16 test3.txt > test4.txt #转换为utf16
5. cat ../data/rawdict_utf16_65105_freq.txt.origin test4.txt > ../data/rawdict_utf16_65105_freq.txt #追加到后面
6. ./pinyinime_dictbuilder ../data/rawdict_utf16_65105_freq.txt ../data/valid_utf16.txt  #生成词库


read succesfully, lemma num: 138900
spelling tree construct successfully.

------------STAT INFO-------------
[root is layer -1]
.. max_sonbuf_len per layer(from layer 0):
   413, 309, 57, 7, 0, 0, 0, 0, -, 
.. max_homobuf_len per layer:
   -, 312, 29, 4, 6, 0, 0, 0, 0, 
.. total_son_num per layer:
   413, 46280, 42836, 23106, 0, 0, 0, 0, -, 
.. total_node_hasson per layer:
   1, 399, 21417, 21905, 0, 0, 0, 0, 0, 
.. total_sonbuf_num per layer:
   1, 399, 21417, 21905, 0, 0, 0, 0, -, 
.. total_sonbuf_allnoson per layer:
   0, 5, 9253, 21905, 0, 0, 0, 0, -, 
.. total_node_in_sonbuf_allnoson per layer:
   0, 18, 11779, 23106, 0, 0, 0, 0, -, 
.. total_homo_num per layer:
   0, 17033, 75293, 22880, 23694, 0, 0, 0, 0, 
.. son buf allocation number with only 1 son: 34234
.. son buf allocation number with more than 1 son: 9488
.. total lemma node number: 112636
Build dictionary successfully.
Save dictionary successfully.
