heromantor / phpmorphy

GNU Lesser General Public License v2.1
72 stars 61 forks source link

Duplicate annotations #2

Open AlexeyKupershtokh opened 12 years ago

AlexeyKupershtokh commented 12 years ago

I've modified class phpMorphy_Morphier_MorphierAbstract this way:

/**
 * Looks $word up through the finder and hands the resulting annotations
 * to the helper to build the paradigm collection.
 *
 * Temporarily instrumented: before delegating to the helper, every
 * annotation is dumped on its own line (sorted, framed by "+" and "-"
 * marker lines) so that repeated annotations are easy to spot.
 *
 * @param mixed $word word passed through to the finder and the helper
 * @return mixed false when the finder returns false, otherwise whatever
 *               the helper's getParadigmCollection() returns
 */
function getParadigmCollection($word) {
    $annots = $this->finder->findWord($word);

    if(false === $annots) {
        return false;
    }

    // debug: collapse each multi-line print_r() dump into a single line
    $flatten = function ($annot) {
        return preg_replace('/\s*[\r\n]+\s*/', ' ', print_r($annot, true));
    };

    print '    +' . PHP_EOL;

    $dumps = array_map($flatten, $annots);
    // sorting puts identical dumps next to each other
    sort($dumps);

    foreach($dumps as $dump) {
        print '    ' . $dump . PHP_EOL;
    }

    print '    -' . PHP_EOL;

    return $this->helper->getParadigmCollection($word, $annots);
}

Then I use the standard AOT dictionary from SourceForge and the following morphier:

// Load the Russian dictionary and query the predict-by-database morphier.
$morphy = new phpMorphy('dicts/ru_ru_aot_withjo', 'ru_RU');
$predictMorphier = $morphy->getPredictByDatabaseMorphier();
$predictMorphier->getParadigmCollection('АБУШКАН');

And this shows me the following debug info:

+
Array ( [count] => 1 [offset] => 169664 [cplen] => 0 [plen] => 0 [flen] => 2 [common_ancode] => 667 [forms_count] => 75 [packed_forms_count] => 46 [affixes_size] => 486 [form_no] => 34 [pos_id] => 21 [freq] => 11 [base_prefix] => [base_suffix] => ТЬ ) 
Array ( [count] => 1 [offset] => 169664 [cplen] => 0 [plen] => 0 [flen] => 2 [common_ancode] => 667 [forms_count] => 75 [packed_forms_count] => 46 [affixes_size] => 486 [form_no] => 34 [pos_id] => 21 [freq] => 12 [base_prefix] => [base_suffix] => ТЬ ) 
Array ( [count] => 1 [offset] => 169664 [cplen] => 0 [plen] => 0 [flen] => 2 [common_ancode] => 667 [forms_count] => 75 [packed_forms_count] => 46 [affixes_size] => 486 [form_no] => 34 [pos_id] => 21 [freq] => 19 [base_prefix] => [base_suffix] => ТЬ ) 
Array ( [count] => 1 [offset] => 169664 [cplen] => 0 [plen] => 0 [flen] => 2 [common_ancode] => 667 [forms_count] => 75 [packed_forms_count] => 46 [affixes_size] => 486 [form_no] => 34 [pos_id] => 21 [freq] => 23 [base_prefix] => [base_suffix] => ТЬ ) 
Array ( [count] => 1 [offset] => 169664 [cplen] => 0 [plen] => 0 [flen] => 2 [common_ancode] => 667 [forms_count] => 75 [packed_forms_count] => 46 [affixes_size] => 486 [form_no] => 34 [pos_id] => 21 [freq] => 3 [base_prefix] => [base_suffix] => ТЬ ) 
Array ( [count] => 1 [offset] => 169664 [cplen] => 0 [plen] => 0 [flen] => 2 [common_ancode] => 667 [forms_count] => 75 [packed_forms_count] => 46 [affixes_size] => 486 [form_no] => 34 [pos_id] => 21 [freq] => 4 [base_prefix] => [base_suffix] => ТЬ ) 
Array ( [count] => 1 [offset] => 169664 [cplen] => 0 [plen] => 0 [flen] => 2 [common_ancode] => 667 [forms_count] => 75 [packed_forms_count] => 46 [affixes_size] => 486 [form_no] => 34 [pos_id] => 21 [freq] => 6 [base_prefix] => [base_suffix] => ТЬ ) 
Array ( [count] => 1 [offset] => 169664 [cplen] => 0 [plen] => 0 [flen] => 2 [common_ancode] => 667 [forms_count] => 75 [packed_forms_count] => 46 [affixes_size] => 486 [form_no] => 34 [pos_id] => 21 [freq] => 6 [base_prefix] => [base_suffix] => ТЬ ) 
Array ( [count] => 1 [offset] => 169664 [cplen] => 0 [plen] => 0 [flen] => 2 [common_ancode] => 667 [forms_count] => 75 [packed_forms_count] => 46 [affixes_size] => 486 [form_no] => 34 [pos_id] => 21 [freq] => 6 [base_prefix] => [base_suffix] => ТЬ ) 
Array ( [count] => 1 [offset] => 169664 [cplen] => 0 [plen] => 0 [flen] => 2 [common_ancode] => 667 [forms_count] => 75 [packed_forms_count] => 46 [affixes_size] => 486 [form_no] => 34 [pos_id] => 21 [freq] => 6 [base_prefix] => [base_suffix] => ТЬ ) 
Array ( [count] => 1 [offset] => 169664 [cplen] => 0 [plen] => 0 [flen] => 2 [common_ancode] => 667 [forms_count] => 75 [packed_forms_count] => 46 [affixes_size] => 486 [form_no] => 34 [pos_id] => 21 [freq] => 7 [base_prefix] => [base_suffix] => ТЬ ) 
Array ( [count] => 1 [offset] => 169664 [cplen] => 0 [plen] => 0 [flen] => 2 [common_ancode] => 667 [forms_count] => 75 [packed_forms_count] => 46 [affixes_size] => 486 [form_no] => 34 [pos_id] => 21 [freq] => 9 [base_prefix] => [base_suffix] => ТЬ ) 
Array ( [count] => 1 [offset] => 49600 [cplen] => 0 [plen] => 0 [flen] => 2 [common_ancode] => 669 [forms_count] => 132 [packed_forms_count] => 76 [affixes_size] => 842 [form_no] => 64 [pos_id] => 21 [freq] => 3 [base_prefix] => [base_suffix] => ТЬ ) 
Array ( [count] => 1 [offset] => 49600 [cplen] => 0 [plen] => 0 [flen] => 2 [common_ancode] => 669 [forms_count] => 132 [packed_forms_count] => 76 [affixes_size] => 842 [form_no] => 64 [pos_id] => 21 [freq] => 3 [base_prefix] => [base_suffix] => ТЬ ) 
Array ( [count] => 1 [offset] => 49600 [cplen] => 0 [plen] => 0 [flen] => 2 [common_ancode] => 669 [forms_count] => 132 [packed_forms_count] => 76 [affixes_size] => 842 [form_no] => 64 [pos_id] => 21 [freq] => 4 [base_prefix] => [base_suffix] => ТЬ ) 
Array ( [count] => 1 [offset] => 680064 [cplen] => 0 [plen] => 0 [flen] => 6 [common_ancode] => 667 [forms_count] => 83 [packed_forms_count] => 54 [affixes_size] => 752 [form_no] => 42 [pos_id] => 21 [freq] => 11 [base_prefix] => [base_suffix] => КАТЬ ) 
Array ( [count] => 1 [offset] => 6976 [cplen] => 0 [plen] => 0 [flen] => 0 [common_ancode] => 683 [forms_count] => 12 [packed_forms_count] => 10 [affixes_size] => 70 [form_no] => 0 [pos_id] => 0 [freq] => 3 [base_prefix] => [base_suffix] => ) 
Array ( [count] => 1 [offset] => 6976 [cplen] => 0 [plen] => 0 [flen] => 0 [common_ancode] => 683 [forms_count] => 12 [packed_forms_count] => 10 [affixes_size] => 70 [form_no] => 0 [pos_id] => 0 [freq] => 3 [base_prefix] => [base_suffix] => ) 
Array ( [count] => 1 [offset] => 6976 [cplen] => 0 [plen] => 0 [flen] => 0 [common_ancode] => 683 [forms_count] => 12 [packed_forms_count] => 10 [affixes_size] => 70 [form_no] => 0 [pos_id] => 0 [freq] => 5 [base_prefix] => [base_suffix] => ) 
Array ( [count] => 1 [offset] => 6976 [cplen] => 0 [plen] => 0 [flen] => 0 [common_ancode] => 687 [forms_count] => 12 [packed_forms_count] => 10 [affixes_size] => 70 [form_no] => 0 [pos_id] => 0 [freq] => 3 [base_prefix] => [base_suffix] => ) 
Array ( [count] => 1 [offset] => 7296 [cplen] => 0 [plen] => 0 [flen] => 0 [common_ancode] => 688 [forms_count] => 12 [packed_forms_count] => 9 [affixes_size] => 62 [form_no] => 0 [pos_id] => 0 [freq] => 5 [base_prefix] => [base_suffix] => ) 
Array ( [count] => 1 [offset] => 8832 [cplen] => 0 [plen] => 0 [flen] => 0 [common_ancode] => 688 [forms_count] => 36 [packed_forms_count] => 29 [affixes_size] => 338 [form_no] => 0 [pos_id] => 0 [freq] => 3 [base_prefix] => [base_suffix] => ) 
Array ( [count] => 1 [offset] => 8832 [cplen] => 0 [plen] => 0 [flen] => 0 [common_ancode] => 691 [forms_count] => 36 [packed_forms_count] => 29 [affixes_size] => 338 [form_no] => 0 [pos_id] => 0 [freq] => 3 [base_prefix] => [base_suffix] => ) 
Array ( [count] => 1 [offset] => 8832 [cplen] => 0 [plen] => 0 [flen] => 0 [common_ancode] => 691 [forms_count] => 36 [packed_forms_count] => 29 [affixes_size] => 338 [form_no] => 0 [pos_id] => 0 [freq] => 3 [base_prefix] => [base_suffix] => ) 
Array ( [count] => 1 [offset] => 8832 [cplen] => 0 [plen] => 0 [flen] => 0 [common_ancode] => 691 [forms_count] => 36 [packed_forms_count] => 29 [affixes_size] => 338 [form_no] => 0 [pos_id] => 0 [freq] => 3 [base_prefix] => [base_suffix] => ) 
Array ( [count] => 1 [offset] => 8832 [cplen] => 0 [plen] => 0 [flen] => 0 [common_ancode] => 691 [forms_count] => 36 [packed_forms_count] => 29 [affixes_size] => 338 [form_no] => 0 [pos_id] => 0 [freq] => 3 [base_prefix] => [base_suffix] => ) 
Array ( [count] => 2 [offset] => 1383424 [cplen] => 0 [plen] => 0 [flen] => 0 [common_ancode] => 663 [forms_count] => 35 [packed_forms_count] => 21 [affixes_size] => 172 [form_no] => 13 [pos_id] => 1 [freq] => 6 [base_prefix] => [base_suffix] => ЫЙ ) 
Array ( [count] => 2 [offset] => 29376 [cplen] => 0 [plen] => 0 [flen] => 0 [common_ancode] => 680 [forms_count] => 6 [packed_forms_count] => 5 [affixes_size] => 36 [form_no] => 1 [pos_id] => 0 [freq] => 5 [base_prefix] => [base_suffix] => Ы ) 
Array ( [count] => 2 [offset] => 8832 [cplen] => 0 [plen] => 0 [flen] => 0 [common_ancode] => 691 [forms_count] => 36 [packed_forms_count] => 29 [affixes_size] => 338 [form_no] => 0 [pos_id] => 0 [freq] => 6 [base_prefix] => [base_suffix] => ) 
Array ( [offset] => 169664 [cplen] => 0 [plen] => 0 [flen] => 2 [common_ancode] => 667 [forms_count] => 75 [packed_forms_count] => 46 [affixes_size] => 486 [form_no] => 34 [pos_id] => 21 [freq] => 7 [base_prefix] => [base_suffix] => ТЬ ) 
Array ( [offset] => 335264 [cplen] => 0 [plen] => 0 [flen] => 4 [common_ancode] => 667 [forms_count] => 76 [packed_forms_count] => 46 [affixes_size] => 560 [form_no] => 34 [pos_id] => 21 [freq] => 8 [base_prefix] => [base_suffix] => АТЬ ) 
Array ( [offset] => 410208 [cplen] => 0 [plen] => 0 [flen] => 6 [common_ancode] => 667 [forms_count] => 75 [packed_forms_count] => 46 [affixes_size] => 652 [form_no] => 34 [pos_id] => 21 [freq] => 9 [base_prefix] => [base_suffix] => КАТЬ ) 
-

I think there are too many duplicates. Is this behavior wrong?

AlexeyKupershtokh commented 12 years ago

For comparison, I ran phpmorphy 0.3.7 with the same debug code (placed in the getWordDescriptor() method) and the following code:

// Same lookup against phpmorphy 0.3.7, which exposes getWordDescriptor().
$morphy = new phpMorphy('dicts/ru_ru_aot_withjo', 'ru_RU');
$predictMorphier = $morphy->getPredictByDatabaseMorphier();
$predictMorphier->getWordDescriptor('АБУШКАН');

This produces the following annotations:

+
Array ( [count] => 1 [offset] => 169664 [cplen] => 0 [plen] => 0 [flen] => 2 [common_ancode] => 667 [forms_count] => 75 [packed_forms_count] => 46 [affixes_size] => 486 [form_no] => 34 [pos_id] => 21 [freq] => 12 [base_prefix] => [base_suffix] => ТЬ ) 
Array ( [count] => 2 [offset] => 1383424 [cplen] => 0 [plen] => 0 [flen] => 0 [common_ancode] => 663 [forms_count] => 35 [packed_forms_count] => 21 [affixes_size] => 172 [form_no] => 13 [pos_id] => 1 [freq] => 6 [base_prefix] => [base_suffix] => ЫЙ ) 
Array ( [count] => 2 [offset] => 8832 [cplen] => 0 [plen] => 0 [flen] => 0 [common_ancode] => 691 [forms_count] => 36 [packed_forms_count] => 29 [affixes_size] => 338 [form_no] => 0 [pos_id] => 0 [freq] => 6 [base_prefix] => [base_suffix] => ) 
-