Closed p5pRT closed 14 years ago
This is a bug report for perl from john.imrie@vodafoneemail.co.uk\, generated with the help of perlbug 1.35 running under perl v5.8.8.
----------------------------------------------------------------- Currently the Unicode character class \p{ccc} either dies when you use a numeric code or does not correctly match when you use a letter code.
The diffs that follow
a) Allow numeric codes to correctly work so that \p{ccc=0} and \p{ccc=000} both work as expected.
and
b) Allow both numeric and alphabetic codes to match as expected
$PVA_reverse{$pa}{lc $val}; + $val+=0 if $val=~/^\d+$/; + my $f = defined $PropValueAlias{$pa}{$val} ? $val : $PVA_reverse{$pa}{lc $val};
- if ($pa and $f) { + if ($pa and defined $f) { + $f+=0 if $f=~/^\d+$/; $pa = "gc_sc" if $pa eq "gc" or $pa eq "sc"; $file = "unicore/lib/$pa/$PVA_abbr_map{$pa}{lc $f}.pl"; last GETFILE;
--- /usr/lib/perl5/5.8.8/unicore/mktables 2009-02-19 17:50:16.000000000 +0000 +++ mktables 2009-02-27 17:09:23.000000000 +0000 @@ -288\,6 +288\,11 @@ if ($prop eq 'ccc') { $PropValueAlias{$prop}{$data[1]} = [ @data[0\,2] ]; $PVA_reverse{$prop}{$data[2]} = [ @data[0\,1] ]; + # Fixup for numeric CCC + $utf8::PropValueAlias{$prop}{lc $data[0]} = $data[1]; + $utf8::PropValueAlias{$prop}{lc $data[0]} = $data[0]; + $utf8::PVA_abbr_map{$prop}{lc $data[1]} = $data[0]; + $utf8::PVA_abbr_map{$prop}{lc $data[0]} = $data[0]; } else { next if $data[0] eq "n/a"; @@ -302\,6 +307\,7 @@ $utf8::PropValueAlias{$prop}{lc $data[0]} = $data[1]; $utf8::PVA_reverse{$prop}{lc $data[1]} = $data[0];
+ next if $prop = 'ccc'; my $abbr_class = ($prop eq 'gc' or $prop eq 'sc') ? 'gc_sc' : $prop; $utf8::PVA_abbr_map{$abbr_class}{lc $data[0]} = $data[0]; } @@ -775\,7 +781\,6 @@ { my $Bidi = Table->New(); my $Deco = Table->New(); - my $Comb = Table->New(); my $Number = Table->New(); my $Mirrored = Table->New();#Is => 'Mirrored'\, #Desc => "Mirrored in bidirectional text"\, @@ -784\,6 +789\,7 @@ my %DC; my %Bidi; my %Number; + my %Comb; $DC{can} = Table->New(); $DC{com} = Table->New();
@@ -983\,7 +989\,12 @@ $To{Digit}->Append($code\, $decimal) if length $decimal;
$Bidi->Append($code\, $bidi); - $Comb->Append($code\, $comb) if $comb; + # Fixup for CCC + if (defined $comb) { # $comb can be 0 + $Comb{$comb} ||= Table->New(); + $Comb{$comb}->Append($code) + } + $Number->Append($code\, $number) if length $number;
length($decimal) and ($Number{De} ||= Table->New())->Append($code) @@ -1125\,13 +1136\,11 @@ ); }
- $Comb->Write("CombiningClass.pl"); - for (keys %{ $PropValueAlias{ccc} }) { - my ($code\, $name) = @{ $PropValueAlias{ccc}{$_} }; - (my $c = Table->New())->Append($code); - $c->Write( + # $Comb->Write("CombiningClass.pl"); + for (keys %Comb) { + $Comb{$_}->Write( ["lib"\,"ccc"\,"$_.pl"]\, - "CombiningClass category '$name'" + "CombiningClass category '$_'" ); }
John Imrie
Patch for mktables is incorrect.
Corrected patch follows
--- /usr/lib/perl5/5.8.8/unicore/mktables 2009-02-19 17:50:16.000000000 +0000 +++ mktables 2009-02-27 17:09:23.000000000 +0000 @@ -288\,6 +288\,11 @@ if ($prop eq 'ccc') { $PropValueAlias{$prop}{$data[1]} = [ @data[0\,2] ]; $PVA_reverse{$prop}{$data[2]} = [ @data[0\,1] ]; + # Fixup for numeric CCC + $utf8::PropValueAlias{$prop}{lc $data[0]} = $data[1]; + $utf8::PropValueAlias{$prop}{lc $data[0]} = $data[0]; + $utf8::PVA_abbr_map{$prop}{lc $data[1]} = $data[0]; + $utf8::PVA_abbr_map{$prop}{lc $data[0]} = $data[0]; } else { next if $data[0] eq "n/a"; @@ -302\,6 +307\,7 @@ $utf8::PropValueAlias{$prop}{lc $data[0]} = $data[1]; $utf8::PVA_reverse{$prop}{lc $data[1]} = $data[0];
+ next if $prop eq 'ccc'; my $abbr_class = ($prop eq 'gc' or $prop eq 'sc') ? 'gc_sc' : $prop; $utf8::PVA_abbr_map{$abbr_class}{lc $data[0]} = $data[0]; } @@ -775\,7 +781\,6 @@ { my $Bidi = Table->New(); my $Deco = Table->New(); - my $Comb = Table->New(); my $Number = Table->New(); my $Mirrored = Table->New();#Is => 'Mirrored'\, #Desc => "Mirrored in bidirectional text"\, @@ -784\,6 +789\,7 @@ my %DC; my %Bidi; my %Number; + my %Comb; $DC{can} = Table->New(); $DC{com} = Table->New();
@@ -983\,7 +989\,12 @@ $To{Digit}->Append($code\, $decimal) if length $decimal;
$Bidi->Append($code\, $bidi); - $Comb->Append($code\, $comb) if $comb; + # Fixup for CCC + if (defined $comb) { # $comb can be 0 + $Comb{$comb} ||= Table->New(); + $Comb{$comb}->Append($code) + } + $Number->Append($code\, $number) if length $number;
length($decimal) and ($Number{De} ||= Table->New())->Append($code) @@ -1125\,13 +1136\,11 @@ ); }
- $Comb->Write("CombiningClass.pl"); - for (keys %{ $PropValueAlias{ccc} }) { - my ($code\, $name) = @{ $PropValueAlias{ccc}{$_} }; - (my $c = Table->New())->Append($code); - $c->Write( + # $Comb->Write("CombiningClass.pl"); + for (keys %Comb) { + $Comb{$_}->Write( ["lib"\,"ccc"\,"$_.pl"]\, - "CombiningClass category '$name'" + "CombiningClass category '$_'" ); }
______________________________________________
This email has been scanned by Netintelligence
http://www.netintelligence.com/email
John wrote:
Patch for mktables is incorrect.
Corrected patch follows
--- /usr/lib/perl5/5.8.8/unicore/mktables 2009-02-19 17:50:16.000000000 +0000 +++ mktables 2009-02-27 17:09:23.000000000 +0000 @@ -288\,6 +288\,11 @@ if ($prop eq 'ccc') { $PropValueAlias{$prop}{$data[1]} = [ @data[0\,2] ]; $PVA_reverse{$prop}{$data[2]} = [ @data[0\,1] ]; + # Fixup for numeric CCC + $utf8::PropValueAlias{$prop}{lc $data[0]} = $data[1]; + $utf8::PropValueAlias{$prop}{lc $data[0]} = $data[0]; + $utf8::PVA_abbr_map{$prop}{lc $data[1]} = $data[0]; + $utf8::PVA_abbr_map{$prop}{lc $data[0]} = $data[0]; } else { next if $data[0] eq "n/a"; @@ -302\,6 +307\,7 @@ $utf8::PropValueAlias{$prop}{lc $data[0]} = $data[1]; $utf8::PVA_reverse{$prop}{lc $data[1]} = $data[0];
+ next if $prop eq 'ccc'; my $abbr_class = ($prop eq 'gc' or $prop eq 'sc') ? 'gc_sc' : $prop; $utf8::PVA_abbr_map{$abbr_class}{lc $data[0]} = $data[0]; } @@ -775\,7 +781\,6 @@ { my $Bidi = Table->New(); my $Deco = Table->New(); - my $Comb = Table->New(); my $Number = Table->New(); my $Mirrored = Table->New();#Is => 'Mirrored'\, #Desc => "Mirrored in bidirectional text"\, @@ -784\,6 +789\,7 @@ my %DC; my %Bidi; my %Number; + my %Comb; $DC{can} = Table->New(); $DC{com} = Table->New();
@@ -983\,7 +989\,12 @@ $To{Digit}->Append($code\, $decimal) if length $decimal;
$Bidi->Append($code\, $bidi); - $Comb->Append($code\, $comb) if $comb; + # Fixup for CCC + if (defined $comb) { # $comb can be 0 + $Comb{$comb} ||= Table->New(); + $Comb{$comb}->Append($code) + } + $Number->Append($code\, $number) if length $number;
length($decimal) and ($Number{De} ||= Table->New())->Append($code) @@ -1125\,13 +1136\,11 @@ ); }
- $Comb->Write("CombiningClass.pl"); - for (keys %{ $PropValueAlias{ccc} }) { - my ($code\, $name) = @{ $PropValueAlias{ccc}{$_} }; - (my $c = Table->New())->Append($code); - $c->Write( + # $Comb->Write("CombiningClass.pl"); + for (keys %Comb) { + $Comb{$_}->Write( ["lib"\,"ccc"\,"$_.pl"]\, - "CombiningClass category '$name'" + "CombiningClass category '$_'" ); }
______________________________________________ This email has been scanned by Netintelligence http://www.netintelligence.com/email
FYI\,
There are a number of problems in mktables besides the ccc ones. I've been working on revamping mktables to correct all these\, and expect to finish it in a week.
Note that some ccc values have no names\, but should be referrable in re's\, hence the file names should be something like 0.pl\, 240.pl
The RT System itself - Status changed from 'new' to 'open'
John wrote:
Patch for mktables is incorrect.
Corrected patch follows
--- /usr/lib/perl5/5.8.8/unicore/mktables 2009-02-19 17:50:16.000000000 +0000 +++ mktables 2009-02-27 17:09:23.000000000 +0000 @@ -288\,6 +288\,11 @@ if ($prop eq 'ccc') { $PropValueAlias{$prop}{$data[1]} = [ @data[0\,2] ]; $PVA_reverse{$prop}{$data[2]} = [ @data[0\,1] ]; + # Fixup for numeric CCC + $utf8::PropValueAlias{$prop}{lc $data[0]} = $data[1]; + $utf8::PropValueAlias{$prop}{lc $data[0]} = $data[0]; + $utf8::PVA_abbr_map{$prop}{lc $data[1]} = $data[0]; + $utf8::PVA_abbr_map{$prop}{lc $data[0]} = $data[0]; } else { next if $data[0] eq "n/a"; @@ -302\,6 +307\,7 @@ $utf8::PropValueAlias{$prop}{lc $data[0]} = $data[1]; $utf8::PVA_reverse{$prop}{lc $data[1]} = $data[0];
+ next if $prop eq 'ccc'; my $abbr_class = ($prop eq 'gc' or $prop eq 'sc') ? 'gc_sc' : $prop; $utf8::PVA_abbr_map{$abbr_class}{lc $data[0]} = $data[0]; } @@ -775\,7 +781\,6 @@ { my $Bidi = Table->New(); my $Deco = Table->New(); - my $Comb = Table->New(); my $Number = Table->New(); my $Mirrored = Table->New();#Is => 'Mirrored'\, #Desc => "Mirrored in bidirectional text"\, @@ -784\,6 +789\,7 @@ my %DC; my %Bidi; my %Number; + my %Comb; $DC{can} = Table->New(); $DC{com} = Table->New();
@@ -983\,7 +989\,12 @@ $To{Digit}->Append($code\, $decimal) if length $decimal;
$Bidi->Append($code\, $bidi); - $Comb->Append($code\, $comb) if $comb; + # Fixup for CCC + if (defined $comb) { # $comb can be 0 + $Comb{$comb} ||= Table->New(); + $Comb{$comb}->Append($code) + } + $Number->Append($code\, $number) if length $number;
length($decimal) and ($Number{De} ||= Table->New())->Append($code) @@ -1125\,13 +1136\,11 @@ ); }
- $Comb->Write("CombiningClass.pl"); - for (keys %{ $PropValueAlias{ccc} }) { - my ($code\, $name) = @{ $PropValueAlias{ccc}{$_} }; - (my $c = Table->New())->Append($code); - $c->Write( + # $Comb->Write("CombiningClass.pl"); + for (keys %Comb) { + $Comb{$_}->Write( ["lib"\,"ccc"\,"$_.pl"]\, - "CombiningClass category '$name'" + "CombiningClass category '$_'" ); }
______________________________________________ This email has been scanned by Netintelligence http://www.netintelligence.com/email
FYI\,
There are a number of problems in mktables besides the ccc ones. I've been working on revamping mktables to correct all these\, and expect to finish it in a week.
Note that some ccc values have no names\, but should be referrable in re's\, hence the file names should be something like 0.pl\, 240.pl
FYI\,
There are a number of problems in mktables besides the ccc ones. I've been working on revamping mktables to correct all these\, and expect to finish it in a week.
Note that some ccc values have no names\, but should be referrable in re's\, hence the file names should be something like 0.pl\, 240.pl
______________________________________________ This email has been scanned by Netintelligence
http://www.netintelligence.com/email
Karl\,
Are these changes going to have an impact on Perl 5.10.0?
The reason I ask is that I am working on getting the Common Locale Data Repository (CLDR) http://unicode.org/cldr/ into Perl and the CLDR requires some properties listed in the auxiliary directory of the Unicode 5.0 spec. So will your improvements include that and the extracted directory?
John
PS if you are interested in the CLDR my code is currently publicly
available at http://github.com/ThePilgrim/perlcldr/tree/master
______________________________________________
This email has been scanned by Netintelligence
http://www.netintelligence.com/email
John wrote:
FYI\,
There are a number of problems in mktables besides the ccc ones. I've been working on revamping mktables to correct all these\, and expect to finish it in a week.
Note that some ccc values have no names\, but should be referrable in re's\, hence the file names should be something like 0.pl\, 240.pl
______________________________________________ This email has been scanned by Netintelligence
http://www.netintelligence.com/email
Karl\,
Are these changes going to have an impact on Perl 5.10.0?
The reason I ask is that I am working on getting the Common Locale Data Repository (CLDR) http://unicode.org/cldr/ into Perl and the CLDR requires some properties listed in the auxiliary directory of the Unicode 5.0 spec. So will your improvements include that and the extracted directory?
John
PS if you are interested in the CLDR my code is currently publicly
available at http://github.com/ThePilgrim/perlcldr/tree/master
______________________________________________ This email has been scanned by Netintelligence http://www.netintelligence.com/email
I'm working to get all the Unicode 5.1 database files (not the test nor documentation ones) processed by mktables\, including those in the auxiliary and extracted subdirectories\, but not including the Unihan\, for which there is a CPAN module.
Since Perl 5.10.0 has already been released\, this wouldn't affect it\, but one could use this to transparently change the tables it uses in any given installation. I hope that this would be considered for inclusion in 5.10.1.
Below are code and comments that I've added to my working version of mktables that describe in detail the properties and files that aren't fully processed (I apologize for the email's folding these):
# The following are properties that are in the files that we process\, but we # don't use them. The reasons are in the comments my @skipped_properties = ( qr/^FC_NFKC$/\, # Unimplemented\, but in Unicode::Normalize qr/^Grapheme_Link$/\, # Deprecated by Unicode qr/^Other_/\, # These are used by Unicode for constructing # other properties\, and should not be exposed );
# Below are the properties that aren't fully accessible through the Perl core. # All the binary (True or False) properties are considered to be fully # accessible through regular expression property matching (\p{XX}) (so don't # appear here). Many of the rest are partially accessible through that # mechanism\, and some fully through library modules. There are several that # are accessible through .pl files that this script creates (but which as of # this writing aren't documented). The comments give the accessibility
my @ignored_properties = ( 'Age'\, # But \p{age:XX} works 'Bidi_Class'\, # But can access through Unicode::UCD 'charinfo'\, # and \p{bc:XX} works 'Bidi_Mirroring_Glyph'\, # Unimplemented 'Block'\, # But can access through # Unicode::UCD 'charblock'\, and \p{IsBLOCK} # works 'Canonical_Combining_Class'\, # But can access through # Unicode::UCD 'charinfo'\, and \p{ccc:XX} works 'Case_Folding'\, # But can access through # Unicode::UCD 'casefold'\, and /RE/i works 'Decomposition_Mapping'\, # But can access through Unicode::UCD 'charinfo' # and furnished in Decomposition.pl 'Decomposition_Type'\, # But can access through Unicode::UCD 'charinfo' # and furnished in Decomposition.pl\, and # \p{dt:XX} works 'East_Asian_Width'\, # But \p{ea:XX} works 'General_Category'\, # But can access through # Unicode::UCD 'charinfo'\, and /\p{IsCATEGORY}/ # works 'Grapheme_Cluster_Break'\, # But \p{gcb:XX} works 'Hangul_Syllable_Type'\, # But \p{hst:XX} works 'ISO_Comment'\, # But can access through Unicode::UCD 'charinfo' 'Joining_Group'\, # But \p{jg:XX} works 'Joining_Type'\, # But \p{jt:XX} works 'Line_Break'\, # But \p{lb:XX} works 'Lowercase_Mapping'\, # But can access through lc() and # Unicode::UCD 'charinfo' 'Name'\, # But can access through Unicode::UCD 'charinfo' and # Name.pl\, and inverse through \N{} 'NFC_Quick_Check'\, # But can access through Unicode::Normalize checkNFC 'NFD_Quick_Check'\, # But can access through Unicode::Normalize checkNFD 'NFKC_Quick_Check'\, # But can access through Unicode::Normalize checkNFKC 'NFKD_Quick_Check'\, # But can access through Unicode::Normalize checkNFKD 'Numeric_Type'\, # But can access through Unicode::UCD 'charinfo'\, and # \p{nt:XX} works 'Numeric_Value'\, # But can access through Unicode::UCD 'charinfo' 'Script'\, # But can access through Unicode::UCD 'charscript'\, and # \p{InSCRIPT} works 'Sentence_Break'\, # But \p{sb:XX} works
# For all the 'Simple_XXX' properties\, Perl uses the non-Simple mapping # internally for things like lc()
'Simple_Case_Folding'\, # But can access through Unicode::UCD 'casefold' 'Simple_Lowercase_Mapping'\, # But can access through Unicode::UCD 'charinfo' 'Simple_Titlecase_Mapping'\, # But can access through Unicode::UCD 'charinfo' 'Simple_Uppercase_Mapping'\, # But can access through Unicode::UCD 'charinfo'
'Titlecase_Mapping'\, # But can access through ucfirst() and # Unicode::UCD 'charinfo' 'Unicode_1_Name'\, # But can access through Unicode::UCD 'charinfo' 'Unicode_Radical_Stroke'\, # Unimplemented\, but is in CPAN: Unicode::Unihan 'Uppercase_Mapping'\, # But can access through uc() and # Unicode::UCD 'charinfo' 'Word_Break'\, # But \p{XXX} works );
# Below are files that Unicode furnishes\, but this program ignores.
my @ignored_files = ( 'ArabicShaping.txt'\, # Unimplemented\, but derived file gives \p access 'BidiMirroring.txt'\, # For glyph rendering. 'EastAsianWidth.txt'\, # Unimplemented\, but derived file gives \p access 'Index.txt'\, # An index for UnicodeData.txt 'LineBreak.txt'\, # Unimplemented\, but derived file gives \p access 'NamedSequences.txt'\, # Unimplemented\, but can be accessed through # Unicode::UCD 'namedseq' 'NamedSqProv.txt'\, # Not officially part of the Unicode standard 'NamesList.txt'\, # Just adds commentary 'NormalizationCorrections.txt'\, # Data is already in other files. 'ReadMe.txt'\, # Just comments 'StandardizedVariants.txt'\, # Only for glyph changes );
Resolved per \4B1C2CFB\.9020002@​khwilliamson\.com from Karl Williamson
Resolved per \4B1C2CFB\.9020002@​khwilliamson\.com from Karl Williamson
@obra - Status changed from 'open' to 'resolved'
Migrated from rt.perl.org#63550 (status was 'resolved')
Searchable as RT63550$