Closed p5pRT closed 13 years ago
This is a bug report for perl from benkasminbullock@gmail.com, generated with the help of perlbug 1.36 running under perl 5.10.0.
The following script, run on Cygwin, prints out this error message:
Malformed UTF-8 character (fatal) at ./wwwjdicbug.pl line 75.
However, the UTF-8 character which is claimed to be malformed comes from an Encode::decode ('utf8',...) statement and is then part of a regular expression match ($3), so this seems to be a bug in Perl.
######### wwwjdicbug.pl
#! perl use warnings; use strict; use URI::Escape 'uri_escape_utf8'; use Encode qw/encode decode/;
package WWWJDIC; use LWP::UserAgent; use HTML::TreeBuilder; use Encode qw/encode decode/; use URI::Escape; use utf8;
# Package-level lookup tables.
#   %mirrors      - maps a lower-case mirror name to the URL of that mirror's
#                   wwwjdic.cgi endpoint. Only `japan` is defined here.
#   %dictionaries - declared empty; not referenced elsewhere in the visible script.
#   %codes        - declared empty; not referenced elsewhere in the visible script.
# NOTE(review): the `\,` before the closing paren looks like an escaping
# artifact of the issue-tracker export (presumably a plain trailing comma in
# the original) - confirm before running.
my %mirrors = ( japan => 'http://www.aa.tufs.ac.jp/~jwb/cgi-bin/wwwjdic.cgi'\, ); my %dictionaries = (); my %codes = ();
# Construct a WWWJDIC client object.
#
# Arguments: a flat key/value option list read straight from @_. No invocant
# is shifted off, so this must be called as a plain function
# (`WWWJDIC::new(mirror => ...)`); a method-style call would put the class
# name into the option list.
#   mirror - optional mirror name; it is lower-cased and looked up in %mirrors.
#
# Behaviour:
#   * known mirror   -> {site} is set to that mirror's URL.
#   * unknown mirror -> a warning is printed to STDERR and {site} is left
#                       unset. NOTE(review): the message says the Australian
#                       site is used, but nothing is assigned in this branch.
#   * no mirror      -> {site} is set to $mirrors{australia}.
#                       NOTE(review): the visible %mirrors only has a `japan`
#                       key, so this would be undef - confirm.
#   * {user_agent} is a new LWP::UserAgent whose agent string is the package name.
#
# Returns: the hash ref, blessed (one-argument bless) into the current package.
#
# NOTE(review): `\,` after __PACKAGE__ looks like an export escaping artifact,
# presumably a plain comma in the original script.
sub new { my %options = @_; my $wwwjdic = {}; if ($options{mirror}) { my $mirror = lc $options{mirror}; if ($mirrors{$mirror}) { $wwwjdic->{site} = $mirrors{$mirror}; } else { print STDERR __PACKAGE__\,": unknown mirror '$options{mirror}': using Australian site\n"; } } else { $wwwjdic->{site} = $mirrors{australia}; } $wwwjdic->{user_agent} = LWP::UserAgent->new; $wwwjdic->{user_agent}->agent(__PACKAGE__); bless $wwwjdic; return $wwwjdic; }
# Parse a page of results from WWWJDIC
#
# Arguments:
#   $wwwjdic  - the client object (not used inside this sub).
#   $contents - the fetched page; it is decoded with Encode::decode('utf8', ...)
#               before anything else is done with it.
#
# Steps: print the decoded page to STDOUT, then parse it with
# HTML::TreeBuilder.
#
# NOTE(review): every `\,` in this sub looks like an escaping artifact of the
# issue-tracker export, presumably a plain comma in the original script.
sub parse_results { my ($wwwjdic\, $contents) = @_; $contents = decode ('utf8'\, $contents); print $contents; my $tree = HTML::TreeBuilder->new(); $tree->parse ($contents);
# Collect all <label> and <input> elements. Inputs named 'jukugosel' that have
# an id are indexed by that id in %fors; @valid keeps only the labels whose
# `for` attribute names one of those inputs.
#
# For each kept label a per-iteration %results hash is filled with:
#   wwwjdic_id - the label's id attribute
#   text       - the label's text (also printed to STDOUT)
#   kanji / reading / meaning - captures $1 / $2 / $3 of the first regex;
#                on no match "Unreadable line ..." is printed instead.
#   dictionary - a trailing code matching [A-Z]{2}[12]? stripped off the end
#                of `meaning` by the second regex.
# %results is lexical to the loop body and is never stored or returned, and
# the sub has no explicit return.
#
# NOTE(review): `$B!Z(B` and `$B![(B` in the first regex look like ISO-2022-JP
# escape residue, presumably the brackets 【 and 】 in the original - confirm.
# NOTE(review): the script was flattened onto one line here, so the inline
# `# Get the dictionary ...` comment now swallows the rest of the line,
# including the second regex and the closing braces. Restore the original
# line breaks before running.
my @labels = $tree->look_down ('_tag'\, 'label'); my @inputs = $tree->look_down ('_tag'\, 'input'); my %fors; my @valid; for my $input (@inputs) { if ($input->attr('name') && $input->attr('name') eq 'jukugosel' && $input->attr('id')) { $fors{$input->attr('id')} = $input; } } @valid = grep {$fors{$_->attr('for')}} @labels; for my $line (@valid) { my %results; $results{wwwjdic_id} = $line->attr('id'); my $text = $line->as_text; print $text\,"\n"; $results{text} = $text; if ($text =~ /^(.*?)\s*$B!Z(B\s*(.*?)\s*$B![(B\s*(.*?)\s*$/) { $results{kanji} = $1; $results{reading} = $2; $results{meaning} = $3; } else { print "Unreadable line '$text'\n"; } # Get the dictionary from the end of the string. if ($results{meaning} && $results{meaning} =~ /(.*?)\s*([A-Z]{2}[12]?)\s*$/s) { $results{meaning} = $1; $results{dictionary} = $2; } } }
# Build the query URL for a search.
#
# Arguments:
#   $wwwjdic     - the client object; its {site} value is the URL prefix.
#   $search_key  - the term to look up; escaped with
#                  URI::Escape::uri_escape_utf8 before being appended.
#   $search_type - array ref of options. Any all-digit element sets the
#                  maximum-results value; if several are present the last
#                  one wins, because the loop overwrites $type{max}.
#
# Returns: {site} . "?MMUJ" . <escaped key> . "_3", followed by "_<max>" when
# a true max value was found.
#
# NOTE(review): `\,` in the argument list looks like an export escaping
# artifact, presumably plain commas in the original script.
sub lookup_url { my ($wwwjdic\, $search_key\, $search_type) = @_; my %type; for (@$search_type) { $type{max} = $_ if /^\d+$/; } my $url = $wwwjdic->{site}; $url .= "?MMUJ"; my $search_key_encoded = URI::Escape::uri_escape_utf8 ($search_key); $url .= $search_key_encoded; $url .= "_3"; $url .= '_' . $type{max} if $type{max}; return $url; }
# Look a term up on the configured mirror and parse the result page.
#
# Arguments: $wwwjdic (client object), $search_key and $search_type, which
# are passed unchanged to lookup_url.
#
# Builds the URL via lookup_url and returns early (bare `return`) if it is
# false. Otherwise the URL is fetched with the object's {user_agent}; on a
# successful response the body from $response->content is handed to
# parse_results and that call's result is returned. There is no explicit
# return and no error report when the request fails.
#
# NOTE(review): `\,` in the argument lists looks like an export escaping
# artifact, presumably plain commas in the original script.
sub lookup { my ($wwwjdic\, $search_key\, $search_type) = @_; my $search_string = $wwwjdic->lookup_url ($search_key\, $search_type); return if !$search_string; my $response = $wwwjdic->{user_agent}->get ($search_string); if ($response->is_success) { return $wwwjdic->parse_results ($response->content); } }
# Kanji lookup entry point.
#
# Takes the same arguments as lookup_url and only builds the query URL with
# it; nothing is fetched or parsed and there is no explicit return.
# NOTE(review): this looks like an unfinished stub - confirm.
# NOTE(review): `\,` in the argument lists looks like an export escaping
# artifact, presumably plain commas in the original script.
sub lookup_kanji { my ($wwwjdic\, $search_key\, $search_type) = @_; my $search_string = $wwwjdic->lookup_url ($search_key\, $search_type);
}
1;
package main;
# Driver for the reproduction: create a client for the `japan` mirror, switch
# STDOUT to cp932 encoding, trim leading/trailing whitespace from the search
# term, announce it, and run the lookup with [20] as the search-type list
# (lookup_url treats an all-digit element as the maximum result count).
#
# NOTE(review): the literal '$BAk8}(B' looks like ISO-2022-JP escape residue.
# The result links in the captured output contain %C1%EB%B8%FD, which is
# EUC-JP for 窓口, so that is presumably the original search term - confirm.
# NOTE(review): `\,` looks like an export escaping artifact, presumably plain
# commas in the original script.
my $wwwjdic = WWWJDIC::new(mirror => 'japan'); binmode STDOUT\, ":encoding(cp932)"; my $arg = '$BAk8}(B'; $arg =~ s/^\s+|\s+$//g; print "Looking up $arg in WWWJDIC:\n"; $wwwjdic->lookup ($arg\,[20]);
#### Output of ./wwwjdicbug.pl > bug.txt 2>&1
Looking up $BAk8}(B in WWWJDIC: \<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> \ \
\<META http-equiv="Content-Type" content="text/html; charset=UTF-8">\\<img
src="http://www.aa.tufs.ac.jp/~jwb/jim_th.jpg" align="left">\<span
style="font-size: 9pt; font-family: Helvetica\, sans-serif; color:
#FFFFFF">Jim Breen's \\ \\WWWJDIC\\\ |
\\<a href="http://www.aa.tufs.ac.jp/~jwb/cgi-bin/wwwjdic.cgi?1C_3_20">Word Search/Home\ \ | \\<a href="http://www.aa.tufs.ac.jp/~jwb/cgi-bin/wwwjdic.cgi?9T_3_20">Translate Words\ \ | \\<a href="http://www.aa.tufs.ac.jp/~jwb/cgi-bin/wwwjdic.cgi?1B_3_20">Kanji Lookup\ \ | \\<a href="http://www.aa.tufs.ac.jp/~jwb/cgi-bin/wwwjdic.cgi?1R_3_20">Multi-Radical Kanji\ \ | \\<a href="http://www.aa.tufs.ac.jp/~jwb/wwwjdicinf.html">User Guide\ \ | \\<a href="http://www.aa.tufs.ac.jp/~jwb/wwwjdicinf.html#dicfil_tag">Dictionaries\ \ | \
\<a href="http://www.aa.tufs.ac.jp/~jwb/cgi-bin/wwwjdic.cgi?10">Example Search\ \ | \\<a href="http://www.aa.tufs.ac.jp/~jwb/cgi-bin/wwwjdic.cgi?17_3_20">New Entry/Amendment\ \ | \<a href="http://www.aa.tufs.ac.jp/~jwb/cgi-bin/wwwjdic.cgi?14">New Examples\ \ | \\<a href="http://www.aa.tufs.ac.jp/~jwb/cgi-bin/wwwjdic.cgi?19B">Customize\ \ | \\<a href="http://www.aa.tufs.ac.jp/~jwb/wwwjdicinf.html#code_tag">Dictionary Codes\ \ | \\<a href="http://www.aa.tufs.ac.jp/~jwb/wwwjdicinf.html#don_tag">Donations\ \ | \
\<INPUT TYPE="radio" NAME="jukugosel" VALUE="5562616" CHECKED
ID="5562616">\\<a
href="http://www.google.com/search?q=%22%C1%EB%B8%FD%22&hl=en&lr=lang_ja&ie=euc-jp">[G]\\<a
href="http://images.google.com/images?q=%22%C1%EB%B8%FD%22&hl=en&ie=euc-jp">[GI]\\<a
href="http://dictionary.goo.ne.jp/search.php?MT=%C1%EB%B8%FD&kind=je&mode=1">[S]\\<a
href="http://eow.alc.co.jp/%C1%EB%B8%FD/EUC-JP/">[A]\
\
\<INPUT TYPE="radio" NAME="jukugosel" VALUE="5562620"
ID="5562620">\\[G]\\<a
href="http://images.google.com/images?q=%22%C1%EB%B8%FD%A4%CE%22&hl=en&ie=euc-jp">[GI]\\<a
href="http://dictionary.goo.ne.jp/search.php?MT=%C1%EB%B8%FD%A4%CE&kind=je&mode=1">[S]\\<a
href="http://eow.alc.co.jp/%C1%EB%B8%FD%A4%CE/EUC-JP/">[A]\
\
\<INPUT TYPE="radio" NAME="jukugosel" VALUE="5562621"
ID="5562621">\\[G]\\<a
href="http://images.google.com/images?q=%22%C1%EB%B8%FD%A4%CE%B7%B8%B0%F7%22&hl=en&ie=euc-jp">[GI]\\<a
href="http://dictionary.goo.ne.jp/search.php?MT=%C1%EB%B8%FD%A4%CE%B7%B8%B0%F7&kind=je&mode=1">[S]\\<a
href="http://eow.alc.co.jp/%C1%EB%B8%FD%A4%CE%B7%B8%B0%F7/EUC-JP/">[A]\
\
\<INPUT TYPE="radio" NAME="jukugosel" VALUE="5562622"
ID="5562622">\\[G]\\<a
href="http://images.google.com/images?q=%22%C1%EB%B8%FD%B1%FC%22&hl=en&ie=euc-jp">[GI]\\<a
href="http://dictionary.goo.ne.jp/search.php?MT=%C1%EB%B8%FD%B1%FC&kind=je&mode=1">[S]\\<a
href="http://eow.alc.co.jp/%C1%EB%B8%FD%B1%FC/EUC-JP/">[A]\
\
\<INPUT TYPE="radio" NAME="jukugosel" VALUE="5562623"
ID="5562623">\\[G]\\<a
href="http://images.google.com/images?q=%22%C1%EB%B8%FD%B5%AC%C0%A9%22&hl=en&ie=euc-jp">[GI]\\<a
href="http://dictionary.goo.ne.jp/search.php?MT=%C1%EB%B8%FD%B5%AC%C0%A9&kind=je&mode=1">[S]\\<a
href="http://eow.alc.co.jp/%C1%EB%B8%FD%B5%AC%C0%A9/EUC-JP/">[A]\\<a
href="http://ja.wikipedia.org/wiki/%E7%AA%93%E5%8F%A3%E8%A6%8F%E5%88%B6">[W]\
\
\<INPUT TYPE="radio" NAME="jukugosel" VALUE="5562624"
ID="5562624">\\[G]\\<a
href="http://images.google.com/images?q=%22%C1%EB%B8%FD%B6%C8%CC%B3%22&hl=en&ie=euc-jp">[GI]\\<a
href="http://dictionary.goo.ne.jp/search.php?MT=%C1%EB%B8%FD%B6%C8%CC%B3&kind=je&mode=1">[S]\\<a
href="http://eow.alc.co.jp/%C1%EB%B8%FD%B6%C8%CC%B3/EUC-JP/">[A]\
\
\<INPUT TYPE="radio" NAME="jukugosel" VALUE="5562625"
ID="5562625">\\[G]\\<a
href="http://images.google.com/images?q=%22%C1%EB%B8%FD%BF%A6%B0%F7%22&hl=en&ie=euc-jp">[GI]\\<a
href="http://dictionary.goo.ne.jp/search.php?MT=%C1%EB%B8%FD%BF%A6%B0%F7&kind=je&mode=1">[S]\\<a
href="http://eow.alc.co.jp/%C1%EB%B8%FD%BF%A6%B0%F7/EUC-JP/">[A]\
\
\<INPUT TYPE="radio" NAME="jukugosel" VALUE="5562626"
ID="5562626">\\[G]\\<a
href="http://images.google.com/images?q=%22%C1%EB%B8%FD%BF%F4%22&hl=en&ie=euc-jp">[GI]\\<a
href="http://dictionary.goo.ne.jp/search.php?MT=%C1%EB%B8%FD%BF%F4&kind=je&mode=1">[S]\\<a
href="http://eow.alc.co.jp/%C1%EB%B8%FD%BF%F4/EUC-JP/">[A]\
\
\<INPUT TYPE="radio" NAME="jukugosel" VALUE="5562627"
ID="5562627">\\[G]\\<a
href="http://images.google.com/images?q=%22%C1%EB%B8%FD%C1%B0%22&hl=en&ie=euc-jp">[GI]\\<a
href="http://dictionary.goo.ne.jp/search.php?MT=%C1%EB%B8%FD%C1%B0&kind=je&mode=1">[S]\\<a
href="http://eow.alc.co.jp/%C1%EB%B8%FD%C1%B0/EUC-JP/">[A]\
\
\<INPUT TYPE="radio" NAME="jukugosel" VALUE="5562628"
ID="5562628">\\[G]\\<a
href="http://images.google.com/images?q=%22%C1%EB%B8%FD%C6%E2%22&hl=en&ie=euc-jp">[GI]\\<a
href="http://dictionary.goo.ne.jp/search.php?MT=%C1%EB%B8%FD%C6%E2&kind=je&mode=1">[S]\\<a
href="http://eow.alc.co.jp/%C1%EB%B8%FD%C6%E2/EUC-JP/">[A]\
\
\<INPUT TYPE="radio" NAME="jukugosel" VALUE="5562629"
ID="5562629">\\<a
href="http://www.google.com/search?q=%22%C1%EB%B8%FD%C8%CE%C7%E4%22&hl=en&lr=lang_ja&ie=euc-jp">[G]\\<a
href="http://images.google.com/images?q=%22%C1%EB%B8%FD%C8%CE%C7%E4%22&hl=en&ie=euc-jp">[GI]\\<a
href="http://dictionary.goo.ne.jp/search.php?MT=%C1%EB%B8%FD%C8%CE%C7%E4&kind=je&mode=1">[S]\\<a
href="http://eow.alc.co.jp/%C1%EB%B8%FD%C8%CE%C7%E4/EUC-JP/">[A]\
\
\<INPUT TYPE="radio" NAME="jukugosel" VALUE="5562630"
ID="5562630">\\[G]\\<a
href="http://images.google.com/images?q=%22%C1%EB%B8%FD%CC%F2%22&hl=en&ie=euc-jp">[GI]\\<a
href="http://dictionary.goo.ne.jp/search.php?MT=%C1%EB%B8%FD%CC%F2&kind=je&mode=1">[S]\\<a
href="http://eow.alc.co.jp/%C1%EB%B8%FD%CC%F2/EUC-JP/">[A]\
\
\
\
Migrated from rt.perl.org#75680 (status was 'resolved')
Searchable as RT75680$