jjlee / mechanize

Stateful programmatic web browsing in Python, after Andy Lester's Perl module WWW::Mechanize .
http://wwwsearch.sourceforge.net/mechanize/
618 stars 121 forks source link

br.forms(), with some webpages, can cause ValueError: invalid literal for int() with base 10: '176C' #71

Open jacopy opened 12 years ago

jacopy commented 12 years ago

In [10]: import re
In [11]: import mechanize

In [12]: br = mechanize.Browser()
In [13]: br.open("http://www.example.com/")
Out[13]: <response_seek_wrapper at 0x3717bc0 whose wrapped object = <closeable_response at 0x371d328 whose fp = <socket._fileobject object at 0x03615E70>>>
In [14]: br.forms()

ValueError                                Traceback (most recent call last)
<ipython-input-20-3214840519ff> in <module>()
----> 1 for form in br.forms():
      2     pass
c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_mechanize.pyc in forms(self)
    418         if not self.viewing_html():
    419             raise BrowserStateError("not viewing HTML")
--> 420         return self._factory.forms()
    421
    422     def global_form(self):

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_html.pyc in forms(self)
    555             try:
    556                 self._forms_genf = CachingGeneratorFunction(
--> 557                     self._forms_factory.forms())
    558             except:  # XXXX define exception!
    559                 self.set_response(self._response)

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_html.pyc in forms(self)
    235             _urljoin=_rfc3986.urljoin,
    236             _urlparse=_rfc3986.urlsplit,
--> 237             _urlunparse=_rfc3986.urlunsplit,
    238             )
    239         self.global_form = forms[0]

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_form.pyc in ParseResponseEx(response, select_default, form_parser_class,
request_class, entitydefs, encoding, _urljoin, _urlparse, _urlunparse)
    842                         _urljoin=_urljoin,
    843                         _urlparse=_urlparse,
--> 844                         _urlunparse=_urlunparse,
    845                         )
    846

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_form.pyc in _ParseFileEx(file, base_uri, select_default, ignore_errors, form_parser_class, request_class, entitydefs, backwards_compat, encoding, _urljoin, _urlparse, _urlunparse)
    979         data = file.read(CHUNK)
    980         try:
--> 981             fp.feed(data)
    982         except ParseError, e:
    983             e.base_uri = base_uri

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_form.pyc in feed(self, data)
    756     def feed(self, data):
    757         try:
--> 758             _sgmllib_copy.SGMLParser.feed(self, data)
    759         except _sgmllib_copy.SGMLParseError, exc:
    760             raise ParseError(exc)

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_sgmllib_copy.pyc in feed(self, data)
    108
    109         self.rawdata = self.rawdata + data
--> 110         self.goahead(0)
    111
    112     def close(self):

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_sgmllib_copy.pyc in goahead(self, end)
    190                 if match:
    191                     name = match.group(1)
--> 192                     self.handle_charref(name)
    193                     i = match.end(0)
    194                     if rawdata[i-1] != ';': i = i-1

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_form.pyc in handle_charref(self, name)
    652     def handle_charref(self, name):
    653         #debug("%s", name)
--> 654         self.handle_data(unescape_charref(name, self._encoding))
    655
    656     def unescape_attr(self, name):

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_form.pyc in unescape_charref(data, encoding)
    147     if name.startswith("x"):
    148         name, base= name[1:], 16
--> 149     uc = unichr(int(name, base))
    150     if encoding is None:
    151         return uc

ValueError: invalid literal for int() with base 10: '176C'

In [21]: br = mechanize.Browser()

In [22]: br.open(url)
Out[22]: <response_seek_wrapper at 0x371acb0 whose wrapped object = <closeable_response at 0x371ac60 whose fp = <socket._fileobject object at 0x03615F70>>>

In [23]: for form in br.forms():
   ....:     pass
   ....:
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-23-3214840519ff> in <module>()
----> 1 for form in br.forms():
      2     pass

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_mechanize.pyc in forms(self)
    418         if not self.viewing_html():
    419             raise BrowserStateError("not viewing HTML")
--> 420         return self._factory.forms()
    421
    422     def global_form(self):

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_html.pyc in forms(self)
    555             try:
    556                 self._forms_genf = CachingGeneratorFunction(
--> 557                     self._forms_factory.forms())
    558             except:  # XXXX define exception!
    559                 self.set_response(self._response)

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_html.pyc in forms(self)
    235             _urljoin=_rfc3986.urljoin,
    236             _urlparse=_rfc3986.urlsplit,
--> 237             _urlunparse=_rfc3986.urlunsplit,
    238             )
    239         self.global_form = forms[0]

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_form.pyc in ParseResponseEx(response, select_default, form_parser_class,
request_class, entitydefs, encoding, _urljoin, _urlparse, _urlunparse)
    842                         _urljoin=_urljoin,
    843                         _urlparse=_urlparse,
--> 844                         _urlunparse=_urlunparse,
    845                         )
    846

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_form.pyc in _ParseFileEx(file, base_uri, select_default, ignore_errors, form_parser_class, request_class, entitydefs, backwards_compat, encoding, _urljoin, _urlparse, _urlunparse)
    979         data = file.read(CHUNK)
    980         try:
--> 981             fp.feed(data)
    982         except ParseError, e:
    983             e.base_uri = base_uri

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_form.pyc in feed(self, data)
    756     def feed(self, data):
    757         try:
--> 758             _sgmllib_copy.SGMLParser.feed(self, data)
    759         except _sgmllib_copy.SGMLParseError, exc:
    760             raise ParseError(exc)

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_sgmllib_copy.pyc in feed(self, data)
    108
    109         self.rawdata = self.rawdata + data
--> 110         self.goahead(0)
    111
    112     def close(self):

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_sgmllib_copy.pyc in goahead(self, end)
    190                 if match:
    191                     name = match.group(1)
--> 192                     self.handle_charref(name)
    193                     i = match.end(0)
    194                     if rawdata[i-1] != ';': i = i-1

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_form.pyc in handle_charref(self, name)
    652     def handle_charref(self, name):
    653         #debug("%s", name)
--> 654         self.handle_data(unescape_charref(name, self._encoding))
    655
    656     def unescape_attr(self, name):

c:\python27\lib\site-packages\mechanize-0.2.5-py2.7.egg\mechanize\_form.pyc in unescape_charref(data, encoding)
    147     if name.startswith("x"):
    148         name, base= name[1:], 16
--> 149     uc = unichr(int(name, base))
    150     if encoding is None:
    151         return uc

ValueError: invalid literal for int() with base 10: '176C'
mova commented 11 years ago

Hi, I have the same problem, and it seems to be very common these days. I don't know whether it will ever be fixed, but until then mechanize will stay useless :(