MagicStack / httptools

Fast HTTP parser
MIT License
1.2k stars 83 forks source link

with curl and https #89

Open tianhuangtencent opened 1 year ago

tianhuangtencent commented 1 year ago

I try to combine pycurl and httptools, but I found it can't deal with https, here is my code:

success with http:

import pycurl
import httptools

class HttpResponse:
    def on_message_begin(self):
        print('on_message_begin')
    def on_url(self, url: bytes):
        print(f'on_url: url={url}')
    def on_header(self, name: bytes, value: bytes):
        print(f'on_header: name={name}, value={value.decode("ISO-8859-1")}')
    def on_headers_complete(self):
        print(f'on_header_complete')
    def on_body(self, body: bytes):
        print(f'on_body: {len(body)}')
    def on_message_complete(self):
        print(f'on_message_complete')
    def on_chunk_header(self):
        print(f'on_chunk_header')
    def on_chunk_complete(self):
        print(f'on_chunk_complete')
    def on_status(self, status: bytes):
        print(f'on_status: status={status}')

print(pycurl.version)
print(httptools.__version__)
m = HttpResponse()
p = httptools.HttpResponseParser(m)

c = pycurl.Curl()
c.setopt(pycurl.URL, "http://uvloop.readthedocs.io/")
c.setopt(pycurl.HTTP_TRANSFER_DECODING, 0)
c.setopt(pycurl.WRITEFUNCTION, p.feed_data)
c.setopt(pycurl.HEADERFUNCTION, p.feed_data)
c.perform()

and output:

PycURL/7.45.2 libcurl/7.76.1 OpenSSL/1.1.1u zlib/1.2.11 libssh2/1.9.0 nghttp2/1.43.0
0.5.0
on_message_begin
on_status: status=b'Found'
on_header: name=b'Date', value=Thu, 08 Jun 2023 09:50:21 GMT
on_header: name=b'Content-Type', value=text/html; charset=utf-8
on_header: name=b'Transfer-Encoding', value=chunked
on_header: name=b'Connection', value=keep-alive
on_header: name=b'Location', value=https://uvloop.readthedocs.io/
on_header: name=b'CF-Ray', value=7d403ae94955cec5-SJC
on_header: name=b'CF-Cache-Status', value=EXPIRED
on_header: name=b'Cache-Control', value=max-age=1200
on_header: name=b'Content-Language', value=en
on_header: name=b'Vary', value=Accept-Language, Cookie, Accept-Encoding
on_header: name=b'CDN-Cache-Control', value=public
on_header: name=b'Referrer-Policy', value=no-referrer-when-downgrade
on_header: name=b'X-Backend', value=web-i-0854c4793bcd745a7
on_header: name=b'X-Content-Type-Options', value=nosniff
on_header: name=b'X-RTD-Domain', value=uvloop.readthedocs.io
on_header: name=b'X-RTD-Project', value=
on_header: name=b'X-RTD-Project-Method', value=public_domain
on_header: name=b'X-RTD-Redirect', value=http_to_https
on_header: name=b'X-RTD-Version-Method', value=path
on_header: name=b'X-Served', value=Django-Proxito
on_header: name=b'X-XSS-Protection', value=1; mode=block
on_header: name=b'Server', value=cloudflare
on_header: name=b'alt-svc', value=h3=":443"; ma=86400
on_header_complete
on_chunk_header
on_chunk_complete
on_message_complete

fail with https:

import certifi
import pycurl
import httptools

class HttpResponse:
    def on_message_begin(self):
        print('on_message_begin')
    def on_url(self, url: bytes):
        print(f'on_url: url={url}')
    def on_header(self, name: bytes, value: bytes):
        print(f'on_header: name={name}, value={value.decode("ISO-8859-1")}')
    def on_headers_complete(self):
        print(f'on_header_complete')
    def on_body(self, body: bytes):
        print(f'on_body: {len(body)}')
    def on_message_complete(self):
        print(f'on_message_complete')
    def on_chunk_header(self):
        print(f'on_chunk_header')
    def on_chunk_complete(self):
        print(f'on_chunk_complete')
    def on_status(self, status: bytes):
        print(f'on_status: status={status}')

print(pycurl.version)
print(httptools.__version__)
m = HttpResponse()
p = httptools.HttpResponseParser(m)

c = pycurl.Curl()
c.setopt(pycurl.URL, "https://uvloop.readthedocs.io/")
c.setopt(c.CAINFO, certifi.where())
c.setopt(pycurl.HTTP_TRANSFER_DECODING, 0)
c.setopt(pycurl.WRITEFUNCTION, p.feed_data)
c.setopt(pycurl.HEADERFUNCTION, p.feed_data)
c.perform()

output:

PycURL/7.45.2 libcurl/7.76.1 OpenSSL/1.1.1u zlib/1.2.11 libssh2/1.9.0 nghttp2/1.43.0
0.5.0
on_message_begin

---------------------------------------------------------------------------
HttpParserError                           Traceback (most recent call last)
httptools/parser/parser.pyx in httptools.parser.parser.HttpParser.feed_data()

HttpParserError: Expected dot

---------------------------------------------------------------------------
error                                     Traceback (most recent call last)
/tmp/ipykernel_16171/3019824725.py in <module>
     37 c.setopt(pycurl.WRITEFUNCTION, p.feed_data)
     38 c.setopt(pycurl.HEADERFUNCTION, p.feed_data)
---> 39 c.perform()

error: (23, 'Failed writing header')

by the way, how can i let httptools auto decode with Accept-Encoding: gzip,deflate

fantix commented 1 year ago

llhttp does not support HTTP/2, and httptools inherits that.

Looks like https://uvloop.readthedocs.io/ (the website of readthedocs.io, not uvloop) uses HTTP/2 by default.