simonw / datasette

An open source multi-tool for exploring and publishing data
https://datasette.io
Apache License 2.0
9.41k stars 671 forks source link

Get Datasette compatible with Pyodide #1733

Closed simonw closed 2 years ago

simonw commented 2 years ago

I've already got this working as a prototype. Here are the changes I had to make:

TODO:

Goal is to be able to do the following directly in https://pyodide.org/en/stable/console.html

import micropip
await micropip.install("datasette")
from datasette.app import Datasette
ds = Datasette()
await ds.client.get("/.json")
simonw commented 2 years ago

I released a click-default-group-wheel package to solve that dependency issue. I've already upgraded sqlite-utils to that, so now you can use that in Pyodide:

python-baseconv is only used for actor cookie expiration times:

https://github.com/simonw/datasette/blob/0a7621f96f8ad14da17e7172e8a7bce24ef78966/datasette/actor_auth_cookie.py#L16-L20

Datasette never actually sets that cookie itself - it instead encourages plugins to set it in the authentication documentation here: https://docs.datasette.io/en/0.61.1/authentication.html#including-an-expiry-time

simonw commented 2 years ago

I was going to vendor `baseconv.py`, but then I reconsidered - what if there are plugins out there that expect `import baseconv` to work because they depend on Datasette?

I used https://cs.github.com/ and as far as I can tell there aren't any!

So I'm going to remove that dependency and work out a smarter way to do this - probably by providing a utility function within Datasette itself.

simonw commented 2 years ago

Here's the full diff I applied to Datasette to get it fully working in Pyodide:

https://github.com/simonw/datasette/compare/94a3171b01fde5c52697aeeff052e3ad4bab5391...8af32bc5b03c30b1f7a4a8cc4bd80eb7e2ee7b81

And as a visible diff:

diff --git a/datasette/app.py b/datasette/app.py
index d269372..6c0c5fc 100644
--- a/datasette/app.py
+++ b/datasette/app.py
@@ -15,7 +15,6 @@ import pkg_resources
 import re
 import secrets
 import sys
-import threading
 import traceback
 import urllib.parse
 from concurrent import futures
@@ -26,7 +25,6 @@ from itsdangerous import URLSafeSerializer
 from jinja2 import ChoiceLoader, Environment, FileSystemLoader, PrefixLoader
 from jinja2.environment import Template
 from jinja2.exceptions import TemplateNotFound
-import uvicorn

 from .views.base import DatasetteError, ureg
 from .views.database import DatabaseDownload, DatabaseView
@@ -813,7 +811,6 @@ class Datasette:
             },
             "datasette": datasette_version,
             "asgi": "3.0",
-            "uvicorn": uvicorn.__version__,
             "sqlite": {
                 "version": sqlite_version,
                 "fts_versions": fts_versions,
@@ -854,23 +851,7 @@ class Datasette:
         ]

     def _threads(self):
-        threads = list(threading.enumerate())
-        d = {
-            "num_threads": len(threads),
-            "threads": [
-                {"name": t.name, "ident": t.ident, "daemon": t.daemon} for t in threads
-            ],
-        }
-        # Only available in Python 3.7+
-        if hasattr(asyncio, "all_tasks"):
-            tasks = asyncio.all_tasks()
-            d.update(
-                {
-                    "num_tasks": len(tasks),
-                    "tasks": [_cleaner_task_str(t) for t in tasks],
-                }
-            )
-        return d
+        return {"num_threads": 0, "threads": []}

     def _actor(self, request):
         return {"actor": request.actor}
diff --git a/datasette/database.py b/datasette/database.py
index ba594a8..b50142d 100644
--- a/datasette/database.py
+++ b/datasette/database.py
@@ -4,7 +4,6 @@ from pathlib import Path
 import janus
 import queue
 import sys
-import threading
 import uuid

 from .tracer import trace
@@ -21,8 +20,6 @@ from .utils import (
 )
 from .inspect import inspect_hash

-connections = threading.local()
-
 AttachedDatabase = namedtuple("AttachedDatabase", ("seq", "name", "file"))

@@ -43,12 +40,12 @@ class Database:
         self.hash = None
         self.cached_size = None
         self._cached_table_counts = None
-        self._write_thread = None
-        self._write_queue = None
         if not self.is_mutable and not self.is_memory:
             p = Path(path)
             self.hash = inspect_hash(p)
             self.cached_size = p.stat().st_size
+        self._read_connection = None
+        self._write_connection = None

     @property
     def cached_table_counts(self):
@@ -134,60 +131,17 @@ class Database:
         return results

     async def execute_write_fn(self, fn, block=True):
-        task_id = uuid.uuid5(uuid.NAMESPACE_DNS, "datasette.io")
-        if self._write_queue is None:
-            self._write_queue = queue.Queue()
-        if self._write_thread is None:
-            self._write_thread = threading.Thread(
-                target=self._execute_writes, daemon=True
-            )
-            self._write_thread.start()
-        reply_queue = janus.Queue()
-        self._write_queue.put(WriteTask(fn, task_id, reply_queue))
-        if block:
-            result = await reply_queue.async_q.get()
-            if isinstance(result, Exception):
-                raise result
-            else:
-                return result
-        else:
-            return task_id
-
-    def _execute_writes(self):
-        # Infinite looping thread that protects the single write connection
-        # to this database
-        conn_exception = None
-        conn = None
-        try:
-            conn = self.connect(write=True)
-            self.ds._prepare_connection(conn, self.name)
-        except Exception as e:
-            conn_exception = e
-        while True:
-            task = self._write_queue.get()
-            if conn_exception is not None:
-                result = conn_exception
-            else:
-                try:
-                    result = task.fn(conn)
-                except Exception as e:
-                    sys.stderr.write("{}\n".format(e))
-                    sys.stderr.flush()
-                    result = e
-            task.reply_queue.sync_q.put(result)
+        # We always treat it as if block=True now
+        if self._write_connection is None:
+            self._write_connection = self.connect(write=True)
+            self.ds._prepare_connection(self._write_connection, self.name)
+        return fn(self._write_connection)

     async def execute_fn(self, fn):
-        def in_thread():
-            conn = getattr(connections, self.name, None)
-            if not conn:
-                conn = self.connect()
-                self.ds._prepare_connection(conn, self.name)
-                setattr(connections, self.name, conn)
-            return fn(conn)
-
-        return await asyncio.get_event_loop().run_in_executor(
-            self.ds.executor, in_thread
-        )
+        if self._read_connection is None:
+            self._read_connection = self.connect()
+            self.ds._prepare_connection(self._read_connection, self.name)
+        return fn(self._read_connection)

     async def execute(
         self,
diff --git a/setup.py b/setup.py
index 7f0562f..c41669c 100644
--- a/setup.py
+++ b/setup.py
@@ -44,20 +44,20 @@ setup(
     install_requires=[
         "asgiref>=3.2.10,<3.6.0",
         "click>=7.1.1,<8.2.0",
-        "click-default-group~=1.2.2",
+        # "click-default-group~=1.2.2",
         "Jinja2>=2.10.3,<3.1.0",
         "hupper~=1.9",
         "httpx>=0.20",
         "pint~=0.9",
         "pluggy>=1.0,<1.1",
-        "uvicorn~=0.11",
+        # "uvicorn~=0.11",
         "aiofiles>=0.4,<0.9",
         "janus>=0.6.2,<1.1",
         "asgi-csrf>=0.9",
         "PyYAML>=5.3,<7.0",
         "mergedeep>=1.1.1,<1.4.0",
         "itsdangerous>=1.1,<3.0",
-        "python-baseconv==1.2.2",
+        # "python-baseconv==1.2.2",
     ],
     entry_points="""
         [console_scripts]
simonw commented 2 years ago

Maybe I can leave uvicorn as a dependency? Installing it works OK, it only generates errors when you try to import it:

Welcome to the Pyodide terminal emulator 🐍
Python 3.10.2 (main, Apr  9 2022 20:52:01) on WebAssembly VM
Type "help", "copyright", "credits" or "license" for more information.
>>> import micropip
>>> await micropip.install("uvicorn")
>>> import uvicorn
Traceback (most recent call last):
  File "<console>", line 1, in <module>
  File "/lib/python3.10/site-packages/uvicorn/__init__.py", line 1, in <module>
    from uvicorn.config import Config
  File "/lib/python3.10/site-packages/uvicorn/config.py", line 8, in <module>
    import ssl
  File "/lib/python3.10/ssl.py", line 98, in <module>
    import _ssl             # if we can't import it, let the error propagate
ModuleNotFoundError: No module named '_ssl'
>>> import ssl
>>> import uvicorn
Traceback (most recent call last):
  File "<console>", line 1, in <module>
  File "/lib/python3.10/site-packages/uvicorn/__init__.py", line 2, in <module>
    from uvicorn.main import Server, main, run
  File "/lib/python3.10/site-packages/uvicorn/main.py", line 24, in <module>
    from uvicorn.supervisors import ChangeReload, Multiprocess
  File "/lib/python3.10/site-packages/uvicorn/supervisors/__init__.py", line 3, in <module>
    from uvicorn.supervisors.basereload import BaseReload
  File "/lib/python3.10/site-packages/uvicorn/supervisors/basereload.py", line 12, in <module>
    from uvicorn.subprocess import get_subprocess
  File "/lib/python3.10/site-packages/uvicorn/subprocess.py", line 14, in <module>
    multiprocessing.allow_connection_pickling()
  File "/lib/python3.10/multiprocessing/context.py", line 170, in allow_connection_pickling
    from . import connection
  File "/lib/python3.10/multiprocessing/connection.py", line 21, in <module>
    import _multiprocessing
ModuleNotFoundError: No module named '_multiprocessing'
>>> import multiprocessing
>>> import uvicorn
Traceback (most recent call last):
  File "<console>", line 1, in <module>
  File "/lib/python3.10/site-packages/uvicorn/__init__.py", line 2, in <module>
    from uvicorn.main import Server, main, run
  File "/lib/python3.10/site-packages/uvicorn/main.py", line 24, in <module>
    from uvicorn.supervisors import ChangeReload, Multiprocess
  File "/lib/python3.10/site-packages/uvicorn/supervisors/__init__.py", line 3, in <module>
    from uvicorn.supervisors.basereload import BaseReload
  File "/lib/python3.10/site-packages/uvicorn/supervisors/basereload.py", line 12, in <module>
    from uvicorn.subprocess import get_subprocess
  File "/lib/python3.10/site-packages/uvicorn/subprocess.py", line 14, in <module>
    multiprocessing.allow_connection_pickling()
  File "/lib/python3.10/multiprocessing/context.py", line 170, in allow_connection_pickling
    from . import connection
  File "/lib/python3.10/multiprocessing/connection.py", line 21, in <module>
    import _multiprocessing
ModuleNotFoundError: No module named '_multiprocessing'
>>> 

Since the `import ssl` trick fixed the `_ssl` error, I was hopeful that `import multiprocessing` could fix the `_multiprocessing` one, but sadly it did not.

But it looks like I can address this issue just by making `import uvicorn` in `app.py` an optional import.

simonw commented 2 years ago

I'm going to add a Datasette setting to disable threading entirely, designed for usage in this particular case.

I thought about adding a new setting, but then I noticed that an existing setting already controls the number of SQL threads:

datasette mydatabase.db --setting num_sql_threads 10

I'm going to let users set that to 0 to disable threaded execution of SQL queries.

simonw commented 2 years ago

I'll release this as a 0.62a0 as soon as it's ready, so I can start testing it out in Pyodide for real.

simonw commented 2 years ago

I got a build from the pyodide branch to work!

Welcome to the Pyodide terminal emulator 🐍
Python 3.10.2 (main, Apr  9 2022 20:52:01) on WebAssembly VM
Type "help", "copyright", "credits" or "license" for more information.
>>> import micropip
>>> await micropip.install("https://s3.amazonaws.com/simonwillison-cors-allowed-public/datasette-0.62a0-py3-none-any.whl")
Traceback (most recent call last):
  File "<console>", line 1, in <module>
  File "/lib/python3.10/asyncio/futures.py", line 284, in __await__
    yield self  # This tells Task to wait for completion.
  File "/lib/python3.10/asyncio/tasks.py", line 304, in __wakeup
    future.result()
  File "/lib/python3.10/asyncio/futures.py", line 201, in result
    raise self._exception
  File "/lib/python3.10/asyncio/tasks.py", line 234, in __step
    result = coro.throw(exc)
  File "/lib/python3.10/site-packages/micropip/_micropip.py", line 183, in install
    transaction = await self.gather_requirements(requirements, ctx, keep_going)
  File "/lib/python3.10/site-packages/micropip/_micropip.py", line 173, in gather_requirements
    await gather(*requirement_promises)
  File "/lib/python3.10/asyncio/futures.py", line 284, in __await__
    yield self  # This tells Task to wait for completion.
  File "/lib/python3.10/asyncio/tasks.py", line 304, in __wakeup
    future.result()
  File "/lib/python3.10/asyncio/futures.py", line 201, in result
    raise self._exception
  File "/lib/python3.10/asyncio/tasks.py", line 232, in __step
    result = coro.send(None)
  File "/lib/python3.10/site-packages/micropip/_micropip.py", line 245, in add_requirement
    await self.add_wheel(name, wheel, version, (), ctx, transaction)
  File "/lib/python3.10/site-packages/micropip/_micropip.py", line 316, in add_wheel
    await self.add_requirement(recurs_req, ctx, transaction)
  File "/lib/python3.10/site-packages/micropip/_micropip.py", line 291, in add_requirement
    await self.add_wheel(
  File "/lib/python3.10/site-packages/micropip/_micropip.py", line 316, in add_wheel
    await self.add_requirement(recurs_req, ctx, transaction)
  File "/lib/python3.10/site-packages/micropip/_micropip.py", line 291, in add_requirement
    await self.add_wheel(
  File "/lib/python3.10/site-packages/micropip/_micropip.py", line 316, in add_wheel
    await self.add_requirement(recurs_req, ctx, transaction)
  File "/lib/python3.10/site-packages/micropip/_micropip.py", line 276, in add_requirement
    raise ValueError(
ValueError: Requested 'h11<0.13,>=0.11', but h11==0.13.0 is already installed
>>> await micropip.install("https://s3.amazonaws.com/simonwillison-cors-allowed-public/datasette-0.62a0-py3-none-any.whl")
Traceback (most recent call last):
  File "<console>", line 1, in <module>
  File "/lib/python3.10/asyncio/futures.py", line 284, in __await__
    yield self  # This tells Task to wait for completion.
  File "/lib/python3.10/asyncio/tasks.py", line 304, in __wakeup
    future.result()
  File "/lib/python3.10/asyncio/futures.py", line 201, in result
    raise self._exception
  File "/lib/python3.10/asyncio/tasks.py", line 234, in __step
    result = coro.throw(exc)
  File "/lib/python3.10/site-packages/micropip/_micropip.py", line 183, in install
    transaction = await self.gather_requirements(requirements, ctx, keep_going)
  File "/lib/python3.10/site-packages/micropip/_micropip.py", line 173, in gather_requirements
    await gather(*requirement_promises)
  File "/lib/python3.10/asyncio/futures.py", line 284, in __await__
    yield self  # This tells Task to wait for completion.
  File "/lib/python3.10/asyncio/tasks.py", line 304, in __wakeup
    future.result()
  File "/lib/python3.10/asyncio/futures.py", line 201, in result
    raise self._exception
  File "/lib/python3.10/asyncio/tasks.py", line 232, in __step
    result = coro.send(None)
  File "/lib/python3.10/site-packages/micropip/_micropip.py", line 245, in add_requirement
    await self.add_wheel(name, wheel, version, (), ctx, transaction)
  File "/lib/python3.10/site-packages/micropip/_micropip.py", line 316, in add_wheel
    await self.add_requirement(recurs_req, ctx, transaction)
  File "/lib/python3.10/site-packages/micropip/_micropip.py", line 291, in add_requirement
    await self.add_wheel(
  File "/lib/python3.10/site-packages/micropip/_micropip.py", line 316, in add_wheel
    await self.add_requirement(recurs_req, ctx, transaction)
  File "/lib/python3.10/site-packages/micropip/_micropip.py", line 291, in add_requirement
    await self.add_wheel(
  File "/lib/python3.10/site-packages/micropip/_micropip.py", line 316, in add_wheel
    await self.add_requirement(recurs_req, ctx, transaction)
  File "/lib/python3.10/site-packages/micropip/_micropip.py", line 276, in add_requirement
    raise ValueError(
ValueError: Requested 'h11<0.13,>=0.11', but h11==0.13.0 is already installed
>>> await micropip.install("h11==0.12")
>>> await micropip.install("https://s3.amazonaws.com/simonwillison-cors-allowed-public/datasette-0.62a0-py3-none-any.whl")
>>> import datasette
>>> from datasette.app import Datasette
Traceback (most recent call last):
  File "<console>", line 1, in <module>
  File "/lib/python3.10/site-packages/datasette/app.py", line 9, in <module>
    import httpx
  File "/lib/python3.10/site-packages/httpx/__init__.py", line 2, in <module>
    from ._api import delete, get, head, options, patch, post, put, request, stream
  File "/lib/python3.10/site-packages/httpx/_api.py", line 4, in <module>
    from ._client import Client
  File "/lib/python3.10/site-packages/httpx/_client.py", line 9, in <module>
    from ._auth import Auth, BasicAuth, FunctionAuth
  File "/lib/python3.10/site-packages/httpx/_auth.py", line 10, in <module>
    from ._models import Request, Response
  File "/lib/python3.10/site-packages/httpx/_models.py", line 16, in <module>
    from ._content import ByteStream, UnattachedStream, encode_request, encode_response
  File "/lib/python3.10/site-packages/httpx/_content.py", line 17, in <module>
    from ._multipart import MultipartStream
  File "/lib/python3.10/site-packages/httpx/_multipart.py", line 7, in <module>
    from ._types import (
  File "/lib/python3.10/site-packages/httpx/_types.py", line 5, in <module>
    import ssl
  File "/lib/python3.10/ssl.py", line 98, in <module>
    import _ssl             # if we can't import it, let the error propagate
ModuleNotFoundError: No module named '_ssl'
>>> import ssl
>>> from datasette.app import Datasette
Traceback (most recent call last):
  File "<console>", line 1, in <module>
  File "/lib/python3.10/site-packages/datasette/app.py", line 14, in <module>
    import pkg_resources
ModuleNotFoundError: No module named 'pkg_resources'
>>> import setuptools
>>> from datasette.app import Datasette
>>> ds = Datasette(memory=True)
>>> ds
<datasette.app.Datasette object at 0x1cc4fb8>
>>> await ds.client.get("/")
Traceback (most recent call last):
  File "/lib/python3.10/site-packages/datasette/app.py", line 1268, in route_path
    response = await view(request, send)
  File "/lib/python3.10/site-packages/datasette/views/base.py", line 134, in view
    return await self.dispatch_request(request)
  File "/lib/python3.10/site-packages/datasette/views/base.py", line 89, in dispatch_request
    await self.ds.refresh_schemas()
  File "/lib/python3.10/site-packages/datasette/app.py", line 353, in refresh_schemas
    await self._refresh_schemas()
  File "/lib/python3.10/site-packages/datasette/app.py", line 358, in _refresh_schemas
    await init_internal_db(internal_db)
  File "/lib/python3.10/site-packages/datasette/utils/internal_db.py", line 65, in init_internal_db
    await db.execute_write_script(create_tables_sql)
  File "/lib/python3.10/site-packages/datasette/database.py", line 116, in execute_write_script
    results = await self.execute_write_fn(_inner, block=block)
  File "/lib/python3.10/site-packages/datasette/database.py", line 155, in execute_write_fn
    self._write_thread.start()
  File "/lib/python3.10/threading.py", line 928, in start
    _start_new_thread(self._bootstrap, ())
RuntimeError: can't start new thread
<Response [500 Internal Server Error]>
>>> ds = Datasette(memory=True, settings={"num_sql_threads": 0})
>>> await ds.client.get("/")
<Response [200 OK]>
>>> (await ds.client.get("/")).text
'<!DOCTYPE html>\n<html>\n<head>\n    <title>Datasette: _memory</title>\n    <link rel="stylesheet" href="/-/static/app.css
?cead5a">\n    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">\n\n<link rel="alterna
te" type="application/json+datasette" href="http://localhost/.json"></head>\n<body class="index">\n<div class="not-footer">
\n<header><nav>\n    \n    \n</nav></header>\n\n\n\n    \n\n\n\n<section class="content">\n\n<h1>Datasette</h1>\n\n\n\n\n\n
    <h2 
<long output truncated>
r detailsClickedWithin = null;\n    while (target && target.tagName != \'DETAILS\') {\n        target = target.parentNode;\
n    }\n    if (target && target.tagName == \'DETAILS\') {\n        detailsClickedWithin = target;\n    }\n    Array.from(d
ocument.getElementsByTagName(\'details\')).filter(\n        (details) => details.open && details != detailsClickedWithin\n 
   ).forEach(details => details.open = false);\n});\n</script>\n\n\n\n<!-- Templates considered: *index.html -->\n</body>\n
</html>'
>>> 

That ValueError: Requested 'h11<0.13,>=0.11', but h11==0.13.0 is already installed error is annoying. I assume it's a uvicorn dependency clash of some sort, because I wasn't getting that when I removed uvicorn as a dependency.

I can avoid it by running this first though:

await micropip.install("h11==0.12")
simonw commented 2 years ago

This is good enough to push an alpha.

simonw commented 2 years ago

That alpha release works!

https://pyodide.org/en/stable/console.html

Welcome to the Pyodide terminal emulator 🐍
Python 3.10.2 (main, Apr  9 2022 20:52:01) on WebAssembly VM
Type "help", "copyright", "credits" or "license" for more information.
>>> import micropip
>>> await micropip.install("datasette==0.62a0")
>>> import ssl
>>> import setuptools
>>> from datasette.app import Datasette
>>> ds = Datasette(memory=True, settings={"num_sql_threads": 0})
>>> await ds.client.get("/.json")
<Response [200 OK]>
>>> (await ds.client.get("/.json")).json()
{'_memory': {'name': '_memory', 'hash': None, 'color': 'a6c7b9', 'path': '/_memory', 'tables_and_views_truncated': [], 'tab
les_and_views_more': False, 'tables_count': 0, 'table_rows_sum': 0, 'show_table_row_counts': False, 'hidden_table_rows_sum'
: 0, 'hidden_tables_count': 0, 'views_count': 0, 'private': False}}
>>>