simonw / datasette-lite

Datasette running in your browser using WebAssembly and Pyodide
https://lite.datasette.io
Apache License 2.0

JSONL support #62

Closed: simonw closed this issue 1 year ago

simonw commented 1 year ago

The OpenAI Evals repo uses JSONL, not JSON. It would be useful to be able to load files from there.

e.g. https://github.com/openai/evals/blob/main/evals/registry/data/reverse_string/reverse_string.jsonl
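
For context, JSONL stores one JSON object per line rather than a single top-level value. A minimal parsing sketch in Python (the local filename is illustrative only):

import json

records = []
with open("reverse_string.jsonl", "rb") as fp:
    for line in fp:
        line = line.strip()
        if line:  # skip blank lines, e.g. a trailing newline
            records.append(json.loads(line))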

simonw commented 1 year ago

Interesting catch with this particular URL: it's stored in Git LFS, which means Datasette Lite can't currently figure out where the file itself is hosted.

Here the CORS-enabled file to load is https://media.githubusercontent.com/media/openai/evals/main/evals/registry/data/reverse_string/reverse_string.jsonl
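
For reference, the blob URL appears to map onto the media URL mechanically. A sketch of that mapping, based only on the pair of URLs above (as noted, Datasette Lite does not currently work this out for LFS-hosted files):

def lfs_media_url(blob_url):
    # https://github.com/{owner}/{repo}/blob/{ref}/{path}
    # -> https://media.githubusercontent.com/media/{owner}/{repo}/{ref}/{path}
    prefix = "https://github.com/"
    if not blob_url.startswith(prefix):
        raise ValueError("expected a github.com URL")
    owner, repo, blob, rest = blob_url[len(prefix):].split("/", 3)
    if blob != "blob":
        raise ValueError("expected a /blob/ URL")
    return f"https://media.githubusercontent.com/media/{owner}/{repo}/{rest}"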

simonw commented 1 year ago

Got this working with a new, undocumented ?jsonl= option:

diff --git a/index.html b/index.html
index 29d0ce7..a81563b 100644
--- a/index.html
+++ b/index.html
@@ -113,9 +113,10 @@ const initialUrl = fixUrl(urlParams.get('url'));
 const csvUrls = urlParams.getAll('csv').map(fixUrl);
 const sqlUrls = urlParams.getAll('sql').map(fixUrl);
 const jsonUrls = urlParams.getAll('json').map(fixUrl);
+const jsonlUrls = urlParams.getAll('jsonl').map(fixUrl);
 const installUrls = urlParams.getAll('install');

-datasetteWorker.postMessage({type: 'startup', initialUrl, csvUrls, sqlUrls, jsonUrls, installUrls});
+datasetteWorker.postMessage({type: 'startup', initialUrl, csvUrls, sqlUrls, jsonUrls, jsonlUrls, installUrls});

 let loadingLogs = ["Loading..."];

diff --git a/webworker.js b/webworker.js
index 7ab440d..de09b1b 100644
--- a/webworker.js
+++ b/webworker.js
@@ -10,6 +10,7 @@ async function startDatasette(settings) {
   let csvs = [];
   let sqls = [];
   let jsons = [];
+  let jsonls = [];
   let needsDataDb = false;
   let shouldLoadDefaults = true;
   if (settings.initialUrl) {
@@ -32,6 +33,11 @@ async function startDatasette(settings) {
     needsDataDb = true;
     shouldLoadDefaults = false;
   }
+  if (settings.jsonlUrls && settings.jsonlUrls.length) {
+    jsonls = settings.jsonlUrls;
+    needsDataDb = true;
+    shouldLoadDefaults = false;
+  }
   if (needsDataDb) {
     toLoad.push(["data.db", 0]);
   }
@@ -81,7 +87,8 @@ async function startDatasette(settings) {
     # Import data from ?csv=URL CSV files/?json=URL JSON files
     csvs = ${JSON.stringify(csvs)}
     jsons = ${JSON.stringify(jsons)}
-    if csvs or jsons:
+    jsonls = ${JSON.stringify(jsonls)}
+    if csvs or jsons or jsonls:
         await micropip.install("sqlite-utils==3.28")
         import sqlite_utils, json
         from sqlite_utils.utils import rows_from_file, TypeTracker, Format
@@ -109,7 +116,12 @@ async function startDatasette(settings) {
             db[bit].transform(
                 types=tracker.types
             )
-        for json_url in jsons:
+        json_to_do = [
+          ('json', url) for url in jsons
+        ] + [
+          ('jsonl', url) for url in jsonls
+        ]
+        for jtype, json_url in json_to_do:
             bit = json_url.split("/")[-1].split(".")[0].split("?")[0]
             bit = bit.strip()
             if not bit:
@@ -122,7 +134,10 @@ async function startDatasette(settings) {
             table_names.add(bit)
             response = await pyfetch(json_url)
             with open("json.json", "wb") as fp:
-                json_data = json.loads(await response.bytes())
+                if jtype == "json":
+                    json_data = json.loads(await response.bytes())
+                else:
+                    json_data = [json.loads(line) for line in (await response.bytes()).split(b"\\n")]
             # If it's an object, try to find first key that's a list of objects
             if isinstance(json_data, dict):
                 for key, value in json_data.items():
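
For reference, the JSONL branch above is roughly equivalent to this standalone sqlite-utils snippet run outside Pyodide (urllib stands in for pyfetch, and the blank-line filter is an extra guard against a trailing newline, not something the diff does):

import json
import urllib.request

import sqlite_utils

url = (
    "https://media.githubusercontent.com/media/openai/evals/main/"
    "evals/registry/data/reverse_string/reverse_string.jsonl"
)
raw = urllib.request.urlopen(url).read()
# One JSON object per line; skip blank lines such as a trailing newline
rows = [json.loads(line) for line in raw.split(b"\n") if line.strip()]

db = sqlite_utils.Database("data.db")
db["reverse_string"].insert_all(rows)
print(db["reverse_string"].count, "rows imported")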

simonw commented 1 year ago

Or should I try to teach the ?json= option to also transparently spot JSONL files and load those instead?

That way I wouldn't need to add a "Load JSONL" button - the existing "Load JSON" button would work too.
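
One way that transparent detection could work (a sketch of the idea only, not necessarily what got shipped): try parsing the whole response body as JSON first, and fall back to line-by-line parsing if that fails.

import json

def parse_json_or_jsonl(body: bytes):
    try:
        return json.loads(body)
    except json.JSONDecodeError:
        # Treat the body as JSONL: one object per line, ignoring blank lines
        return [json.loads(line) for line in body.split(b"\n") if line.strip()]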

simonw commented 1 year ago

Demo:

And to show I didn't break regular JSON: