Quickly doing a naive memoize:
diff --git a/bids-validator/src/files/tsv.ts b/bids-validator/src/files/tsv.ts
index 97af2cb3..2d4f6172 100644
--- a/bids-validator/src/files/tsv.ts
+++ b/bids-validator/src/files/tsv.ts
@@ -3,11 +3,19 @@
* Module for parsing TSV
*/
import { ColumnsMap } from '../types/columns.ts'
+import { BIDSFile } from '../types/filetree.ts'
+import { memoizeAsync } from '../utils/memoize.ts'
const normalizeEOL = (str: string): string => str.replace(/\r\n/g, '\n').replace(/\r/g, '\n')
// Typescript resolved `row && !/^\s*$/.test(row)` as `string | boolean`
const isContentfulRow = (row: string): boolean => !!(row && !/^\s*$/.test(row))
+async function _loadTSV(file: BIDSFile): Promise<ColumnsMap> {
+ return await file.text().then(parseTSV)
+}
+
+export const loadTSV = memoizeAsync(_loadTSV)
+
export function parseTSV(contents: string) {
const columns = new ColumnsMap()
const rows: string[][] = normalizeEOL(contents)
diff --git a/bids-validator/src/schema/associations.ts b/bids-validator/src/schema/associations.ts
index a335febd..227b4394 100644
--- a/bids-validator/src/schema/associations.ts
+++ b/bids-validator/src/schema/associations.ts
@@ -2,7 +2,7 @@ import { ContextAssociations, ContextAssociationsEvents } from '../types/context
import { BIDSFile, FileTree } from '../types/filetree.ts'
import { BIDSContext } from './context.ts'
import { readEntities } from './entities.ts'
-import { parseTSV } from '../files/tsv.ts'
+import { loadTSV } from '../files/tsv.ts'
import { parseBvalBvec } from '../files/dwi.ts'
import { walkBack } from '../files/inheritance.ts'
@@ -25,8 +25,7 @@ const associationLookup = {
extensions: ['.tsv'],
inherit: true,
load: async (file: BIDSFile): Promise<ContextAssociations['events']> => {
- const text = await file.text()
- const columns = parseTSV(text)
+ const columns = await loadTSV(file)
return {
path: file.path,
onset: columns.get('onset') || [],
@@ -40,8 +39,7 @@ const associationLookup = {
load: async (
file: BIDSFile,
): Promise<ContextAssociations['aslcontext']> => {
- const contents = await file.text()
- const columns = parseTSV(contents)
+ const columns = await loadTSV(file)
return {
path: file.path,
n_rows: columns.get('volume_type')?.length || 0,
@@ -107,8 +105,7 @@ const associationLookup = {
extensions: ['.tsv'],
inherit: true,
load: async (file: BIDSFile): Promise<ContextAssociations['channels']> => {
- const contents = await file.text()
- const columns = parseTSV(contents)
+ const columns = await loadTSV(file)
return {
path: file.path,
type: columns.get('type'),
diff --git a/bids-validator/src/schema/context.ts b/bids-validator/src/schema/context.ts
index 2a1c1d3b..764999b4 100644
--- a/bids-validator/src/schema/context.ts
+++ b/bids-validator/src/schema/context.ts
@@ -11,7 +11,7 @@ import { ColumnsMap } from '../types/columns.ts'
import { BIDSEntities, readEntities } from './entities.ts'
import { DatasetIssues } from '../issues/datasetIssues.ts'
import { walkBack } from '../files/inheritance.ts'
-import { parseTSV } from '../files/tsv.ts'
+import { loadTSV } from '../files/tsv.ts'
import { loadHeader } from '../files/nifti.ts'
import { buildAssociations } from './associations.ts'
import { ValidatorOptions } from '../setup/options.ts'
@@ -163,9 +163,7 @@ export class BIDSContext implements Context {
return
}
- this.columns = await this.file
- .text()
- .then((text) => parseTSV(text))
+ this.columns = await loadTSV(this.file)
.catch((error) => {
if (error.key) {
this.issues.addNonSchemaIssue(error.key, [this.file])
@@ -210,8 +208,7 @@ export class BIDSContext implements Context {
(file) => file.name === 'participants.tsv',
)
if (participants_tsv) {
- const participantsText = await participants_tsv.text()
- const participantsData = parseTSV(participantsText)
+ const participantsData = await loadTSV(participants_tsv)
this.dataset.subjects.participant_id = participantsData[
'participant_id'
] as string[]
@@ -226,8 +223,7 @@ export class BIDSContext implements Context {
// Collect observed participant_ids
const seen = new Set() as Set<string>
for (const file of phenotypeFiles) {
- const phenotypeText = await file.text()
- const phenotypeData = parseTSV(phenotypeText)
+ const phenotypeData = await loadTSV(file)
const participant_id = phenotypeData['participant_id'] as string[]
if (participant_id) {
participant_id.forEach((id) => seen.add(id))
diff --git a/bids-validator/src/utils/memoize.ts b/bids-validator/src/utils/memoize.ts
index 213c7233..227b4003 100644
--- a/bids-validator/src/utils/memoize.ts
+++ b/bids-validator/src/utils/memoize.ts
@@ -8,3 +8,14 @@ export const memoize = <T>(
cached.cache = cache
return cached
}
+
+export const memoizeAsync = <T>(
+ fn: (...args: any[]) => Promise<T>,
+): (...args: any[]) => Promise<T> => {
+ const cache = new Map()
+ const cached = async function (this: any, val: T) {
+ return cache.has(val) ? cache.get(val) : cache.set(val, await fn.call(this, val)) && cache.get(val)
+ }
+ cached.cache = cache
+ return cached
+}
Results in excessive memory consumption:
❯ BIDS_SCHEMA=~/Projects/bids/specification/src/schema.json deno run --v8-flags="--max-heap-size=20480" --reload -A $HOME/Projects/bids/bids-validator/bids-validator/src/bids-validator.ts .
<--- Last few GCs --->
[677459:0x6480d98d6000] 75023 ms: Scavenge 3648.9 (3671.0) -> 3645.0 (3671.0) MB, pooled: 0 MB, 14.21 / 0.00 ms (average mu = 0.932, current mu = 0.884) allocation failure;
[677459:0x6480d98d6000] 75040 ms: Scavenge 3645.0 (3671.0) -> 3645.0 (3671.0) MB, pooled: 0 MB, 17.38 / 0.00 ms (average mu = 0.932, current mu = 0.884) allocation failure;
<--- JS stacktrace --->
#
# Fatal JavaScript out of memory: MarkCompactCollector: young object promotion failed
#
==== C stack trace ===============================
deno(+0x2ba38a3) [0x6480d57428a3]
deno(+0x2ba2b8b) [0x6480d5741b8b]
deno(+0x2b9e448) [0x6480d573d448]
deno(+0x2bf0471) [0x6480d578f471]
deno(+0x2d9fc07) [0x6480d593ec07]
deno(+0x2e012b3) [0x6480d59a02b3]
deno(+0x2e00b45) [0x6480d599fb45]
deno(+0x2e00864) [0x6480d599f864]
deno(+0x2e0f8d7) [0x6480d59ae8d7]
deno(+0x2ba4696) [0x6480d5743696]
deno(+0x2ba4bc3) [0x6480d5743bc3]
deno(+0x2e028fc) [0x6480d59a18fc]
deno(+0x2df2496) [0x6480d5991496]
deno(+0x2def124) [0x6480d598e124]
deno(+0x2da26f9) [0x6480d59416f9]
deno(+0x2da1f94) [0x6480d5940f94]
deno(+0x2db2db6) [0x6480d5951db6]
deno(+0x2db287f) [0x6480d595187f]
deno(+0x35e8e8b) [0x6480d6187e8b]
[1] 677459 trace trap (core dumped) BIDS_SCHEMA=~/Projects/bids/specification/src/schema.json deno run --reload
Sidecars and associated files may be loaded from disk once for each data file they apply to, in addition to being loaded for their own validation; for example, a top-level events.tsv can be re-read and re-parsed once per matching run. It would be good to cache these to avoid both the I/O and parsing costs of repeated loads. However, in a dataset with many files, and with sidecars in the leaf directories rather than at the root, a full memoization retains every parsed table for the lifetime of the run and uses an excessive amount of memory, as shown above.
Consider a two-level cache: as we walk the file tree, we could clear, for example, the /sub-01 subcache when we move on to /sub-02, but /T1w.json will be in the / subcache, so it would not be reloaded.
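A minimal sketch of how that could look, assuming cache keys are validator-style paths that start with '/'; the TwoLevelCache name, its methods, and the purge-as-we-walk wiring are hypothetical illustrations, not existing validator API:

// Hypothetical sketch: one sub-cache per directory, so a whole
// directory's entries can be dropped at once while entries in
// parent scopes (e.g. /T1w.json in the root sub-cache) survive.
type Loader<T> = (path: string) => Promise<T>

class TwoLevelCache<T> {
  private scopes = new Map<string, Map<string, Promise<T>>>()

  constructor(private load: Loader<T>) {}

  get(path: string): Promise<T> {
    // First level: the containing directory
    // (assumes paths beginning with '/'; '/' is the root scope)
    const dir = path.slice(0, path.lastIndexOf('/')) || '/'
    let scope = this.scopes.get(dir)
    if (!scope) {
      scope = new Map()
      this.scopes.set(dir, scope)
    }
    // Second level: the file itself. Caching the promise rather than
    // the resolved value lets concurrent callers share a single load.
    let result = scope.get(path)
    if (!result) {
      result = this.load(path)
      scope.set(path, result)
    }
    return result
  }

  // Drop every sub-cache under a subtree, e.g. purge('/sub-01') when
  // the walk moves on to /sub-02. The root scope ('/') never matches
  // a subject prefix, so root-level sidecars stay cached.
  purge(prefix: string): void {
    for (const dir of [...this.scopes.keys()]) {
      if (dir === prefix || dir.startsWith(prefix + '/')) {
        this.scopes.delete(dir)
      }
    }
  }
}

Wired up for TSVs, this might replace memoizeAsync as something like new TwoLevelCache((path) => Deno.readTextFile(path).then(parseTSV)), with the tree walker calling purge() as it finishes each subject directory. Only the per-directory grouping and the prefix purge matter here, not the exact API.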