Quickly doing a naive memoize:
diff --git a/bids-validator/src/files/tsv.ts b/bids-validator/src/files/tsv.ts
index 97af2cb3..2d4f6172 100644
--- a/bids-validator/src/files/tsv.ts
+++ b/bids-validator/src/files/tsv.ts
@@ -3,11 +3,19 @@
* Module for parsing TSV
*/
import { ColumnsMap } from '../types/columns.ts'
+import { BIDSFile } from '../types/filetree.ts'
+import { memoizeAsync } from '../utils/memoize.ts'
const normalizeEOL = (str: string): string => str.replace(/\r\n/g, '\n').replace(/\r/g, '\n')
// Typescript resolved `row && !/^\s*$/.test(row)` as `string | boolean`
const isContentfulRow = (row: string): boolean => !!(row && !/^\s*$/.test(row))
+async function _loadTSV(file: BIDSFile): Promise<ColumnsMap> {
+ return await file.text().then(parseTSV)
+}
+
+export const loadTSV = memoizeAsync(_loadTSV)
+
export function parseTSV(contents: string) {
const columns = new ColumnsMap()
const rows: string[][] = normalizeEOL(contents)
diff --git a/bids-validator/src/schema/associations.ts b/bids-validator/src/schema/associations.ts
index a335febd..227b4394 100644
--- a/bids-validator/src/schema/associations.ts
+++ b/bids-validator/src/schema/associations.ts
@@ -2,7 +2,7 @@ import { ContextAssociations, ContextAssociationsEvents } from '../types/context
import { BIDSFile, FileTree } from '../types/filetree.ts'
import { BIDSContext } from './context.ts'
import { readEntities } from './entities.ts'
-import { parseTSV } from '../files/tsv.ts'
+import { loadTSV } from '../files/tsv.ts'
import { parseBvalBvec } from '../files/dwi.ts'
import { walkBack } from '../files/inheritance.ts'
@@ -25,8 +25,7 @@ const associationLookup = {
extensions: ['.tsv'],
inherit: true,
load: async (file: BIDSFile): Promise<ContextAssociations['events']> => {
- const text = await file.text()
- const columns = parseTSV(text)
+ const columns = await loadTSV(file)
return {
path: file.path,
onset: columns.get('onset') || [],
@@ -40,8 +39,7 @@ const associationLookup = {
load: async (
file: BIDSFile,
): Promise<ContextAssociations['aslcontext']> => {
- const contents = await file.text()
- const columns = parseTSV(contents)
+ const columns = await loadTSV(file)
return {
path: file.path,
n_rows: columns.get('volume_type')?.length || 0,
@@ -107,8 +105,7 @@ const associationLookup = {
extensions: ['.tsv'],
inherit: true,
load: async (file: BIDSFile): Promise<ContextAssociations['channels']> => {
- const contents = await file.text()
- const columns = parseTSV(contents)
+ const columns = await loadTSV(file)
return {
path: file.path,
type: columns.get('type'),
diff --git a/bids-validator/src/schema/context.ts b/bids-validator/src/schema/context.ts
index 2a1c1d3b..764999b4 100644
--- a/bids-validator/src/schema/context.ts
+++ b/bids-validator/src/schema/context.ts
@@ -11,7 +11,7 @@ import { ColumnsMap } from '../types/columns.ts'
import { BIDSEntities, readEntities } from './entities.ts'
import { DatasetIssues } from '../issues/datasetIssues.ts'
import { walkBack } from '../files/inheritance.ts'
-import { parseTSV } from '../files/tsv.ts'
+import { loadTSV } from '../files/tsv.ts'
import { loadHeader } from '../files/nifti.ts'
import { buildAssociations } from './associations.ts'
import { ValidatorOptions } from '../setup/options.ts'
@@ -163,9 +163,7 @@ export class BIDSContext implements Context {
return
}
- this.columns = await this.file
- .text()
- .then((text) => parseTSV(text))
+ this.columns = await loadTSV(this.file)
.catch((error) => {
if (error.key) {
this.issues.addNonSchemaIssue(error.key, [this.file])
@@ -210,8 +208,7 @@ export class BIDSContext implements Context {
(file) => file.name === 'participants.tsv',
)
if (participants_tsv) {
- const participantsText = await participants_tsv.text()
- const participantsData = parseTSV(participantsText)
+ const participantsData = await loadTSV(participants_tsv)
this.dataset.subjects.participant_id = participantsData[
'participant_id'
] as string[]
@@ -226,8 +223,7 @@ export class BIDSContext implements Context {
// Collect observed participant_ids
const seen = new Set() as Set<string>
for (const file of phenotypeFiles) {
- const phenotypeText = await file.text()
- const phenotypeData = parseTSV(phenotypeText)
+ const phenotypeData = await loadTSV(file)
const participant_id = phenotypeData['participant_id'] as string[]
if (participant_id) {
participant_id.forEach((id) => seen.add(id))
diff --git a/bids-validator/src/utils/memoize.ts b/bids-validator/src/utils/memoize.ts
index 213c7233..227b4003 100644
--- a/bids-validator/src/utils/memoize.ts
+++ b/bids-validator/src/utils/memoize.ts
@@ -8,3 +8,14 @@ export const memoize = <T>(
cached.cache = cache
return cached
}
+
+export const memoizeAsync = <T>(
+ fn: (...args: any[]) => Promise<T>,
+): (...args: any[]) => Promise<T> => {
+ const cache = new Map()
+ const cached = async function (this: any, val: T) {
+ return cache.has(val) ? cache.get(val) : cache.set(val, await fn.call(this, val)) && cache.get(val)
+ }
+ cached.cache = cache
+ return cached
+}
Results in excessive memory consumption:
❯ BIDS_SCHEMA=~/Projects/bids/specification/src/schema.json deno run --v8-flags="--max-heap-size=20480" --reload -A $HOME/Projects/bids/bids-validator/bids-validator/src/bids-validator.ts .
<--- Last few GCs --->
[677459:0x6480d98d6000] 75023 ms: Scavenge 3648.9 (3671.0) -> 3645.0 (3671.0) MB, pooled: 0 MB, 14.21 / 0.00 ms (average mu = 0.932, current mu = 0.884) allocation failure;
[677459:0x6480d98d6000] 75040 ms: Scavenge 3645.0 (3671.0) -> 3645.0 (3671.0) MB, pooled: 0 MB, 17.38 / 0.00 ms (average mu = 0.932, current mu = 0.884) allocation failure;
<--- JS stacktrace --->
#
# Fatal JavaScript out of memory: MarkCompactCollector: young object promotion failed
#
==== C stack trace ===============================
deno(+0x2ba38a3) [0x6480d57428a3]
deno(+0x2ba2b8b) [0x6480d5741b8b]
deno(+0x2b9e448) [0x6480d573d448]
deno(+0x2bf0471) [0x6480d578f471]
deno(+0x2d9fc07) [0x6480d593ec07]
deno(+0x2e012b3) [0x6480d59a02b3]
deno(+0x2e00b45) [0x6480d599fb45]
deno(+0x2e00864) [0x6480d599f864]
deno(+0x2e0f8d7) [0x6480d59ae8d7]
deno(+0x2ba4696) [0x6480d5743696]
deno(+0x2ba4bc3) [0x6480d5743bc3]
deno(+0x2e028fc) [0x6480d59a18fc]
deno(+0x2df2496) [0x6480d5991496]
deno(+0x2def124) [0x6480d598e124]
deno(+0x2da26f9) [0x6480d59416f9]
deno(+0x2da1f94) [0x6480d5940f94]
deno(+0x2db2db6) [0x6480d5951db6]
deno(+0x2db287f) [0x6480d595187f]
deno(+0x35e8e8b) [0x6480d6187e8b]
[1] 677459 trace trap (core dumped) BIDS_SCHEMA=~/Projects/bids/specification/src/schema.json deno run --reload
Sidecars and associated files may be loaded from disk once for each data file they apply to, in addition to being loaded for their own validation; for example, a top-level events.tsv can be re-read and re-parsed once per matching run. It would be good to cache these to avoid both the I/O and parsing costs of repeated loads. However, in a dataset with many files, and with sidecars in the leaf directories rather than at the root, a full memoization retains every parsed table for the lifetime of the run and uses an excessive amount of memory, as shown above.
Consider a two-level cache: as we walk the file tree, we could clear, for example, the /sub-01 subcache when we move on to /sub-02, but /T1w.json will be in the / subcache, so it would not be reloaded.
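A minimal sketch of how that could look, assuming cache keys are validator-style paths that start with '/'; the TwoLevelCache name, its methods, and the purge-as-we-walk wiring are hypothetical illustrations, not existing validator API:

// Hypothetical sketch: one sub-cache per directory, so a whole
// directory's entries can be dropped at once while entries in
// parent scopes (e.g. /T1w.json in the root sub-cache) survive.
type Loader<T> = (path: string) => Promise<T>

class TwoLevelCache<T> {
  private scopes = new Map<string, Map<string, Promise<T>>>()

  constructor(private load: Loader<T>) {}

  get(path: string): Promise<T> {
    // First level: the containing directory
    // (assumes paths beginning with '/'; '/' is the root scope)
    const dir = path.slice(0, path.lastIndexOf('/')) || '/'
    let scope = this.scopes.get(dir)
    if (!scope) {
      scope = new Map()
      this.scopes.set(dir, scope)
    }
    // Second level: the file itself. Caching the promise rather than
    // the resolved value lets concurrent callers share a single load.
    let result = scope.get(path)
    if (!result) {
      result = this.load(path)
      scope.set(path, result)
    }
    return result
  }

  // Drop every sub-cache under a subtree, e.g. purge('/sub-01') when
  // the walk moves on to /sub-02. The root scope ('/') never matches
  // a subject prefix, so root-level sidecars stay cached.
  purge(prefix: string): void {
    for (const dir of [...this.scopes.keys()]) {
      if (dir === prefix || dir.startsWith(prefix + '/')) {
        this.scopes.delete(dir)
      }
    }
  }
}

Wired up for TSVs, this might replace memoizeAsync as something like new TwoLevelCache((path) => Deno.readTextFile(path).then(parseTSV)), with the tree walker calling purge() as it finishes each subject directory. Only the per-directory grouping and the prefix purge matter here, not the exact API.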