Closed dmpetrov closed 4 months ago
A temporary workaround is to pass the top-level signal and access the nested field inside the function:
diff --git a/examples/wds.py b/examples/wds.py
index 7bc2c7a..df3cc81 100644
--- a/examples/wds.py
+++ b/examples/wds.py
@@ -15,14 +15,14 @@ meta_emd = (
DataChain.from_storage("gs://dvcx-datacomp-small/metadata")
.filter(C.name.glob("0020f*.npz"))
.gen(emd=process_laion_meta)
- .map(stem=lambda file: file.get_file_stem(), params=["emd.file"], output=str)
+ .map(stem=lambda emd: emd.file.get_file_stem(), params=["emd"], output=str)
)
meta_pq = (
DataChain.from_storage("gs://dvcx-datacomp-small/metadata")
.filter(C.name.glob("0020f*.parquet"))
.parse_parquet()
- .map(stem=lambda file: file.get_file_stem(), params=["source.file"], output=str)
+ .map(stem=lambda source: source.file.get_file_stem(), params=["source"], output=str)
)
meta = meta_emd.merge(
The problem is that SignalSchema does not handle nested signals like `emd.file` or `source.file`. It assumes a flat dict and looks up only top-level keys like `emd` or `source`.
Simple reproducer:
from datachain.lib.dc import C, DataChain
source = "gs://dvcx-datalakes/dogs-and-cats/"
DataChain.from_storage(source).map(lambda x: x, params=["file.name"], output={"name": str}).show()
Description
It runs for about an hour and then fails:
Version Info