gdcc / exporter-croissant

A Dataverse external metadata exporter for Croissant
Apache License 2.0
0 stars 0 forks source link

field is being repeated over and over #1

Closed pdurbin closed 5 months ago

pdurbin commented 5 months ago

@goeffthomas pointed out (thanks!) that as of 0.1.1 we are emitting a separate "cr:RecordSet" for every variable, so "field" is repeated over and over, with a single entry each time, like this:

    "recordSet": [
        {
            "@type": "cr:RecordSet",
            "field": [
                {
                    "@type": "cr:Field",
                    "name": "make",
                    "description": "Make and Model",
                    "dataType": "sc:Text",
                    "source": {
                        "@id": "11",
                        "fileObject": {
                            "@id": "data/stata13-auto.dta"
                        }
                    }
                }
            ]
        },
        {
            "@type": "cr:RecordSet",
            "field": [
                {
                    "@type": "cr:Field",
                    "name": "price",
                    "description": "Price",
                    "dataType": "sc:Integer",
                    "source": {
                        "@id": "5",
                        "fileObject": {
                            "@id": "data/stata13-auto.dta"
                        }
                    }
                }
            ]
        },

Instead, we should emit "field" once, as a single array that contains all of the fields, like this:

    "recordSet": [
        {
            "@type": "cr:RecordSet",
            "field": [
                {
                    "@type": "cr:Field",
                    "name": "make",
                    "description": "Make and Model",
                    "dataType": "sc:Text",
                    "source": {
                        "@id": "11",
                        "fileObject": {
                            "@id": "data/stata13-auto.dta"
                        }
                    }
                },
                {
                    "@type": "cr:Field",
                    "name": "price",
                    "description": "Price",
                    "dataType": "sc:Integer",
                    "source": {
                        "@id": "5",
                        "fileObject": {
                            "@id": "data/stata13-auto.dta"
                        }
                    }
                },

That's what the spec says to do. From the spec ( https://docs.mlcommons.org/croissant/docs/croissant-spec.html#format-example ):

  "recordSet": [
    {
      "@type": "cr:RecordSet",
      "@id": "images",
      "key": { "@id": "hash" },
      "field": [
        {
          "@type": "cr:Field",
          "@id": "images/image_content",
          "description": "The image content.",
          "dataType": "sc:ImageObject",
          "source": {
            "fileSet": { "@id": "image-files" },
            "extract": {
              "fileProperty": "content"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "images/hash",
          "description": "The hash of the image, as computed from YFCC-100M.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": { "@id": "image-files" },
            "extract": {
              "fileProperty": "filename"
            },
            "transform": {
              "regex": "([^\\/]*)\\.jpg"
            }
          },
          "references": { "@id": "metadata/hash" }
        },
        {
          "@type": "cr:Field",
          "@id": "images/date_taken",
          "description": "The date the photo was taken.",
          "dataType": "sc:Date",
          "source": { "@id": "metadata/datetaken" }
        }
      ]
    }
  ]

I'll make a pull request.