je4 / indexer

Identify and extract Metadata from files
Apache License 2.0
3 stars 0 forks source link

MimeReader looks like it is consuming the stream (Linux) #8

Open ross-spencer opened 1 week ago

ross-spencer commented 1 week ago

If I run the default Indexer, I see that the sources are empty in the metadata for the stored files.

{
  "Path": "v1/content/data/sample_data.xml",
  "Indexer": {
    "errors": {
      "identify": "error executing (convert [XML:- json:-]) for file 'data/sample_data.xml': : exit status 1",
      "siegfried": "cannot identify file data/sample_data.xml: empty source",
      "tika": "status not ok - http://localhost:9998/meta -> 422 Unprocessable Entity: "
    },
    "mimetype": "",
    "mimetypes": [],
    "pronom": "",
    "pronoms": [],
    "size": 0,
    "metadata": {},
    "type": "",
    "subtype": ""
  }
}
{
  "Path": "v1/content/data/sample_data_broken.xml",
  "Indexer": {
    "errors": {
      "identify": "error executing (convert [XML:- json:-]) for file 'data/sample_data_broken.xml': : exit status 1",
      "siegfried": "cannot identify file data/sample_data_broken.xml: empty source",
      "tika": "status not ok - http://localhost:9998/meta -> 422 Unprocessable Entity: "
    },
    "mimetype": "",
    "mimetypes": [],
    "pronom": "",
    "pronoms": [],
    "size": 0,
    "metadata": {},
    "type": "",
    "subtype": ""
  }
}
{
  "Path": "v1/content/README.md",
  "Indexer": {
    "errors": {
      "siegfried": "cannot identify file README.md: empty source",
      "tika": "status not ok - http://localhost:9998/meta -> 422 Unprocessable Entity: "
    },
    "mimetype": "",
    "mimetypes": [],
    "pronom": "",
    "pronoms": [],
    "size": 0,
    "metadata": {},
    "type": "",
    "subtype": ""
  }
}

I noticed the stream had zero bytes when inspected manually in the different actions, and, tracing it backwards, I found the stream was being consumed by the MimeReader and so was empty by the time the MIME type had been identified.

If I don't check the mimetype, and just delete that code or rework it into something harmless, as in this diff:

diff --git a/pkg/indexer/actionDispatcher.go b/pkg/indexer/actionDispatcher.go
index 02d0d94..1909313 100644
--- a/pkg/indexer/actionDispatcher.go
+++ b/pkg/indexer/actionDispatcher.go
@@ -77,11 +77,12 @@ func (ad *ActionDispatcher) Stream(sourceReader io.Reader, stateFiles []string,
        if len(stateFiles) == 0 {
                stateFiles = []string{""}
        }
-       mimeReader, err := iou.NewMimeReader(sourceReader)
+       mimeReader := sourceReader
+       replacementMime, err := iou.NewMimeReader(strings.NewReader("replacementMIME"))
        if err != nil {
                return nil, errors.Wrapf(err, "cannot create MimeReader for %s", stateFiles)
        }
-       contentType, _ := mimeReader.DetectContentType()
+       contentType, _ := replacementMime.DetectContentType()
        parts := strings.Split(contentType, ";")
        contentType = parts[0]

The actions go on to work as anticipated:

{
  "Path": "v1/content/data/sample_data.xml",
  "Indexer": {
    "errors": {
      "identify": "error executing (convert [XML:- json:-]) for file 'data/sample_data.xml': : exit status 1"
    },
    "mimetype": "application/xml",
    "mimetypes": [
      "application/xml"
    ],
    "pronom": "fmt/101",
    "pronoms": [
      "fmt/101"
    ],
    "size": 505,
    "metadata": {
      "siegfried": [
        {
          "Namespace": "pronom",
          "ID": "fmt/101",
          "Name": "Extensible Markup Language",
          "Version": "1.0",
          "MIME": "application/xml",
          "Class": "Text (Mark-up)",
          "Basis": [
            "extension match xml",
            "byte match at 0, 19"
          ],
          "Warning": ""
        }
      ],
      "tika": [
        {
          "Content-Type": "application/xml",
          "X-TIKA:Parsed-By": [
            "org.apache.tika.parser.DefaultParser",
            "org.apache.tika.parser.xml.DcXMLParser"
          ],
          "X-TIKA:Parsed-By-Full-Set": [
            "org.apache.tika.parser.DefaultParser",
            "org.apache.tika.parser.xml.DcXMLParser"
          ],
          "language": "en",
          "resourceName": "data/sample_data.xml"
        }
      ]
    },
    "type": "application",
    "subtype": "xml"
  }
}
{
  "Path": "v1/content/data/sample_data_broken.xml",
  "Indexer": {
    "errors": {
      "identify": "error executing (convert [XML:- json:-]) for file 'data/sample_data_broken.xml': : exit status 1",
      "tika": "status not ok - http://localhost:9998/meta -> 422 Unprocessable Entity: "
    },
    "mimetype": "application/xml",
    "mimetypes": [
      "application/xml"
    ],
    "pronom": "fmt/101",
    "pronoms": [
      "fmt/101"
    ],
    "size": 503,
    "metadata": {
      "siegfried": [
        {
          "Namespace": "pronom",
          "ID": "fmt/101",
          "Name": "Extensible Markup Language",
          "Version": "1.0",
          "MIME": "application/xml",
          "Class": "Text (Mark-up)",
          "Basis": [
            "extension match xml",
            "byte match at 0, 19"
          ],
          "Warning": ""
        }
      ]
    },
    "type": "application",
    "subtype": "xml"
  }
}
{
  "Path": "v1/content/README.md",
  "Indexer": {
    "mimetype": "text/markdown",
    "mimetypes": [
      "text/markdown",
      "text/x-web-markdown; charset=ISO-8859-1"
    ],
    "pronom": "fmt/1149",
    "pronoms": [
      "fmt/1149"
    ],
    "size": 171,
    "metadata": {
      "siegfried": [
        {
          "Namespace": "pronom",
          "ID": "fmt/1149",
          "Name": "Markdown",
          "Version": "",
          "MIME": "text/markdown",
          "Class": "Text (Mark-up)",
          "Basis": [
            "extension match md"
          ],
          "Warning": "match on extension only"
        }
      ],
      "tika": [
        {
          "Content-Encoding": "ISO-8859-1",
          "Content-Type": "text/x-web-markdown; charset=ISO-8859-1",
          "X-TIKA:Parsed-By": [
            "org.apache.tika.parser.DefaultParser",
            "org.apache.tika.parser.csv.TextAndCSVParser"
          ],
          "X-TIKA:Parsed-By-Full-Set": [
            "org.apache.tika.parser.DefaultParser",
            "org.apache.tika.parser.csv.TextAndCSVParser"
          ],
          "X-TIKA:detectedEncoding": "ISO-8859-1",
          "X-TIKA:encodingDetector": "UniversalEncodingDetector",
          "language": "en",
          "resourceName": "README.md"
        }
      ]
    },
    "type": "text",
    "subtype": "markdown"
  }
}

There are a couple of things here:

  1. I could be compiling to the wrong version or missing something else.
  2. if the error exists it might only be on Linux?
  3. if the error exists elsewhere it might have gone unnoticed in the gocfl-archive due to what I believe I am seeing here: https://github.com/ocfl-archive/gocfl/issues/132 providing versions are aligned.

GOCFL commit: 12be4b Indexer commit: 251595

ross-spencer commented 3 days ago

Thinking about what may cause the differences between production and local runs, it could be content-based. The two files used in the above testing are attached below. They are XML snippets from JHOVE; one is well-formed and the other isn't.

sample_data.zip

I am sure I have other samples causing the same issue. Many are likely to be quite small files. I can have a look at more well-behaved samples as well.

I haven't looked at the mimereader code yet to see if there's something obvious going on.

ross-spencer commented 2 days ago

To confirm, the buffer size being read is const bufSize = 512, and so for large files, e.g. a 22 MB PDF I am testing against, the whole stream is not being consumed.

That being said, because the 512 bytes are taken from the stream sent to Siegfried and Convert, we are not able to extract metadata. For the PDF I mention, I see the following:

{
  "Path": "v1/content/data/PDF-Sample-Document-Fully-Featured-Layout_Redacted.pdf",
  "Indexer": {
    "errors": {
      "identify": "error executing (convert [PDF:- json:-]) for file 'data/PDF-Sample-Document-Fully-Featured-Layout_Redacted.pdf': : exit status 1"
    },
    "mimetype": "application/pdf",
    "mimetypes": [
      "application/pdf"
    ],
    "pronom": "UNKNOWN",
    "pronoms": [
      "UNKNOWN"
    ],
    "size": 22723895,
    "metadata": {
      "siegfried": [
        {
          "Namespace": "pronom",
          "ID": "UNKNOWN",
          "Name": "",
          "Version": "",
          "MIME": "",
          "Class": "",
          "Basis": null,
          "Warning": "no match; possibilities based on extension are fmt/14, fmt/15, fmt/16, fmt/17, fmt/18, fmt/19, fmt/20, fmt/95, fmt/144, fmt/145, fmt/146, fmt/147, fmt/148, fmt/157, fmt/158, fmt/276, fmt/354, fmt/476, fmt/477, fmt/478, fmt/479, fmt/480, fmt/481, fmt/488, fmt/489, fmt/490, fmt/491, fmt/492, fmt/493, fmt/558, fmt/559, fmt/560, fmt/561, fmt/562, fmt/563, fmt/564, fmt/565, fmt/1129, fmt/1451, fmt/1910, fmt/1911, fmt/1912"
        }
     ],

NB. siegfried here returns all possible extension matches but does not identify based on byte-stream.

The output without mimereader should be:

{
  "Path": "v1/content/data/PDF-Sample-Document-Fully-Featured-Layout_Redacted.pdf",
  "Indexer": {
    "mimetype": "application/pdf",
    "mimetypes": [
      "application/pdf"
    ],
    "pronom": "fmt/276",
    "pronoms": [
      "fmt/276"
    ],
    "width": 595,
    "height": 842,
    "size": 22724407,
    "metadata": {
      "identify": {
        "magick": {
          "version": "1.0",
          "image": {
            "name": "data/PDF-Sample-Document-Fully-Featured-Layout_Redacted.pdf",
            "permissions": 664,
            "format": "PDF",
            "formatDescription": "Portable Document Format",
            "mimeType": "application/pdf",
            "class": "DirectClass",
            "geometry": {
              "width": 595,
              "height": 842
            },
            "resolution": {
              "x": 72,
              "y": 72
            },
            "printSize": {
              "x": 8.26389,
              "y": 11.6944
            },
            "units": "Undefined",
            "type": "TrueColorAlpha",
            "endianness": "Undefined",
            "colorspace": "sRGB",
            "depth": 8,
            "baseDepth": 16,
            "channelDepth": {
              "alpha": 8,
              "blue": 8,
              "green": 8,
              "red": 8
            },
            "pixels": 500990,
            "imageStatistics": {
              "all": {
                "max": 65535,
                "mean": 46838.1,
                "standardDeviation": 18255.6,
                "kurtosis": -1.08291,
                "skewness": -0.951825,
                "entropy": 0.0928963
              }
            },
            "channelStatistics": {
              "alpha": {
                "min": 65535,
                "mean": 58669.2,
                "standardDeviation": 19376.1,
                "kurtosis": 4.73138,
                "skewness": 2.56488,
                "entropy": 0.127548
              },
              "blue": {
                "max": 65535,
                "mean": 59859.8,
                "standardDeviation": 18351.3,
                "kurtosis": 6.67142,
                "skewness": -2.94039,
                "entropy": 0.0850924
              },
              "green": {
                "max": 65535,
                "mean": 60313.8,
                "standardDeviation": 17646.4,
                "kurtosis": 7.6772,
                "skewness": -3.10513,
                "entropy": 0.0792711
              },
              "red": {
                "max": 65535,
                "mean": 60312.9,
                "standardDeviation": 17648.6,
                "kurtosis": 7.67478,
                "skewness": -3.10478,
                "entropy": 0.0796734
              }
            },
            "renderingIntent": "Perceptual",
            "gamma": 0.454545,
            "chromaticity": {
              "bluePrimary": {
                "x": 0.15,
                "y": 0.06
              },
              "greenPrimary": {
                "x": 0.3,
                "y": 0.6
              },
              "redPrimary": {
                "x": 0.64,
                "y": 0.33
              },
              "whitePrimary": {
                "x": 0.3127,
                "y": 0.329
              }
            },
            "matteColor": "#BDBDBDBDBDBDFFFF",
            "backgroundColor": "#FFFFFFFFFFFFFFFF",
            "borderColor": "#DFDFDFDFDFDFFFFF",
            "transparentColor": "#0000000000000000",
            "interlace": "None",
            "intensity": "Undefined",
            "compose": "Over",
            "pageGeometry": {
              "width": 595,
              "height": 842
            },
            "dispose": "Undefined",
            "compression": "Undefined",
            "orientation": "Undefined",
            "properties": {
              "date:create": "2024-11-28T12:54:13+00:00",
              "date:modify": "2024-11-28T12:54:13+00:00",
              "date:timestamp": "2024-11-28T12:54:13+00:00",
              "dc:format": "application/pdf",
              "pdf:Producer": "Adobe PDF Library 23.1.175",
              "pdf:Version": "PDF-1.7",
              "pdfx:SourceModified": "D:20230505093127",
              "signature": "3a65ed7d5772c2b766d4289a77fe443f19b80769783e44b053fd7152f5da6172",
              "xmp:CreateDate": "2023-05-05T11:34:28+02:00",
              "xmp:CreatorTool": "Acrobat PDFMaker 23 for Word",
              "xmp:MetadataDate": "2023-05-05T12:18:30+02:00",
              "xmp:ModifyDate": "2023-05-05T12:18:30+02:00",
              "xmpMM:DocumentID": "uuid:a9a30d32-5d69-4790-b5f6-1705102a20a4",
              "xmpMM:InstanceID": "uuid:9daef15c-2c83-49c7-b731-8ed1e4402929"
            },
            "profiles": {
              "xmp": {
                "length": 3459
              }
            },
            "filesize": "34057B",
            "numberPixels": "500990",
            "pixelsPerSecond": "33.4254MB",
            "userTime": "0.010u",
            "elapsedTime": "0:01.014",
            "version": "ImageMagick 6.9.12-98 Q16 x86_64 18038 https://legacy.imagemagick.org"
          }
        },
        "frames": [
          {
            "width": 595,
            "height": 842
          },
          {
            "width": 595,
            "height": 842
          },
          {
            "width": 595,
            "height": 842
          }
        ]
      },
      "siegfried": [
        {
          "Namespace": "pronom",
          "ID": "fmt/276",
          "Name": "Acrobat PDF 1.7 - Portable Document Format",
          "Version": "1.7",
          "MIME": "application/pdf",
          "Class": "Page Description",
          "Basis": [
            "extension match pdf",
            "byte match at [[0 8] [22724400 7]]"
          ],
          "Warning": ""
        }
      ],

Siegfried relies on the header being intact and so, it seems, does convert.

Tika is robust enough to still be able to function on the remaining data (22 MB minus the first 512 bytes) and output some data; however, results may be somewhat undefined on some content.

For files smaller than 512 bytes, the stream does of course get emptied, and this means Siegfried will not try to return an ID based on extension.