This PR allows markdown to be rendered for any check item on the Metadata Quality view pages. For example, the fourth check in the mock response below has markdown as its output, which the view renders as an HTML table.
For the view to render markdown, the `type` property of a check's `output` object in the response must be set to `"markdown"` (the check's own `type` is unchanged; see the fourth check in the mock below).
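For reference, here is a minimal sketch of an output entry that would trigger markdown rendering. The property names mirror the mock response below; the file name and table contents are placeholders:

```js
// Hypothetical output entry (placeholder values) — the `type` is what matters:
const markdownOutput = {
  value: "**myfile.csv**\n\n| col | count |\n|:----|------:|\n| a   |     3 |",
  type: "markdown", // any other value (e.g. null) renders as plain text
};
```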
To test these changes, you can override the response in the `parse` method of the `QualityReport` collection (i.e., here). Below is a mock response where the fourth check includes markdown as its output. This response is the same as the one Jeanette provided in https://github.com/NCEAS/metacatui/issues/2394#issuecomment-2318655725, except that the `type` has been changed to `"markdown"` (a change that will be made before the release of the new MDQ suite that includes markdown content).
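Since the mock below is written as a reassignment of `response`, one way to apply it is to paste it at the top of `parse`. A sketch only, assuming `parse(response)` ultimately returns `response.result` as the collection's models:

```js
// QualityReport collection — testing sketch only, not part of this PR:
parse(response) {
  // TESTING ONLY: shadow the real server response with the mock from below.
  response = { /* ...paste the mock response object here... */ };
  // ...keep the rest of the existing parse logic unchanged...
  return response.result;
}
```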
**Mock response:**
```js
response = {
id: "dd70021a-9643-4306-beaf-d174fdaa16a9",
timestamp: 1724958047527,
objectIdentifier: null,
result: [
{
check: {
id: "data.table-text-delimited.variables-congruent",
name: "Text delimited table variables names congruent",
description:
"Check that text delimited table variables names are congruent.",
type: "identification",
level: "REQUIRED",
environment: "python",
code: '\n\ndef call():\n global output\n global status\n\n from metadig import StoreManager\n from metadig import read_sysmeta_element\n from metadig import find_eml_entity\n import pandas as pd\n import io \n\n manager = StoreManager(storeConfiguration) \n\n output_data = []\n status_data = []\n\n for pid in dataPids:\n obj, sys = manager.get_object(pid)\n fname = read_sysmeta_element(sys, "fileName")\n # if file is not text/csv, skip it\n if read_sysmeta_element(sys, "formatId") != "text/csv":\n output_data.append(f"{fname} is not a text-delimited table, skipping.")\n status_data.append("SKIP")\n continue\n # read in first KB of data\n d_read = obj.read(1024).decode(\'utf-8\')\n # find which entity file is documented in\n z = [i for i, x in enumerate(entityNames) if x == fname]\n if len(z) == 0:\n id = pid.replace(":", "-")\n z = [i for i, x in enumerate(ids) if x == id]\n if len(z) == 0:\n output_data.append(f"{fname} does not appear to be documented in the metadata.")\n status_data.append("FAILURE")\n continue\n z = z[0]\n \n # extract correct fieldDelimiter and headerLines\n fd = "," if fieldDelimiter[z] is None else fieldDelimiter[z]\n skiprows = 0 if headerLines[z] is None else int(headerLines[z]) - 1\n # try to read it in as a csv with correct metadata\n # extract column names\n try:\n df = pd.read_csv(io.StringIO(d_read), delimiter = fd, header = skiprows)\n colnames = list(df.columns)\n except Exception as e:\n output_data.append(f"{fname} is unable to be read as a table.")\n status_data.append("FAILURE")\n continue\n # extract the entity from the metadata doc and attributeNames\n ent = find_eml_entity(document, pid, fname)\n att_names = [elem.text for elem in ent.findall(".//attributeName")]\n # check if column names match documentation\n if att_names == colnames:\n output_data.append(f"{fname} variable names match documentation.")\n status_data.append("SUCCESS")\n else:\n output_data.append(f"{fname} variable names do not match documentation. Attribute names: {att_names}. Variable names: {colnames}")\n status_data.append("FAILURE")\n\n successes = sum(x == "SUCCESS" for x in status_data)\n failures = sum(x == "FAILURE" for x in status_data)\n skips = sum(x == "SKIP" for x in status_data)\n output = f"{successes} file(s) with complete variable documentation. {failures} file(s) with variable names that do not match documentation. {skips} file(s) that are not text delimited tables."\n if successes > 0 and failures == 0:\n status = "SUCCESS"\n return True\n elif successes == 0 and failures > 0:\n status = "FAILURE"\n return True\n else:\n status = "FAILURE" \n return True\n\n ',
library: null,
inheritState: false,
selector: [
{
name: "entityNames",
xpath: "/eml/dataset/*[self::dataTable|self::otherEntity]",
namespaceAware: false,
namespace: null,
subSelector: {
name: "...",
xpath: "./entityName",
namespaceAware: false,
namespace: null,
subSelector: null,
},
},
{
name: "objectNames",
xpath: "/eml/dataset/*[self::dataTable|self::otherEntity]",
namespaceAware: false,
namespace: null,
subSelector: {
name: "...",
xpath: "./physical/objectName",
namespaceAware: false,
namespace: null,
subSelector: null,
},
},
{
name: "ids",
xpath:
"/eml/dataset/*[self::dataTable|self::otherEntity]/@id",
namespaceAware: false,
namespace: null,
subSelector: null,
},
{
name: "fieldDelimiter",
xpath: "/eml/dataset/*[self::dataTable|self::otherEntity]",
namespaceAware: false,
namespace: null,
subSelector: {
name: "...",
xpath:
"./physical/dataFormat/textFormat/simpleDelimited/fieldDelimiter",
namespaceAware: false,
namespace: null,
subSelector: null,
},
},
{
name: "headerLines",
xpath: "/eml/dataset/*[self::dataTable|self::otherEntity]",
namespaceAware: false,
namespace: null,
subSelector: {
name: "...",
xpath: "./physical/dataFormat/textFormat/numHeaderLines",
namespaceAware: false,
namespace: null,
subSelector: null,
},
},
],
dialect: [
{
name: "Ecological Metadata Language",
xpath: "boolean(/*[local-name() = 'eml'])",
},
],
},
timestamp: 1724958052449,
output: [
{
value:
"5 file(s) with complete variable documentation. 0 file(s) with variable names that do not match documentation. 0 file(s) that are not text delimited tables.",
type: null,
},
],
status: "SUCCESS",
},
{
check: {
id: "data.table-text-delimited.well-formed",
name: "Text delimited table is well formed",
description: "Check that a delmited text table is well formed.",
type: "identification",
level: "REQUIRED",
environment: "python",
code: '\n\ndef call():\n global output\n global status\n\n from metadig import StoreManager\n from metadig import read_sysmeta_element\n from metadig import find_eml_entity\n import pandas as pd\n import io \n\n manager = StoreManager(storeConfiguration) \n\n output_data = []\n status_data = []\n\n for pid in dataPids:\n obj, sys = manager.get_object(pid)\n fname = read_sysmeta_element(sys, "fileName")\n # if file is not text/csv, skip it\n if read_sysmeta_element(sys, "formatId") != "text/csv":\n output_data.append(f"{fname} is not a text-delimited table, skipping.")\n status_data.append("SKIP")\n continue\n # read in first KB of data\n d_read = obj.read(1024).decode(\'utf-8\')\n # find which entity file is documented in\n z = [i for i, x in enumerate(entityNames) if x == fname]\n if len(z) == 0:\n id = pid.replace(":", "-")\n z = [i for i, x in enumerate(ids) if x == id]\n if len(z) == 0:\n output_data.append(f"{fname} does not appear to be documented in the metadata.")\n status_data.append("FAILURE")\n continue\n z = z[0]\n \n # extract correct fieldDelimiter and headerLines\n fd = "," if fieldDelimiter[z] is None else fieldDelimiter[z]\n skiprows = 0 if headerLines[z] is None else int(headerLines[z]) - 1\n # try to read it in as a csv with correct metadata\n # extract column names\n try:\n df = pd.read_csv(io.StringIO(d_read), delimiter = fd, header = skiprows)\n colnames = list(df.columns)\n except Exception as e:\n output_data.append(f"{fname} is unable to be read as a table.")\n status_data.append("FAILURE")\n continue\n if isinstance(df, pd.DataFrame):\n output_data.append(f"{fname} is able to be parsed.")\n status_data.append("SUCCESS")\n else:\n output_data.append(f"{fname} cannot be parsed." + char(type(df)))\n status_data.append("FAILURE")\n\n successes = sum(x == "SUCCESS" for x in status_data)\n failures = sum(x == "FAILURE" for x in status_data)\n skips = sum(x == "SKIP" for x in status_data)\n output = f"{successes} file(s) successfully parsed, or are not text files. {failures} file(s) failed to parse. {skips} file(s) that are not text delimited tables."\n if successes > 0 and failures == 0:\n status = "SUCCESS"\n return True\n elif successes == 0 and failures > 0:\n status = "FAILURE"\n return True\n else:\n status = "FAILURE" \n return True\n\n ',
library: null,
inheritState: false,
selector: [
{
name: "entityNames",
xpath: "/eml/dataset/*[self::dataTable|self::otherEntity]",
namespaceAware: false,
namespace: null,
subSelector: {
name: "...",
xpath: "./entityName",
namespaceAware: false,
namespace: null,
subSelector: null,
},
},
{
name: "objectNames",
xpath: "/eml/dataset/*[self::dataTable|self::otherEntity]",
namespaceAware: false,
namespace: null,
subSelector: {
name: "...",
xpath: "./physical/objectName",
namespaceAware: false,
namespace: null,
subSelector: null,
},
},
{
name: "ids",
xpath:
"/eml/dataset/*[self::dataTable|self::otherEntity]/@id",
namespaceAware: false,
namespace: null,
subSelector: null,
},
{
name: "fieldDelimiter",
xpath: "/eml/dataset/*[self::dataTable|self::otherEntity]",
namespaceAware: false,
namespace: null,
subSelector: {
name: "...",
xpath:
"./physical/dataFormat/textFormat/simpleDelimited/fieldDelimiter",
namespaceAware: false,
namespace: null,
subSelector: null,
},
},
{
name: "headerLines",
xpath: "/eml/dataset/*[self::dataTable|self::otherEntity]",
namespaceAware: false,
namespace: null,
subSelector: {
name: "...",
xpath: "./physical/dataFormat/textFormat/numHeaderLines",
namespaceAware: false,
namespace: null,
subSelector: null,
},
},
],
dialect: [
{
name: "Ecological Metadata Language",
xpath: "boolean(/*[local-name() = 'eml'])",
},
],
},
timestamp: 1724958052539,
output: [
{
value:
"5 file(s) successfully parsed, or are not text files. 0 file(s) failed to parse. 0 file(s) that are not text delimited tables.",
type: null,
},
],
status: "SUCCESS",
},
{
check: {
id: "data.format.congruent",
name: "Data format congruent with formatId",
description:
"Check that a data file format matches the formatId.",
type: "identification",
level: "REQUIRED",
environment: "python",
code: '\ndef call():\n global output\n global status\n\n from metadig import StoreManager\n from metadig import read_sysmeta_element\n import magic\n import io\n import pandas as pd\n import requests\n import xml.etree.ElementTree as ET\n import os\n\n manager = StoreManager(storeConfiguration)\n\n url = "https://cn.dataone.org/cn/v2/formats"\n response = requests.get(url)\n root = ET.fromstring(response.content)\n d1_formats = []\n # get data frame of formats and extensions\n for format in root.findall("objectFormat"):\n media_type_element = format.find(\'mediaType\')\n extension_element = format.find(\'extension\')\n format_id_element = format.find(\'formatId\')\n media_type = media_type_element.attrib[\'name\'] if media_type_element is not None and \'name\' in media_type_element.attrib else None\n extension = extension_element.text.strip() if extension_element is not None and extension_element.text else None\n format_id = format_id_element.text.strip() if format_id_element is not None and format_id_element.text else None\n d1_formats.append({\n \'format_id\': format_id,\n \'media_type\': media_type,\n \'extension\': extension\n })\n d1_formats = pd.DataFrame(d1_formats)\n output_data = []\n status_data = []\n for pid in dataPids:\n obj, sys = manager.get_object(pid)\n fname = read_sysmeta_element(sys, "fileName")\n # if file is not text/csv, skip it\n fid = read_sysmeta_element(sys, "formatId")\n mime_type = magic.from_buffer(obj.read(2048), mime = True)\n ext = os.path.splitext(fname)[1][1:]\n ext = "tiff" if ext == "tif" else ext\n fid = "application/netcdf" if fid == "application/x-netcdf" else fid\n mime_type = "text/csv" if ext == "csv" and mime_type == "text/plain" else mime_type\n # extract format_ids matching the media type and file extension of the file\n df_new = d1_formats[(d1_formats["media_type"] == mime_type) & (d1_formats["extension"] == ext)]\n if df_new.shape[0] == 0:\n output_data.append(f"{fname} does not have a formatId matching it\'s media type and extension.")\n status_data.append("FAILURE")\n continue\n \n if fid in df_new["format_id"].values:\n output_data.append(f"{fname}\'s formatId matches it\'s media type and extension.")\n status_data.append("SUCCESS")\n continue\n else:\n output_data.append(f"{fname}\'s formatId does not match it\'s media type and extension.")\n status_data.append("FAILURE")\n\n successes = sum(x == "SUCCESS" for x in status_data)\n failures = sum(x == "FAILURE" for x in status_data)\n output = f"{successes} file(s) with matching formats. {failures} file(s) with mismatched formats."\n if successes > 0 and failures == 0:\n status = "SUCCESS"\n return True\n elif successes == 0 and failures > 0:\n status = "FAILURE"\n return True\n else:\n status = "FAILURE" \n return True\n',
library: null,
inheritState: false,
selector: [
{
name: "id",
xpath: "/eml/packageId",
namespaceAware: false,
namespace: null,
subSelector: null,
},
],
dialect: [
{
name: "eml",
xpath: "boolean(/*[local-name() = 'eml'])",
},
],
},
timestamp: 1724958053249,
output: [
{
value:
"5 file(s) with matching formats. 0 file(s) with mismatched formats.",
type: null,
},
],
status: "SUCCESS",
},
{
check: {
id: "data.table-text-delimited.glimpse",
name: "Show a summary of a text-delimited table",
description: "CShow a summary of a text-delimited table.",
type: "identification",
level: "INFO",
environment: "python",
code: '\n\ndef call():\n global output\n global status\n\n from metadig import StoreManager\n from metadig import read_sysmeta_element\n from metadig import find_eml_entity\n import pandas as pd\n import io \n\n manager = StoreManager(storeConfiguration) \n\n output_data = []\n status_data = []\n\n for pid in dataPids:\n obj, sys = manager.get_object(pid)\n fname = read_sysmeta_element(sys, "fileName")\n # if file is not text/csv, skip it\n if read_sysmeta_element(sys, "formatId") != "text/csv":\n output_data.append(f"{fname} is not a text-delimited table, skipping.")\n status_data.append("SKIP")\n continue\n # read in all the data\n d_read = obj.read().decode(\'utf-8\')\n # find which entity file is documented in\n z = [i for i, x in enumerate(entityNames) if x == fname]\n if len(z) == 0:\n id = pid.replace(":", "-")\n z = [i for i, x in enumerate(ids) if x == id]\n if len(z) == 0:\n output_data.append(f"{fname} does not appear to be documented in the metadata.")\n status_data.append("FAILURE")\n continue\n z = z[0]\n \n # extract correct fieldDelimiter and headerLines\n fd = "," if fieldDelimiter[z] is None else fieldDelimiter[z]\n skiprows = 0 if headerLines[z] is None else int(headerLines[z]) - 1\n # try to read it in as a csv with correct metadata\n # extract column names\n try:\n df = pd.read_csv(io.StringIO(d_read), delimiter = fd, header = skiprows)\n summary = df.describe()\n summary_md = summary.to_markdown()\n except Exception as e:\n output_data.append(f"{fname} is unable to be read as a table.")\n status_data.append("FAILURE")\n continue\n if isinstance(df, pd.DataFrame):\n output_data.append(f"**{fname}** \\n {summary_md} \\n")\n status_data.append("SUCCESS")\n else:\n output_data.append(f"{fname} cannot be parsed." + char(type(df)))\n status_data.append("FAILURE")\n\n successes = sum(x == "SUCCESS" for x in status_data)\n failures = sum(x == "FAILURE" for x in status_data)\n skips = sum(x == "SKIP" for x in status_data)\n output = "\\n".join(output_data)\n if successes > 0 and failures == 0:\n status = "SUCCESS"\n return True\n elif successes == 0 and failures > 0:\n status = "FAILURE"\n return True\n else:\n status = "FAILURE" \n return True\n\n ',
library: null,
inheritState: false,
selector: [
{
name: "entityNames",
xpath: "/eml/dataset/*[self::dataTable|self::otherEntity]",
namespaceAware: false,
namespace: null,
subSelector: {
name: "...",
xpath: "./entityName",
namespaceAware: false,
namespace: null,
subSelector: null,
},
},
{
name: "objectNames",
xpath: "/eml/dataset/*[self::dataTable|self::otherEntity]",
namespaceAware: false,
namespace: null,
subSelector: {
name: "...",
xpath: "./physical/objectName",
namespaceAware: false,
namespace: null,
subSelector: null,
},
},
{
name: "ids",
xpath:
"/eml/dataset/*[self::dataTable|self::otherEntity]/@id",
namespaceAware: false,
namespace: null,
subSelector: null,
},
{
name: "fieldDelimiter",
xpath: "/eml/dataset/*[self::dataTable|self::otherEntity]",
namespaceAware: false,
namespace: null,
subSelector: {
name: "...",
xpath:
"./physical/dataFormat/textFormat/simpleDelimited/fieldDelimiter",
namespaceAware: false,
namespace: null,
subSelector: null,
},
},
{
name: "headerLines",
xpath: "/eml/dataset/*[self::dataTable|self::otherEntity]",
namespaceAware: false,
namespace: null,
subSelector: {
name: "...",
xpath: "./physical/dataFormat/textFormat/numHeaderLines",
namespaceAware: false,
namespace: null,
subSelector: null,
},
},
],
dialect: [
{
name: "Ecological Metadata Language",
xpath: "boolean(/*[local-name() = 'eml'])",
},
],
},
timestamp: 1724958053380,
output: [
{
value:
"**DEN13A_AA.csv** \n | | Sample | Vanillic acid (ppb) | Dehydroabeitic acid (ppb) | p-Hydrocybenzoic acid (ppb) | Syringic acid (ppb) |\n|:------|---------:|----------------------:|----------------------------:|------------------------------:|----------------------:|\n| count | 1995 | 0 | 0 | 0 | 0 |\n| mean | 998 | nan | nan | nan | nan |\n| std | 576.051 | nan | nan | nan | nan |\n| min | 1 | nan | nan | nan | nan |\n| 25% | 499.5 | nan | nan | nan | nan |\n| 50% | 998 | nan | nan | nan | nan |\n| 75% | 1496.5 | nan | nan | nan | nan |\n| max | 1995 | nan | nan | nan | nan | \n\n**DEN13A_MA.csv** \n | | Sample | Levoglucosan (ppb) | Galactasan (ppb) | Mannosan (ppb) |\n|:------|---------:|---------------------:|-------------------:|-----------------:|\n| count | 1995 | 0 | 0 | 0 |\n| mean | 998 | nan | nan | nan |\n| std | 576.051 | nan | nan | nan |\n| min | 1 | nan | nan | nan |\n| 25% | 499.5 | nan | nan | nan |\n| 50% | 998 | nan | nan | nan |\n| 75% | 1496.5 | nan | nan | nan |\n| max | 1995 | nan | nan | nan | \n\n**DEN13A_black_carbon_3_year.csv** \n | | Year | Black Carbon (ppb) |\n|:------|---------:|---------------------:|\n| count | 405 | 405 |\n| mean | 1406 | 2.702 |\n| std | 351.173 | 1.28055 |\n| min | 800 | 0.27 |\n| 25% | 1103 | 1.97 |\n| 50% | 1406 | 2.51 |\n| 75% | 1709 | 3.12 |\n| max | 2012 | 12.97 | \n\n**DEN13A_black_carbon.csv** \n | | Depth (m) | Time | Black Carbon (ppb) |\n|:------|------------:|----------:|---------------------:|\n| count | 40334 | 37448 | 40333 |\n| mean | 103.509 | 1843.75 | 3.08925 |\n| std | 59.1242 | 249.045 | 4.63216 |\n| min | 2.258 | 800.173 | -1.31 |\n| 25% | 51.1572 | 1801.97 | 0.95 |\n| 50% | 102.736 | 1957.13 | 1.93 |\n| 75% | 156.362 | 1995.84 | 3.74 |\n| max | 202.187 | 2013.08 | 170.06 | \n\n**DEN13A_black_carbon_1_year.csv** \n | | Year | Black Carbon (ppb) |\n|:------|----------:|---------------------:|\n| count | 313 | 313 |\n| mean | 1856 | 2.51364 |\n| std | 90.4995 | 1.10668 |\n| min | 1700 | 0.45 |\n| 25% | 1778 | 1.76 |\n| 50% | 1856 | 2.32 |\n| 75% | 1934 | 3.13 |\n| max | 2012 | 6.8 | \n",
type: "markdown",
},
],
status: "SUCCESS",
},
],
suiteId: "data.suite",
nodeId: "urn:node:ARCTIC",
status: null,
runStatus: "SUCCESS",
errorDescription: null,
sysmeta: {
originMemberNode: "urn:node:ARCTIC",
rightsHolder: "http://orcid.org/0000-0001-5401-7148",
groups: [],
dateUploaded: 1720047391122,
formatId: "https://eml.ecoinformatics.org/eml-2.2.0",
obsoletes: "urn:uuid:62025d04-b87d-4651-a194-9f10e36534bf",
obsoletedBy: null,
},
sequenceId: null,
runCount: null,
modified: false,
isLatest: false,
dateUploaded: 1720047391122,
obsoletes: "urn:uuid:62025d04-b87d-4651-a194-9f10e36534bf",
obsoletedBy: null,
};
```
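For reviewers curious how the markdown becomes HTML, here is a minimal sketch of the conversion step, assuming the view can delegate to a Showdown converter with pipe-table support (MetacatUI uses Showdown elsewhere for markdown content; the view code in this PR is the authoritative implementation):

```js
// Sketch: convert a "markdown"-typed output value to HTML before inserting it into the view.
const converter = new showdown.Converter({ tables: true }); // enable pipe-table parsing
check.output.forEach((output) => {
  const content =
    output.type === "markdown"
      ? converter.makeHtml(output.value) // the glimpse tables above become <table> markup
      : output.value;
  // ...append `content` to the check item's element...
});
```

With the mock above, the fourth check's five `df.describe()` tables render as HTML tables, while the other three checks continue to render as plain text.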