crkn-rcdr / Digital-Preservation

Documentation and related schemas for the CRKN digital preservation system
3 stars 0 forks source link

Legacy Packaging: Clean up unused configurations. #29

Closed RussellMcOrmond closed 2 years ago

RussellMcOrmond commented 2 years ago

As we move forward with using the existing package system in a way similar to Archivematica, we have the opportunity to clean up the configurations to only include those that continue to make sense.

RussellMcOrmond commented 2 years ago

Current list: https://admin.canadiana.ca/couchview/wipmeta/tdr/configs?range=false&reduce=false

As JSON, showing only the relevant fields.

id "dfait"
{
 "_id": "dfait",
 "title": "DFAIT issues",
 "depositor": "ooe",
 "images": true,
 "type": "issue",
 "dmdtype": "issueinfo",
 "componentxml": true,
 "componentpdf": true
}
id "dfait_dc"
{
 "_id": "dfait_dc",
 "title": "DFAIT monograph DC",
 "depositor": "ooe",
 "images": true,
 "type": "document",
 "dmdtype": "DC",
 "componentxml": true,
 "componentpdf": true
}
id "dfait_marc"
{
 "_id": "dfait_marc",
 "title": "DFAIT monograph MARC",
 "depositor": "ooe",
 "images": true,
 "type": "document",
 "dmdtype": "marc",
 "componentxml": true,
 "componentpdf": true
}
id "heritage"
{
 "_id": "heritage",
 "title": "Heritage Issues",
 "depositor": "oocihm",
 "images": true,
 "type": "issue",
 "dmdtype": "issueinfo"
}
id "heritage_ocr"
{
 "_id": "heritage_ocr",
 "title": "Heritage Issues - OCR",
 "depositor": "oocihm",
 "images": true,
 "type": "issue",
 "dmdtype": "issueinfo"
}
id "miss_times"
{
 "_id": "miss_times",
 "title": "Mississauga Times issues",
 "depositor": "omcn",
 "images": true,
 "type": "issue",
 "dmdtype": "issueinfo"
}
id "news_issues"
{
 "_id": "news_issues",
 "title": "Newspaper Issues",
 "depositor": "oocihm",
 "images": true,
 "type": "issue",
 "dmdtype": "issueinfo",
 "componentxml": true,
 "componentpdf": true
}
id "news_issues_no_OCR"
{
 "_id": "news_issues_no_OCR",
 "title": "Newspaper Issues (No OCR)",
 "depositor": "oocihm",
 "images": true,
 "type": "issue",
 "dmdtype": "issueinfo",
 "componentxml": false,
 "componentpdf": false
}
id "nrcan"
{
 "_id": "nrcan",
 "title": "NRCAN",
 "depositor": "ooga",
 "images": true,
 "type": "document",
 "dmdtype": "DC"
}
id "nrcan_ocr"
{
 "_id": "nrcan_ocr",
 "title": "NRCAN - ocr",
 "depositor": "ooga",
 "images": true,
 "type": "document",
 "dmdtype": "DC"
}
id "numeris"
{
 "_id": "numeris",
 "title": "Numeris - Monographs",
 "depositor": "numeris",
 "images": true,
 "type": "document",
 "dmdtype": "DC"
}
id "numeris_page"
{
 "_id": "numeris_page",
 "title": "Numeris - Monographs - Page ocr",
 "depositor": "numeris",
 "images": true,
 "type": "document",
 "dmdtype": "DC"
}
id "oocihm_issues"
{
 "_id": "oocihm_issues",
 "title": "OOCIHM Issues (OCR required)",
 "depositor": "oocihm",
 "images": true,
 "type": "issue",
 "dmdtype": "issueinfo",
 "componentxml": true,
 "componentpdf": true
}
id "oocihm_monog"
{
 "_id": "oocihm_monog",
 "title": "OOCIHM Monographs",
 "depositor": "oocihm",
 "images": true,
 "type": "document",
 "dmdtype": "marc",
 "componentpdf": true
}
id "oocihm_monog_DC"
{
 "_id": "oocihm_monog_DC",
 "title": "OOCIHM Monographs DC",
 "depositor": "oocihm",
 "images": true,
 "type": "document",
 "dmdtype": "DC",
 "componentpdf": true
}
id "oop_issues"
{
 "_id": "oop_issues",
 "title": "Parliament Issues",
 "depositor": "oop",
 "images": true,
 "type": "issue",
 "dmdtype": "issueinfo"
}
id "oop_monog_DC"
{
 "_id": "oop_monog_DC",
 "title": "Parliament Monographs DC",
 "depositor": "oop",
 "images": true,
 "type": "document",
 "dmdtype": "DC",
 "componentpdf": true
}
id "per_issues"
{
 "_id": "per_issues",
 "title": "Periodical Issues",
 "depositor": "oocihm",
 "images": true,
 "type": "issue",
 "dmdtype": "issueinfo",
 "componentxml": true,
 "componentpdf": true
}
id "qmma_issues"
{
 "_id": "qmma_issues",
 "title": "McGill Archive Issues",
 "depositor": "qmma",
 "images": true,
 "type": "issue",
 "dmdtype": "issueinfo"
}
id "south_mountain"
{
 "_id": "south_mountain",
 "title": "Friends of South Mountain - Monographs",
 "depositor": "osmsdga",
 "images": true,
 "type": "document",
 "dmdtype": "DC"
}
id "student"
{
 "_id": "student",
 "title": "Student Voice",
 "depositor": "carl",
 "images": true,
 "type": "issue",
 "dmdtype": "issueinfo"
}

Note: The following Mango query was used:

{
   "selector": {
      "$or": [
         {
            "dmdtype": {
               "$exists": true
            }
         },
         {
            "componentxml": {
               "$exists": true
            }
         },
         {
            "componentpdf": {
               "$exists": true
            }
         }
      ],
      "itempdf": {
         "$eq": true
      }
   },
   "fields": [
      "_id",
      "title",
      "depositor",
      "images",
      "type",
      "dmdtype",
      "itempdf",
      "componentxml",
      "componentpdf"
   ]
}
RussellMcOrmond commented 2 years ago

I want to discuss this with @JLoitzenbauer-CRKN and @BeS4, to make it clear when each option should be used.

Looking more closely at "heritage" and "heritage_ocr":

Ideally we can remove the file conversion option soon, but in the interim let me know if we want to just use "heritage" for everything and remove the "heritage_ocr" config.

We would also set the following to disallow OCR data in those directories:

 "componentxml": false,
 "componentpdf": false
RussellMcOrmond commented 2 years ago

The following configurations have "itempdf" set to true, but we haven't been generating multi-page PDFs with our OCR software for several years. I recommend we delete these, and then create new configurations if we need to ingest more content into these depositors prior to our migration to Archivematica.

{
 "_id": "miss_times",
 "title": "Mississauga Times issues",
 "depositor": "omcn",
 "images": true,
 "type": "issue",
 "dmdtype": "issueinfo",
 "itempdf": true
}
{
 "_id": "numeris",
 "title": "Numeris - Monographs",
 "depositor": "numeris",
 "images": true,
 "type": "document",
 "dmdtype": "DC",
 "itempdf": true
}
{
 "_id": "oop_issues",
 "title": "Parliament Issues",
 "depositor": "oop",
 "images": true,
 "type": "issue",
 "dmdtype": "issueinfo",
 "itempdf": true
}
{
 "_id": "student",
 "title": "Student Voice",
 "depositor": "carl",
 "images": true,
 "type": "issue",
 "dmdtype": "issueinfo",
 "itempdf": true
}
RussellMcOrmond commented 2 years ago

Updated "oocihm_issues" to set:

  "componentpdf": false,
  "componentxml": false
RussellMcOrmond commented 2 years ago

Uploading the current configurations for reference, and as a backup to revert to if we make a change we don't like.

wip_config.zip

RussellMcOrmond commented 2 years ago

Looking for feedback from @JLoitzenbauer-CRKN and @BeS4

RussellMcOrmond commented 2 years ago

Received a spreadsheet from Beth. Some configs to add, many more to remove.

RussellMcOrmond commented 2 years ago

Various cleanups of configs, and renamed to make it obvious what type of metadata each is expecting.

dfait_dc
dfait_issues
dfait_marc
heritage_dc
heritage_issues
oocihm_dc
oocihm_issues
oocihm_marc
qmma_dc
qmma_issues
RussellMcOrmond commented 2 years ago

The *_marc entries were also removed.

The "i2objid" rules for translating Heritage reel IDs to the lac_reel_* IDs we use for Heritage have been copied from heritage_* to oocihm_*.

Once tested, we can also delete heritage_dc and heritage_issues.

russell@russell-XPS-13-7390:~$ curl -s http://jarlsberg.tor.c7a.ca:5984/wipmeta/oocihm_dc | json_pp
{
   "type" : "document",
   "_id" : "oocihm_dc",
   "componentpdf" : false,
   "i2objid" : [
      {
         "replace" : "lac_reel_c$1",
         "search" : "^[Cc]-(.*)$"
      },
      {
         "replace" : "lac_reel_t$1",
         "search" : "^[Tt]-(.*)$"
      },
      {
         "search" : "^[Hh]-(.*)$",
         "replace" : "lac_reel_h$1"
      }
   ],
   "depositor" : "oocihm",
   "images" : true,
   "componentxml" : false,
   "title" : "OOCIHM DC",
   "dmdtype" : "DC",
   "_rev" : "29-ee555e5bdd102376925868be48a06f2f",
   "fileconfig" : [
      {
         "regex" : "\\.jpg$",
         "class" : "jpg"
      },
      {
         "class" : "tif",
         "regex" : "\\.tif$"
      },
      {
         "class" : "jp2",
         "regex" : "\\.jp2$"
      },
      {
         "class" : "pdf",
         "regex" : "\\.pdf$"
      },
      {
         "regex" : "\\.xml$",
         "class" : "xml"
      },
      {
         "regex" : "\\.txt$",
         "ignore" : true,
         "class" : "text-ignored"
      },
      {
         "class" : "Thumbs.db-ignored",
         "ignore" : true,
         "regex" : "Thumbs\\.db$"
      }
   ]
}
russell@russell-XPS-13-7390:~$ curl -s http://jarlsberg.tor.c7a.ca:5984/wipmeta/oocihm_issues | json_pp
{
   "i2objid" : [
      {
         "replace" : "lac_reel_c$1",
         "search" : "^[Cc]-(.*)$"
      },
      {
         "replace" : "lac_reel_t$1",
         "search" : "^[Tt]-(.*)$"
      },
      {
         "replace" : "lac_reel_h$1",
         "search" : "^[Hh]-(.*)$"
      }
   ],
   "depositor" : "oocihm",
   "images" : true,
   "dmdtype" : "issueinfo",
   "_rev" : "19-c393526bb7608e4d237c36aebb10a474",
   "componentxml" : false,
   "fileconfig" : [
      {
         "regex" : "\\.jpg$",
         "class" : "jpg"
      },
      {
         "class" : "tif",
         "regex" : "\\.tif$"
      },
      {
         "regex" : "\\.jp2$",
         "class" : "jp2"
      },
      {
         "class" : "pdf",
         "regex" : "\\.pdf$"
      },
      {
         "class" : "xml",
         "regex" : "\\.xml$"
      },
      {
         "ignore" : true,
         "class" : "text-ignored",
         "regex" : "\\.txt$"
      },
      {
         "ignore" : true,
         "regex" : "Thumbs\\.db$",
         "class" : "Thumbs.db-ignored"
      }
   ],
   "_id" : "oocihm_issues",
   "componentpdf" : false,
   "title" : "OOCIHM Issues",
   "type" : "issue"
}
russell@russell-XPS-13-7390:~$ curl -s http://jarlsberg.tor.c7a.ca:5984/wipmeta/heritage_dc | json_pp
{
   "dmdtype" : "DC",
   "title" : "Heritage Dublin Core",
   "componentpdf" : false,
   "i2objid" : [
      {
         "search" : "^[Cc]-(.*)$",
         "replace" : "lac_reel_c$1"
      },
      {
         "search" : "^[Tt]-(.*)$",
         "replace" : "lac_reel_t$1"
      },
      {
         "search" : "^[Hh]-(.*)$",
         "replace" : "lac_reel_h$1"
      }
   ],
   "componentxml" : false,
   "type" : "document",
   "_id" : "heritage_dc",
   "_rev" : "18-953fe361808601f6903546cf07034a15",
   "fileconfig" : [
      {
         "regex" : "\\.jpg$",
         "class" : "jpg"
      },
      {
         "regex" : "\\.tif$",
         "class" : "tif"
      },
      {
         "regex" : "\\.jp2$",
         "class" : "jp2"
      },
      {
         "regex" : "\\.pdf$",
         "class" : "pdf"
      },
      {
         "class" : "xml",
         "regex" : "\\.xml$"
      },
      {
         "regex" : "\\.txt$",
         "ignore" : true,
         "class" : "text-ignored"
      },
      {
         "regex" : "Thumbs\\.db$",
         "ignore" : true,
         "class" : "Thumbs.db-ignored"
      }
   ],
   "images" : true,
   "depositor" : "oocihm"
}
russell@russell-XPS-13-7390:~$ curl -s http://jarlsberg.tor.c7a.ca:5984/wipmeta/heritage_issues | json_pp
{
   "depositor" : "oocihm",
   "title" : "Heritage Issues",
   "type" : "issue",
   "fileconfig" : [
      {
         "class" : "jpg",
         "regex" : "\\.jpg$"
      },
      {
         "class" : "tif",
         "regex" : "\\.tif$"
      },
      {
         "regex" : "\\.jp2$",
         "class" : "jp2"
      },
      {
         "regex" : "\\.pdf$",
         "class" : "pdf"
      },
      {
         "regex" : "\\.xml$",
         "class" : "xml"
      },
      {
         "class" : "text-ignored",
         "regex" : "\\.txt$",
         "ignore" : true
      },
      {
         "class" : "Thumbs.db-ignored",
         "regex" : "Thumbs\\.db$",
         "ignore" : true
      }
   ],
   "i2objid" : [
      {
         "search" : "^[Cc]-(.*)$",
         "replace" : "lac_reel_c$1"
      },
      {
         "replace" : "lac_reel_t$1",
         "search" : "^[Tt]-(.*)$"
      },
      {
         "search" : "^[Hh]-(.*)$",
         "replace" : "lac_reel_h$1"
      }
   ],
   "componentxml" : false,
   "images" : true,
   "dmdtype" : "issueinfo",
   "componentpdf" : false,
   "_rev" : "17-1578657b8a09f7bef37ed678b384cc2c",
   "_id" : "heritage_issues"
}
russell@russell-XPS-13-7390:~$