TAMULib / fw-registry

MIT License
0 stars 2 forks source link

[Issue #350]: Duplicate Instance Report Workflow #394

Closed wwelling closed 3 weeks ago

wwelling commented 4 weeks ago

This workflow should identify duplicate instances based on the following comparisons:

The output of this workflow should be a report in the following format:

Screenshot 2024-09-11 at 10 26 02 AM

This is closely followed excluding the Match in the column headers and call number column will not be included. Additional columns comparing title and author have been added as well.

OCLC Match

ISBN Match

ISSN Match

LCCN Match

Call Number Match

Caveats:

  1. There are far too many call number matches to include in this report
  2. ISBN require additional normalization to handle semi-colon and dollar sign
  3. ISSN only had content for ord:1

Here is the instance duplicate report query excluding matches on call number:

WITH oclc_with_title AS (
  SELECT ie.instance_hrid, LTRIM(REGEXP_REPLACE(SUBSTRING(sm.content FROM 8), '[^0-9]', '', 'g'), '0') AS oclc, ie.title
  FROM folio_reporting.instance_ext ie
  JOIN public.srs_marctab sm ON ie.instance_hrid = sm.instance_hrid
  WHERE sm.field = '035' AND sm.ord = 1 AND sm.sf IN ('a', 'z') AND sm.content LIKE '(OCoLC)%'
),
oclc AS (
  SELECT oclcwt.instance_hrid, oclcwt.oclc, oclcwt.title, sm.content AS author
  FROM oclc_with_title oclcwt
  JOIN public.srs_marctab sm ON oclcwt.instance_hrid = sm.instance_hrid
  WHERE sm.field = '100' AND sm.ord = 1 AND sm.sf = 'a'
),
oclc_matches AS (
  SELECT l.instance_hrid AS hrid, r.instance_hrid AS hrid2, l.title AS title, r.title AS title2, l.author AS author, r.author AS author2, 'oclc' AS match_type
  FROM oclc l
  JOIN oclc r ON l.oclc = r.oclc AND l.instance_hrid < r.instance_hrid
),
isbn AS (
  SELECT ie.instance_hrid, NULLIF(LEFT(RIGHT(REGEXP_REPLACE(ii.identifier, ' .*', ''), 10), 9), ':') AS isbn, ie.title, sm.content AS author
  FROM folio_reporting.instance_ext ie
  JOIN folio_reporting.instance_identifiers ii ON ie.instance_hrid = ii.instance_hrid
  JOIN public.srs_marctab sm ON ie.instance_hrid = sm.instance_hrid
  WHERE ii.identifier_type_name = 'ISBN' AND ii.identifier NOT SIMILAR TO '(:|$)%'
    AND sm.field = '100' AND sm.ord = 1 AND sm.sf = 'a'
),
isbn_matches AS (
  SELECT l.instance_hrid AS hrid, r.instance_hrid AS hrid2, l.title AS title, r.title AS title2, l.author AS author, r.author AS author2, 'isbn' AS match_type
  FROM isbn l
  JOIN isbn r ON l.isbn = r.isbn AND l.instance_hrid < r.instance_hrid
),
lccn AS (
  SELECT ie.instance_hrid, ii.identifier AS lccn, ie.title, sm.content AS author
  FROM folio_reporting.instance_ext ie
  JOIN folio_reporting.instance_identifiers ii ON ie.instance_hrid = ii.instance_hrid
  JOIN public.srs_marctab sm ON ie.instance_hrid = sm.instance_hrid
  WHERE ii.identifier_type_name = 'LCCN'
    AND sm.field = '100' AND sm.ord = 1 AND sm.sf = 'a'
),
lccn_matches AS (
  SELECT l.instance_hrid AS hrid, r.instance_hrid AS hrid2, l.title AS title, r.title AS title2, l.author AS author, r.author AS author2, 'lccn' AS match_type
  FROM lccn l
  JOIN lccn r ON l.lccn = r.lccn  AND l.instance_hrid < r.instance_hrid
),
issn_with_title AS (
  SELECT ie.instance_hrid, sm.content AS issn, ie.title
  FROM folio_reporting.instance_ext ie
  JOIN public.srs_marctab sm ON ie.instance_hrid = sm.instance_hrid
  WHERE field = '022' AND ord = 1 AND sf = 'a'
),
issn AS (
  SELECT issnwt.instance_hrid, issnwt.issn, issnwt.title, sm.content AS author
  FROM issn_with_title issnwt
  JOIN public.srs_marctab sm ON issnwt.instance_hrid = sm.instance_hrid
  WHERE sm.field = '100' AND sm.ord = 1 AND sm.sf = 'a'
),
issn_matches AS (
  SELECT l.instance_hrid AS hrid, r.instance_hrid AS hrid2, l.title AS title, r.title AS title2, l.author AS author, r.author AS author2, 'issn' AS match_type
  FROM issn l
  JOIN issn r ON l.issn = r.issn AND l.instance_hrid < r.instance_hrid
),
all_matches AS (
  SELECT hrid, hrid2, title, title2, author, author2, match_type FROM oclc_matches
  UNION ALL SELECT hrid, hrid2, title, title2, author, author2, match_type FROM isbn_matches
  UNION ALL SELECT hrid, hrid2, title, title2, author, author2, match_type FROM lccn_matches
  UNION ALL SELECT hrid, hrid2, title, title2, author, author2, match_type FROM issn_matches
)
SELECT
  hrid AS HRID,
  hrid2 AS HRID2,
  MAX(CASE WHEN match_type = 'oclc' THEN 'T' END) AS OCLC,
  MAX(CASE WHEN match_type = 'isbn' THEN 'T' END) AS ISBN,
  MAX(CASE WHEN match_type = 'issn' THEN 'T' END) AS ISSN,
  MAX(CASE WHEN match_type = 'lccn' THEN 'T' END) AS LCCN,
  '"' || REPLACE(title, '"', '""') || '"' AS TITLE,
  '"' || REPLACE(title2, '"', '""') || '"' AS TITLE2,
  '"' || REPLACE(author, '"', '""') || '"' AS AUTHOR,
  '"' || REPLACE(author2, '"', '""') || '"' AS AUTHOR2
FROM all_matches
GROUP BY hrid, hrid2, title, title2, author, author2;

Here is the complete instance duplication report query with call number matches:

very slow very sparse report

WITH filtered_holdings AS (
    SELECT instance_id, call_number, call_number_prefix, call_number_suffix
    FROM folio_reporting.holdings_ext
    WHERE he.call_number IS NOT NULL AND he.call_number !~ '^\s*$' AND he.call_number_type_id IS NOT NULL
),
filtered_marctab AS (
    SELECT instance_hrid, content
    FROM public.srs_marctab
    WHERE field = '100' AND ord = 1 AND sf = 'a'
),
call_number AS (
  SELECT ie.instance_hrid, TRIM(CONCAT_WS(' ', he.call_number_prefix, he.call_number, he.call_number_suffix)) AS full_call_number, ie.title, sm.content AS author
  FROM folio_reporting.instance_ext ie
  JOIN filtered_holdings he ON ie.instance_id = he.instance_id
  JOIN filtered_marctab sm ON ie.instance_hrid = sm.instance_hrid
),
call_number_matches AS (
  SELECT l.instance_hrid AS hrid, r.instance_hrid AS hrid2, l.title AS title, r.title AS title2, l.author AS author, r.author AS author2, 'call_number' AS match_type
  FROM call_number l
  JOIN call_number r ON l.full_call_number = r.full_call_number AND l.instance_hrid < r.instance_hrid
),
oclc_with_title AS (
  SELECT ie.instance_hrid, LTRIM(REGEXP_REPLACE(SUBSTRING(sm.content FROM 8), '[^0-9]', '', 'g'), '0') AS oclc, ie.title
  FROM folio_reporting.instance_ext ie
  JOIN public.srs_marctab sm ON ie.instance_hrid = sm.instance_hrid
  WHERE sm.field = '035' AND sm.ord = 1 AND sm.sf IN ('a', 'z') AND sm.content LIKE '(OCoLC)%'
),
oclc AS (
  SELECT oclcwt.instance_hrid, oclcwt.oclc, oclcwt.title, sm.content AS author
  FROM oclc_with_title oclcwt
  JOIN public.srs_marctab sm ON oclcwt.instance_hrid = sm.instance_hrid
  WHERE sm.field = '100' AND sm.ord = 1 AND sm.sf = 'a'
),
oclc_matches AS (
  SELECT l.instance_hrid AS hrid, r.instance_hrid AS hrid2, l.title AS title, r.title AS title2, l.author AS author, r.author AS author2, 'oclc' AS match_type
  FROM oclc l
  JOIN oclc r ON l.oclc = r.oclc AND l.instance_hrid < r.instance_hrid
),
isbn AS (
  SELECT ie.instance_hrid, NULLIF(LEFT(RIGHT(REGEXP_REPLACE(ii.identifier, ' .*', ''), 10), 9), ':') AS isbn, ie.title, sm.content AS author
  FROM folio_reporting.instance_ext ie
  JOIN folio_reporting.instance_identifiers ii ON ie.instance_hrid = ii.instance_hrid
  JOIN public.srs_marctab sm ON ie.instance_hrid = sm.instance_hrid
  WHERE ii.identifier_type_name = 'ISBN' AND ii.identifier NOT SIMILAR TO '(:|$)%'
    AND sm.field = '100' AND sm.ord = 1 AND sm.sf = 'a'
),
isbn_matches AS (
  SELECT l.instance_hrid AS hrid, r.instance_hrid AS hrid2, l.title AS title, r.title AS title2, l.author AS author, r.author AS author2, 'isbn' AS match_type
  FROM isbn l
  JOIN isbn r ON l.isbn = r.isbn AND l.instance_hrid < r.instance_hrid
),
lccn AS (
  SELECT ie.instance_hrid, ii.identifier AS lccn, ie.title, sm.content AS author
  FROM folio_reporting.instance_ext ie
  JOIN folio_reporting.instance_identifiers ii ON ie.instance_hrid = ii.instance_hrid
  JOIN public.srs_marctab sm ON ie.instance_hrid = sm.instance_hrid
  WHERE ii.identifier_type_name = 'LCCN'
    AND sm.field = '100' AND sm.ord = 1 AND sm.sf = 'a'
),
lccn_matches AS (
  SELECT l.instance_hrid AS hrid, r.instance_hrid AS hrid2, l.title AS title, r.title AS title2, l.author AS author, r.author AS author2, 'lccn' AS match_type
  FROM lccn l
  JOIN lccn r ON l.lccn = r.lccn  AND l.instance_hrid < r.instance_hrid
),
issn_with_title AS (
  SELECT ie.instance_hrid, sm.content AS issn, ie.title
  FROM folio_reporting.instance_ext ie
  JOIN public.srs_marctab sm ON ie.instance_hrid = sm.instance_hrid
  WHERE field = '022' AND ord = 1 AND sf = 'a'
),
issn AS (
  SELECT issnwt.instance_hrid, issnwt.issn, issnwt.title, sm.content AS author
  FROM issn_with_title issnwt
  JOIN public.srs_marctab sm ON issnwt.instance_hrid = sm.instance_hrid
  WHERE sm.field = '100' AND sm.ord = 1 AND sm.sf = 'a'
),
issn_matches AS (
  SELECT l.instance_hrid AS hrid, r.instance_hrid AS hrid2, l.title AS title, r.title AS title2, l.author AS author, r.author AS author2, 'issn' AS match_type
  FROM issn l
  JOIN issn r ON l.issn = r.issn AND l.instance_hrid < r.instance_hrid
),
all_matches AS (
  SELECT hrid, hrid2, title, title2, author, author2, match_type FROM call_number_matches
  UNION ALL SELECT hrid, hrid2, title, title2, author, author2, match_type FROM oclc_matches
  UNION ALL SELECT hrid, hrid2, title, title2, author, author2, match_type FROM isbn_matches
  UNION ALL SELECT hrid, hrid2, title, title2, author, author2, match_type FROM lccn_matches
  UNION ALL SELECT hrid, hrid2, title, title2, author, author2, match_type FROM issn_matches
)
SELECT
  hrid AS HRID,
  hrid2 AS HRID2,
  MAX(CASE WHEN match_type = 'oclc' THEN 'T' END) AS OCLC,
  MAX(CASE WHEN match_type = 'isbn' THEN 'T' END) AS ISBN,
  MAX(CASE WHEN match_type = 'issn' THEN 'T' END) AS ISSN,
  MAX(CASE WHEN match_type = 'call_number' THEN 'T' END) AS CALL_NUMBER,
  MAX(CASE WHEN match_type = 'lccn' THEN 'T' END) AS LCCN,
  '"' || REPLACE(title, '"', '""') || '"' AS TITLE,
  '"' || REPLACE(title2, '"', '""') || '"' AS TITLE2,
  '"' || REPLACE(author, '"', '""') || '"' AS AUTHOR,
  '"' || REPLACE(author2, '"', '""') || '"' AS AUTHOR2
FROM all_matches
GROUP BY hrid, hrid2, title, title2, author, author2;

Here are the individual identifier match queries:

Call Number

WITH call_number AS (
  SELECT ie.instance_hrid, he.call_number_type_id, he.call_number_type_name, he.call_number, TRIM(CONCAT_WS(' ', he.call_number_prefix, he.call_number, he.call_number_suffix)) AS full_call_number, ie.title, sm.content AS author
  FROM folio_reporting.instance_ext ie
  JOIN folio_reporting.holdings_ext he ON ie.instance_id = he.instance_id
  JOIN public.srs_marctab sm ON ie.instance_hrid = sm.instance_hrid
  WHERE he.call_number IS NOT NULL AND he.call_number !~ '^\s*$' AND he.call_number_type_id IS NOT NULL
    AND sm.field = '100' AND sm.ord = 1 AND sm.sf = 'a'
)
SELECT
  l.instance_hrid AS hrid,
  r.instance_hrid AS hrid2,
  l.call_number AS call_number,
  r.call_number AS call_number2,
  l.full_call_number,
  '"' || REPLACE(l.title, '"', '""') || '"' AS title,
  '"' || REPLACE(r.title, '"', '""') || '"' AS title2,
  '"' || REPLACE(l.author, '"', '""') || '"' AS author,
  '"' || REPLACE(r.author, '"', '""') || '"' AS author2
FROM call_number l
JOIN call_number r ON l.full_call_number = r.full_call_number AND l.instance_hrid < r.instance_hrid;

ISBN

WITH isbn AS (
  SELECT ie.instance_hrid, ii.identifier, NULLIF(LEFT(RIGHT(REGEXP_REPLACE(ii.identifier, ' .*', ''), 10), 9), ':') AS isbn, ie.title, sm.content AS author
  FROM folio_reporting.instance_ext ie
  JOIN folio_reporting.instance_identifiers ii ON ie.instance_hrid = ii.instance_hrid
  JOIN public.srs_marctab sm ON ie.instance_hrid = sm.instance_hrid
  WHERE ii.identifier_type_name = 'ISBN' AND ii.identifier NOT SIMILAR TO '(:|$)%'
    AND sm.field = '100' AND sm.ord = 1 AND sm.sf = 'a'
)
SELECT
  l.instance_hrid AS hrid,
  r.instance_hrid AS hrid2,
  l.identifier,
  l.isbn,
  '"' || REPLACE(l.title, '"', '""') || '"' AS title,
  '"' || REPLACE(r.title, '"', '""') || '"' AS title2,
  '"' || REPLACE(l.author, '"', '""') || '"' AS author,
  '"' || REPLACE(r.author, '"', '""') || '"' AS author2
FROM isbn l
JOIN isbn r ON l.isbn = r.isbn AND l.instance_hrid < r.instance_hrid;

LCCN

WITH lccn AS (
  SELECT ie.instance_hrid, ii.identifier AS lccn, ie.title, sm.content AS author
  FROM folio_reporting.instance_ext ie
  JOIN folio_reporting.instance_identifiers ii ON ie.instance_hrid = ii.instance_hrid
  JOIN public.srs_marctab sm ON ie.instance_hrid = sm.instance_hrid
  WHERE ii.identifier_type_name = 'LCCN'
    AND sm.field = '100' AND sm.ord = 1 AND sm.sf = 'a'
)
SELECT
  l.instance_hrid AS hrid,
  r.instance_hrid AS hrid2,
  l.lccn,
  '"' || REPLACE(l.title, '"', '""') || '"' AS title,
  '"' || REPLACE(r.title, '"', '""') || '"' AS title2,
  '"' || REPLACE(l.author, '"', '""') || '"' AS author,
  '"' || REPLACE(r.author, '"', '""') || '"' AS author2
FROM lccn l
JOIN lccn r ON l.lccn = r.lccn  AND l.instance_hrid < r.instance_hrid;

ISSN

WITH issn_with_title AS (
  SELECT ie.instance_hrid, sm.content AS issn, ie.title
  FROM folio_reporting.instance_ext ie
  JOIN public.srs_marctab sm ON ie.instance_hrid = sm.instance_hrid
  WHERE field = '022' AND ord = 1 AND sf = 'a'
),
issn AS (
  SELECT issnwt.instance_hrid, issnwt.issn, issnwt.title, sm.content AS author
  FROM issn_with_title issnwt
  JOIN public.srs_marctab sm ON issnwt.instance_hrid = sm.instance_hrid
  WHERE sm.field = '100' AND sm.ord = 1 AND sm.sf = 'a'
)
SELECT
  l.instance_hrid AS hrid,
  r.instance_hrid AS hrid2,
  l.issn,
  '"' || REPLACE(l.title, '"', '""') || '"' AS title,
  '"' || REPLACE(r.title, '"', '""') || '"' AS title2,
  '"' || REPLACE(l.author, '"', '""') || '"' AS author,
  '"' || REPLACE(r.author, '"', '""') || '"' AS author2
FROM issn l
JOIN issn r ON l.issn = r.issn AND l.instance_hrid < r.instance_hrid;

OCLC

WITH oclc_with_title AS (
  SELECT ie.instance_hrid, LTRIM(REGEXP_REPLACE(SUBSTRING(sm.content FROM 8), '[^0-9]', '', 'g'), '0') AS oclc, ie.title
  FROM folio_reporting.instance_ext ie
  JOIN public.srs_marctab sm ON ie.instance_hrid = sm.instance_hrid
  WHERE sm.field = '035' AND sm.ord = 1 AND sm.sf IN ('a', 'z') AND sm.content LIKE '(OCoLC)%'
),
oclc AS (
  SELECT oclcwt.instance_hrid, oclcwt.oclc, oclcwt.title, sm.content AS author
  FROM oclc_with_title oclcwt
  JOIN public.srs_marctab sm ON oclcwt.instance_hrid = sm.instance_hrid
  WHERE sm.field = '100' AND sm.ord = 1 AND sm.sf = 'a'
)
SELECT
  l.instance_hrid AS hrid,
  r.instance_hrid AS hrid2,
  l.oclc,
  '"' || REPLACE(l.title, '"', '""') || '"' AS title,
  '"' || REPLACE(r.title, '"', '""') || '"' AS title2,
  '"' || REPLACE(l.author, '"', '""') || '"' AS author,
  '"' || REPLACE(r.author, '"', '""') || '"' AS author2
FROM oclc l
JOIN oclc r ON l.oclc = r.oclc AND l.instance_hrid < r.instance_hrid;

image