MichalTorma / pladias-dwc-export

MIT License
0 stars 0 forks source link

VIEW darwin_core_extended_occurrence #1

Open Jules- opened 8 months ago

Jules- commented 8 months ago

Okomentuji SQL

CREATE VIEW darwin_core_extended_occurrence AS
SELECT
    r.id AS occurrenceID,
-- VK: chybně t.scientific_name AS scientificName,
    t.name_lat AS scientificName,
    r.latitude AS decimalLatitude,
    r.longitude AS decimalLongitude,
    r.datum AS eventDate,
    r.locality,
    r.altitude_min AS minimumElevationInMeters,
    r.altitude_max AS maximumElevationInMeters,
    r.comment AS occurrenceRemarks,
-- VK: Co tady potřebujete? Záznam má více autorů. Je to vazba M : N. Předpokládám, že id z databáze Pladias Vám nepomůže nebo ano?
    ra.authors_id AS recordedByID,  -- Assuming recorder's ID maps to an author in the authors table
    r.source,
    r.environment,
-- VK: Zde Vám stačí id integer z tabulky atlas.projects?
    r.project_id AS associatedSequences, -- Example of mapping project ID to associated sequences; adjust based on actual data mapping
-- VK: Co by mělo být zde? Jedná se o id z tabulky atlas.record_validation_status
    r.validation_status AS identificationVerificationStatus,
    r.original_name AS verbatimScientificName,
    r.gps_coords_precision AS coordinatePrecision,
-- VK: možné hodnoty Y, M nebo D, které určují, jestli je z pole datum platný pouze rok nebo i měsíc nebo celé datum.
    r.datum_precision AS verbatimEventDate,
-- VK: duplicitní s r.locality
    r.nearest_town_text AS locality,
-- VK: jedná se o integer id z tabulky fytochorionů (geodata.phytochorions)
    r.phytochorion_id AS locationID -- Example of using phytochorion_id as a location ID
FROM
    atlas.records AS r
LEFT JOIN public.taxons AS t ON r.taxon_id = t.id
LEFT JOIN atlas.records_authors AS ra ON r.id = ra.records_id
WHERE
-- VK: U tohoto příznaku je to komplikovanější, protože v určitých případech chtěli v českém Pladiasu, aby bylo možné vložit do mapy i odmítnuté záznamy. Možná by bylo vhodnější vybírat podle validation status. 
    r.include_in_map = TRUE;

Podle https://dwc.tdwg.org/terms/#dwc:recordedByID by to mohlo být možná takto:

SELECT
    r.id AS occurrenceID,
    string_agg(cast(ra.authors_id as varchar), '|') AS recordedByID
FROM
    atlas.records AS r
        LEFT JOIN atlas.records_authors AS ra ON r.id = ra.records_id
WHERE
    r.include_in_map = TRUE
GROUP BY r.id
MichalTorma commented 8 months ago

Prepacte za neskoru odpoved - minuly tyzden som toho mal vela. Tu to mam s upravami a komentarmi:

SELECT
-- MT: Tu je otazka ci je tento identifikator dostatocne stabilny aby sme ho puzivali. standarne pouzivame uuid alebo triplet institutionCode:collectionCode:catalogNumber
    CONCAT(p.institution_id, ':', p.abbrev, ':', r.id) AS occurrenceID,
-- VK: chybně t.scientific_name AS scientificName,
-- MT: pravda - netusim odkial som nabral scientific_name :)
    t.name_lat AS scientificName,
    r.latitude AS decimalLatitude,
    r.longitude AS decimalLongitude,
    r.datum AS eventDate,
    r.locality,
    r.altitude_min AS minimumElevationInMeters,
    r.altitude_max AS maximumElevationInMeters,
    r.comment AS occurrenceRemarks,
-- VK: Co tady potřebujete? Záznam má více autorů. Je to vazba M : N. Předpokládám, že id z databáze Pladias Vám nepomůže nebo ano?
-- MT: Mate pravdu - to id nepotrebujem - skoda ze nepouzivate ORCID. Mohlo byt is ist takto? :
    string_agg(CONCAT(ra.name, ' ', ra.surname), '|') AS recordedBy
    r.source,
    r.environment,
-- VK: Zde Vám stačí id integer z tabulky atlas.projects?
-- MT: Toto mozeme vypustit - to bola GPT halucinacia
    -- r.project_id AS associatedSequences, -- Example of mapping project ID to associated sequences; adjust based on actual data mapping
-- VK: Co by mělo být zde? Jedná se o id z tabulky atlas.record_validation_status
-- MT: tu neviem ake su moznosti - v priklade co ste mi poslal mali vsetky hodnotu 3. viete mi o tom povedat viac?
    r.validation_status AS identificationVerificationStatus,
    r.original_name AS verbatimScientificName,
    r.gps_coords_precision AS coordinatePrecision,
-- VK: možné hodnoty Y, M nebo D, které určují, jestli je z pole datum platný pouze rok nebo i měsíc nebo celé datum.
-- I see takze r.datum je vzdy ISO date ale podla tohoto stlpca treba ignorovat mesiac/den ?
    -- r.datum_precision AS verbatimEventDate,
-- VK: duplicitní s r.locality
    -- r.locality AS locality,
-- VK: jedná se o integer id z tabulky fytochorionů (geodata.phytochorions)
-- MT: Kedze sa nechystame publikovat aj tabulku fytochorionov, tak mozeme tento stlpec zhodit tiez
    -- r.phytochorion_id AS locationID -- Example of using phytochorion_id as a location ID
FROM
    atlas.records AS r
LEFT JOIN public.taxons AS t ON r.taxon_id = t.id
LEFT JOIN atlas.records_authors AS ra ON r.id = ra.records_id
LEFT JOIN public.projects AS p ON r.records_id = p.id
WHERE
-- VK: U tohoto příznaku je to komplikovanější, protože v určitých případech chtěli v českém Pladiasu, aby bylo možné vložit do mapy i odmítnuté záznamy. Možná by bylo vhodnější vybírat podle validation status. 
    r.include_in_map = TRUE;
-- pridame toto nech vieme robit agregaciu
GROUP BY r.id
Jules- commented 8 months ago

Upravil jsem:

SELECT
-- MT: Tu je otazka ci je tento identifikator dostatocne stabilny aby sme ho puzivali. standarne pouzivame uuid alebo triplet institutionCode:collectionCode:catalogNumber
-- VK2: Myslím, že by se to nemělo brát z projektu. Projekt je pro nás například databáze ze které jsou záznamy importovány.
--      Dával by mi smysl konstantní prefix pro identifikaci instituce. Např. BU-SAV.
CONCAT(p.institution_id, ':', p.abbrev, ':', r.id) AS occurrenceID_wrong,
CONCAT('BU-SAV:', r.id) AS occurrenceID,
-- VK: chybně t.scientific_name AS scientificName,
-- MT: pravda - netusim odkial som nabral scientific_name :)
t.name_lat AS scientificName,
r.latitude AS decimalLatitude,
r.longitude AS decimalLongitude,
r.datum AS eventDate,
r.locality,
r.altitude_min AS minimumElevationInMeters,
r.altitude_max AS maximumElevationInMeters,
r.comment AS occurrenceRemarks,
-- VK: Co tady potřebujete? Záznam má více autorů. Je to vazba M : N. Předpokládám, že id z databáze Pladias Vám nepomůže nebo ano?
-- MT: Mate pravdu - to id nepotrebujem - skoda ze nepouzivate ORCID. Mohlo byt is ist takto? :
-- VK2: zkoušel jsem to a zdá se, že subquery je výrazně efektivnější
(
    SELECT string_agg(CASE WHEN a.name = '' THEN a.surname ELSE CONCAT(a.name, ' ', a.surname) END, '|')
    FROM atlas.records_authors
    INNER JOIN atlas.authors AS a ON records_authors.authors_id = a.id
    WHERE records_id = r.id) AS recordedBy,
r.source,
r.environment,
-- VK: Zde Vám stačí id integer z tabulky atlas.projects?
-- MT: Toto mozeme vypustit - to bola GPT halucinacia
-- r.project_id AS associatedSequences, -- Example of mapping project ID to associated sequences; adjust based on actual data mapping
-- VK: Co by mělo být zde? Jedná se o id z tabulky atlas.record_validation_status
-- MT: tu neviem ake su moznosti - v priklade co ste mi poslal mali vsetky hodnotu 3. viete mi o tom povedat viac?
r.validation_status AS identificationVerificationStatus,
r.original_name AS verbatimScientificName,
r.gps_coords_precision AS coordinatePrecision
-- VK: možné hodnoty Y, M nebo D, které určují, jestli je z pole datum platný pouze rok nebo i měsíc nebo celé datum.
-- I see takze r.datum je vzdy ISO date ale podla tohoto stlpca treba ignorovat mesiac/den ?
-- r.datum_precision AS verbatimEventDate,
-- VK: duplicitní s r.locality
-- r.locality AS locality,
-- VK: jedná se o integer id z tabulky fytochorionů (geodata.phytochorions)
-- MT: Kedze sa nechystame publikovat aj tabulku fytochorionov, tak mozeme tento stlpec zhodit tiez
-- r.phytochorion_id AS locationID -- Example of using phytochorion_id as a location ID
FROM
    atlas.records AS r
        LEFT JOIN public.taxons AS t ON r.taxon_id = t.id
        LEFT JOIN atlas.projects AS p ON r.project_id = p.id
WHERE
-- VK: U tohoto příznaku je to komplikovanější, protože v určitých případech chtěli v českém Pladiasu, aby bylo možné vložit do mapy i odmítnuté záznamy. Možná by bylo vhodnější vybírat podle validation status.
r.include_in_map = TRUE
LIMIT 1000;
MichalTorma commented 8 months ago

Dakujem za promptnu odpoved. myslim ze nam ostava len zopar detailov tak ich tu rozpisem:

Jules- commented 8 months ago
MichalTorma commented 8 months ago

Tak som to precistil a implementoval komentare. vyzera to rozumne? Trebalo by skusit ktore je rychleisie na celom datasete. subquerry:

SELECT
    CONCAT('BU-SAV:PLADIAS:', r.id) AS occurrenceID,
    t.name_lat AS scientificName,
    r.latitude AS decimalLatitude,
    r.longitude AS decimalLongitude,
    CASE
        WHEN r.datum_precision = 'Y' THEN TO_CHAR(r.datum, 'YYYY')
        WHEN r.datum_precision = 'M' THEN TO_CHAR(r.datum, 'YYYY-MM')
        WHEN r.datum_precision = 'D' THEN TO_CHAR(r.datum, 'YYYY-MM-DD')
        ELSE NULL
    END AS eventDate,
    r.locality,
    r.altitude_min AS minimumElevationInMeters,
    r.altitude_max AS maximumElevationInMeters,
    r.comment AS occurrenceRemarks,
    (
        SELECT string_agg(CASE WHEN a.name = '' THEN a.surname ELSE CONCAT(a.name, ' ', a.surname) END, '|')
        FROM atlas.records_authors
        INNER JOIN atlas.authors AS a ON records_authors.authors_id = a.id
        WHERE records_id = r.id) AS recordedBy,
    r.source,
    r.environment,
    CASE
        WHEN r.validation_status IN (0, 1) THEN 'verification required'
        WHEN r.validation_status = 3 THEN 'verified'
        ELSE NULL
    END AS identificationVerificationStatus
    r.original_name AS verbatimScientificName,
    r.gps_coords_precision AS coordinatePrecision
FROM
    atlas.records AS r
        LEFT JOIN public.taxons AS t ON r.taxon_id = t.id
        LEFT JOIN atlas.projects AS p ON r.project_id = p.id
WHERE
    r.validation_status != 2

alebo aggregate

SELECT
    CONCAT('BU-SAV:PLADIAS:', r.id) AS occurrenceID,
    t.name_lat AS scientificName,
    r.latitude AS decimalLatitude,
    r.longitude AS decimalLongitude,
    CASE
        WHEN r.datum_precision = 'Y' THEN TO_CHAR(r.datum, 'YYYY')
        WHEN r.datum_precision = 'M' THEN TO_CHAR(r.datum, 'YYYY-MM')
        WHEN r.datum_precision = 'D' THEN TO_CHAR(r.datum, 'YYYY-MM-DD')
        ELSE NULL
    END AS eventDate,
    r.locality,
    r.altitude_min AS minimumElevationInMeters,
    r.altitude_max AS maximumElevationInMeters,
    r.comment AS occurrenceRemarks,
    string_agg(CONCAT(ra.name, ' ', ra.surname), '|') AS recordedBy
    r.source,
    r.environment,
    CASE
        WHEN r.validation_status IN (0, 1) THEN 'verification required'
        WHEN r.validation_status = 3 THEN 'verified'
        ELSE NULL
    END AS identificationVerificationStatus
    r.original_name AS verbatimScientificName,
    r.gps_coords_precision AS coordinatePrecision
FROM
    atlas.records AS r
        LEFT JOIN public.taxons AS t ON r.taxon_id = t.id
        LEFT JOIN atlas.projects AS p ON r.project_id = p.id
WHERE
    r.validation_status != 2
GROUP BY r.id
MichalTorma commented 8 months ago

Este som si vsimol, r.gps_coords_precision je v metroch alebo v stupnoch?

Jules- commented 8 months ago

Ta r.gps_coords_precision je v metrech.

Upravil jsem SQL, aby se daly spustit. Otestoval jsem export na 14 mil. záznamech z českého Pladiasu. Dopadlo to lépe pro subquery (3:26) oproti group by(12:20). Možná je problém, že group by seskupuje podle všech sloupců včetně double latitude a longitude. Pro úplnost jsem ještě vymyslel jeden přístup přes join(4:19). Ten je o něco horší než subquery.

subquery

SELECT
    CONCAT('BU-SAV:PLADIAS:', r.id) AS occurrenceID,
    t.name_lat AS scientificName,
    r.latitude AS decimalLatitude,
    r.longitude AS decimalLongitude,
    CASE
        WHEN r.datum_precision = 'Y' THEN TO_CHAR(r.datum, 'YYYY')
        WHEN r.datum_precision = 'M' THEN TO_CHAR(r.datum, 'YYYY-MM')
        WHEN r.datum_precision = 'D' THEN TO_CHAR(r.datum, 'YYYY-MM-DD')
        ELSE NULL
        END AS eventDate,
    r.locality,
    r.altitude_min AS minimumElevationInMeters,
    r.altitude_max AS maximumElevationInMeters,
    r.comment AS occurrenceRemarks,
    (
        SELECT string_agg(CASE WHEN a.name = '' THEN a.surname ELSE CONCAT(a.name, ' ', a.surname) END, '|')
        FROM atlas.records_authors
                 INNER JOIN atlas.authors AS a ON records_authors.authors_id = a.id
        WHERE records_id = r.id) AS recordedBy,
    r.source,
    r.environment,
    CASE
        WHEN r.validation_status IN (0, 1) THEN 'verification required'
        WHEN r.validation_status = 3 THEN 'verified'
        ELSE NULL
        END AS identificationVerificationStatus,
    r.original_name AS verbatimScientificName,
    r.gps_coords_precision AS coordinatePrecision
FROM
    atlas.records AS r
        LEFT JOIN public.taxons AS t ON r.taxon_id = t.id
        LEFT JOIN atlas.projects AS p ON r.project_id = p.id
WHERE
    r.validation_status != 2

group by

SELECT
    CONCAT('BU-SAV:PLADIAS:', r.id) AS occurrenceID,
    t.name_lat AS scientificName,
    r.latitude AS decimalLatitude,
    r.longitude AS decimalLongitude,
    CASE
        WHEN r.datum_precision = 'Y' THEN TO_CHAR(r.datum, 'YYYY')
        WHEN r.datum_precision = 'M' THEN TO_CHAR(r.datum, 'YYYY-MM')
        WHEN r.datum_precision = 'D' THEN TO_CHAR(r.datum, 'YYYY-MM-DD')
        ELSE NULL
        END AS eventDate,
    r.locality,
    r.altitude_min AS minimumElevationInMeters,
    r.altitude_max AS maximumElevationInMeters,
    r.comment AS occurrenceRemarks,
    string_agg(CASE WHEN a.name = '' THEN a.surname ELSE CONCAT(a.name, ' ', a.surname) END, '|') AS recordedBy,
    r.source,
    r.environment,
    CASE
        WHEN r.validation_status IN (0, 1) THEN 'verification required'
        WHEN r.validation_status = 3 THEN 'verified'
        ELSE NULL
        END AS identificationVerificationStatus,
    r.original_name AS verbatimScientificName,
    r.gps_coords_precision AS coordinatePrecision
FROM
    atlas.records AS r
        LEFT JOIN public.taxons AS t ON r.taxon_id = t.id
        LEFT JOIN atlas.projects AS p ON r.project_id = p.id
        LEFT JOIN atlas.records_authors AS ra ON ra.records_id = r.id
        LEFT JOIN atlas.authors AS a ON ra.authors_id = a.id
WHERE
    r.validation_status != 2
GROUP BY r.id, t.name_lat, r.latitude, r.longitude, r.datum_precision, r.locality, r.altitude_min,
         r.altitude_max, r.comment, r.source, r.environment, r.validation_status,
         r.original_name, r.gps_coords_precision;

join

SELECT
    CONCAT('BU-SAV:PLADIAS:', r.id) AS occurrenceID,
    t.name_lat AS scientificName,
    r.latitude AS decimalLatitude,
    r.longitude AS decimalLongitude,
    CASE
        WHEN r.datum_precision = 'Y' THEN TO_CHAR(r.datum, 'YYYY')
        WHEN r.datum_precision = 'M' THEN TO_CHAR(r.datum, 'YYYY-MM')
        WHEN r.datum_precision = 'D' THEN TO_CHAR(r.datum, 'YYYY-MM-DD')
        ELSE NULL
        END AS eventDate,
    r.locality,
    r.altitude_min AS minimumElevationInMeters,
    r.altitude_max AS maximumElevationInMeters,
    r.comment AS occurrenceRemarks,
    a.names AS recordedBy,
    r.source,
    r.environment,
    CASE
        WHEN r.validation_status IN (0, 1) THEN 'verification required'
        WHEN r.validation_status = 3 THEN 'verified'
        ELSE NULL
        END AS identificationVerificationStatus,
    r.original_name AS verbatimScientificName,
    r.gps_coords_precision AS coordinatePrecision
FROM
    atlas.records AS r
        LEFT JOIN public.taxons AS t ON r.taxon_id = t.id
        LEFT JOIN atlas.projects AS p ON r.project_id = p.id
        LEFT JOIN (
            SELECT records_id, string_agg(CASE WHEN a.name = '' THEN a.surname ELSE CONCAT(a.name, ' ', a.surname) END, '|') AS names
            FROM atlas.records_authors
                INNER JOIN atlas.authors AS a ON records_authors.authors_id = a.id
            GROUP BY records_id) AS a ON r.id = a.records_id
WHERE
    r.validation_status != 2
;