cldf / cookbook

Recipes for cooking with CLDF data
https://cldf.clld.org
Apache License 2.0
17 stars 7 forks source link

recipe for extracting word/gloss pairs in SQLite #23

Open xrotwang opened 8 months ago

xrotwang commented 8 months ago
SELECT word.cid, word.word, gloss.word
FROM (
    SELECT row_number() over (order by cid) as rn, word, cid
    FROM (
        WITH split(word, clause, cid) AS (
            SELECT '', f.cldf_analyzedWord || X'09', f.cldf_id
            FROM ExampleTable AS f
            UNION ALL
            SELECT
                substr(clause, 0, instr(clause, X'09')),
                substr(clause, instr(clause, X'09') + 1),
                cid
            FROM split WHERE clause != ''
        )
        SELECT word, cid FROM split where word != '')
) AS word,
(
    SELECT row_number() over (order by cid) as rn, word, cid
    FROM (
        WITH split(word, clause, cid) AS (
            SELECT '', f.cldf_gloss || X'09', f.cldf_id
            FROM ExampleTable AS f
            UNION ALL
            SELECT
                substr(clause, 0, instr(clause, X'09')),
                substr(clause, instr(clause, X'09') + 1),
                cid
            FROM split WHERE clause != ''
        )
        SELECT word, cid FROM split where word != '')
) AS gloss
WHERE word.rn = gloss.rn AND word.cid = gloss.cid
ORDER BY word.cid, word.rn;
xrotwang commented 8 months ago

Note: Since this recipe relies on correct word/gloss alignments for all examples, the relevant subset of examples may be filtered using the lgrConformance column.