Switch to a CLIR dataset with ir_datasets

cash commented 2 years ago

Right now we are using a small English dataset. Once the document-count bug in ir_datasets is fixed (https://github.com/allenai/ir_datasets/issues/123), we can switch the sample config to CLIRMatrix with a PSQ translation table.

Refs #6

eugene-yang commented 2 years ago

I don't think patapsco explicitly relies on ir_datasets to provide the document count; we can run everything without any exceptions. The current issue is that the collection is too large.

eugene-yang commented 2 years ago

I tested the following config and it works, but running from indexing through evaluation takes about 40 minutes on Google Colab. This might be a bit too long for a demo.

# Run configuration that pulls documents, topics and relevance judgments
# from the same ir_datasets ("irds") CLIRMatrix dev split and retrieves with
# BM25 using PSQ.
config_irds = {
    "run": {"name": "irds test"},
    # Document side: "zho" input, tokenized with jieba.
    "documents": {
        "input": {
            "format": "irds",
            "lang": "zho",
            "encoding": "utf8",
            "path": "clirmatrix/zh/bi139-base/en/dev",
        },
        "process": {
            "normalize": {"lowercase": True},
            "tokenize": "jieba",
            "strict_check": True,
            "stopwords": "lucene",
        },
        "comment": "CLIRMatrix via ir-datasets zho/eng dev set",
    },
    "database": {"name": "sqlite"},
    "index": {"name": "lucene"},
    # Topic side: "eng" input from the same dataset path, title field only.
    "topics": {
        "input": {
            "format": "irds",
            "lang": "eng",
            "source": "original",
            "encoding": "utf8",
            "path": "clirmatrix/zh/bi139-base/en/dev",
        },
        "fields": "title",
    },
    # Query text preprocessing for PSQ
    "queries": {
        "output": "processed_queries",
        "parse": False,
        "process": {
            "normalize": {"lowercase": True, "report": False},
            "stem": False,
            "stopwords": "lucene",
            "strict_check": False,
            "tokenize": "moses",
        },
        # PSQ settings; "path" names the translation-table file.
        "psq": {
            "lang": "eng",
            "normalize": {"lowercase": True, "report": False},
            "path": "zho_eng_clean_reduced_pdt.dict",
            "stem": False,
            "stopwords": "lucene",
            "threshold": 0.97,
        },
    },
    "retrieve": {
        "name": "bm25",
        "number": 1000,
        "psq": True,  # Use PSQ
    },
    # Scoring reads its input from the same irds dataset path.
    "score": {
        "input": {
            "format": "irds",
            "path": "clirmatrix/zh/bi139-base/en/dev",
        },
    },
}

hltcoe / patapsco

Switch to a CLIR dataset with ir_datasets #11