JohnSnowLabs / spark-nlp

State of the Art Natural Language Processing
https://sparknlp.org/
Apache License 2.0

Lemmatization performance on Universal Dependency Treebanks #7163

Closed abdullah-alnahas closed 2 years ago

abdullah-alnahas commented 2 years ago

I am comparing the performance of the most popular lemmatization tools. I have found benchmark results for Stanza, Trankit, and spaCy on Universal Dependencies version 2.5. However, I couldn't find anything related to Spark NLP. Could you please point me to it if such a benchmark has already been done?

I tried to run the benchmark myself and got an aligned accuracy of ~78% (the code and results are attached below). Questions:

  1. Am I using the best available models?
  2. What do you think about the correctness of the evaluation of the lemmatization performance?

Appreciate your input.
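
For context, the "aligned accuracy" used below comes from the CoNLL 2018 shared-task script, which first aligns system tokens to gold tokens and then scores lemmas over the aligned pairs only. A toy sketch of the idea follows (the official conll18_ud_eval aligns by character spans; this is just an illustration):

# Toy illustration of aligned accuracy (NOT the official conll18_ud_eval code):
# score lemma matches only over gold/system token pairs whose surface forms line up.
gold = [("cats", "cat"), ("ran", "run"), ("fast", "fast")]    # (form, lemma) pairs
system = [("cats", "cat"), ("ran", "ran"), ("fast", "fast")]  # system output
aligned = [(g, s) for g, s in zip(gold, system) if g[0] == s[0]]  # align by form
correct = sum(1 for g, s in aligned if g[1] == s[1])  # lemma matches among aligned
print(correct / len(aligned))  # 2/3 here: "ran" is lemmatized incorrectly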

import os
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
import sparknlp

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import pyconll
from conll18_ud_eval import load_conllu, evaluate
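# conll18_ud_eval is the official evaluation script of the CoNLL 2018 UD shared task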
###
spark = sparknlp.start(spark32=True)
###
def test_lang(lang_code, golden_test_file, model_tag="lemma"):
    data = pyconll.load_from_file(golden_test_file)
    df = spark.createDataFrame([[sentence.id, sentence.text] for sentence in data]).toDF("id", "text")
    if "lemma" in model_tag:
        document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document").setCleanupMode("shrink")
        sentence = SentenceDetector().setInputCols("document").setOutputCol("sentence")
        tokenizer = Tokenizer().setInputCols("sentence").setOutputCol("token")
        lemmatizer = LemmatizerModel.pretrained(model_tag, lang_code).setInputCols(["token"]).setOutputCol("lemma")
        pipeline = Pipeline().setStages([document_assembler, sentence, tokenizer, lemmatizer])
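        # all stages are pretrained, so fitting on a dummy DataFrame just materializes the PipelineModel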
        pipeline_model = pipeline.fit(spark.createDataFrame([["id", "dummy test"]]).toDF("id", "text"))
        lemmatized_df = pipeline_model.transform(df)
    elif "wordseg" in model_tag:
        document_assembler = DocumentAssembler() \
            .setInputCol("text") \
            .setOutputCol("document")

        word_segmenter = WordSegmenterModel.pretrained(model_tag, lang_code)\
                .setInputCols("document")\
                .setOutputCol("token")

        lemmatizer = LemmatizerModel.pretrained("lemma", lang_code) \
                .setInputCols(["token"]) \
                .setOutputCol("lemma")

        pipeline = Pipeline(stages=[document_assembler, word_segmenter, lemmatizer])
        pipeline_model = pipeline.fit(spark.createDataFrame([["id", "dummy test"]]).toDF("id", "text"))
        lemmatized_df = pipeline_model.transform(df)
    else:
        #finisher = Finisher().setInputCols(["token", "lemma"])
        explain_pipeline_model = PretrainedPipeline(model_tag, lang=lang_code).model
        pipeline = Pipeline() \
            .setStages([
                explain_pipeline_model,
                #finisher
                ])
        pipeline_model = pipeline.fit(spark.createDataFrame([["id", "dummy test"]]).toDF("id", "text"))
        lemmatized_df = pipeline_model.transform(df)

    lemmas = lemmatized_df.rdd.map(lambda x: (x.token, x.lemma)).collect()
    ##
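    # Write a minimal ten-column CoNLL-U file: only ID, FORM, and LEMMA carry real
    # values; UPOS, HEAD, etc. get placeholders so conll18_ud_eval can still parse it.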
    with open(f"sys_test_{lang_code}_sparknlp.conllu", "w") as f:
        for (tokenized_sen, lemmatized_sen), orig_sen in zip(lemmas, data):
            f.write(f"# sent_id = {orig_sen.id}\n")
            f.write(f"# text = {orig_sen.text}\n")
            for i, (token, lemma) in enumerate(zip(tokenized_sen, lemmatized_sen)):
                f.write(f"{i+1}\t{token.result}\t{lemma.result}\tNOUN\t_\t_\t{i}\t_\t_\t_\n")
            f.write("\n")
###
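# Treebank -> pretrained-model mapping; commented-out treebanks are skipped,
# apparently because no suitable pretrained Spark NLP model was available.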
corpora_and_model_names = [("UD_Afrikaans-AfriBooms", "lemma"),
("UD_Ancient_Greek-PROIEL", "lemma_spacylookup"),
("UD_Ancient_Greek-Perseus", "lemma_spacylookup"),
("UD_Arabic-PADT", "lemma"),
("UD_Armenian-ArmTDP", "lemma"),
("UD_Basque-BDT", "lemma"),
#("UD_Belarusian-HSE", ""),
("UD_Bulgarian-BTB", "lemma"),
("UD_Catalan-AnCora", "lemma"),
("UD_Chinese-GSD", "lemma"),
("UD_Classical_Chinese-Kyoto", "lemma"),
("UD_Croatian-SET", "lemma"),
("UD_Czech-CAC", "lemma"),
("UD_Czech-CLTT", "lemma"),
("UD_Czech-FicTree", "lemma"),
("UD_Czech-PDT", "lemma"),
("UD_Danish-DDT", "explain_document_md"),
("UD_Dutch-Alpino", "explain_document_md"),
("UD_Dutch-LassySmall", "explain_document_md"),
("UD_English-EWT", "explain_document_dl"),
("UD_English-GUM", "explain_document_dl"),
("UD_English-LinES", "explain_document_dl"),
("UD_English-ParTUT", "explain_document_dl"),
("UD_Estonian-EDT", "lemma"),
("UD_Estonian-EWT", "lemma"),
("UD_Finnish-FTB", "explain_document_lg"),
("UD_Finnish-TDT", "explain_document_lg"),
("UD_French-GSD", "explain_document_md"),
("UD_French-ParTUT", "explain_document_md"),
("UD_French-Sequoia", "explain_document_md"),
("UD_French-Spoken", "explain_document_md"),
("UD_Galician-CTG", "lemma"),
("UD_Galician-TreeGal", "lemma"),
("UD_German-GSD", "explain_document_md"),
("UD_German-HDT", "explain_document_md"),
("UD_Greek-GDT", "lemma"),
("UD_Hebrew-HTB", "lemma"),
("UD_Hindi-HDTB", "lemma"),
("UD_Hungarian-Szeged", "lemma"),
("UD_Indonesian-GSD", "lemma"),
("UD_Irish-IDT", "lemma"),
("UD_Italian-ISDT", "explain_document_md"),
("UD_Italian-ParTUT", "explain_document_md"),
("UD_Italian-PoSTWITA", "explain_document_md"),
("UD_Italian-TWITTIRO", "explain_document_md"),
("UD_Italian-VIT", "explain_document_md"),
("UD_Japanese-GSD", "wordseg_gsd_ud"),
#("UD_Kazakh-KTB", ""),
("UD_Korean-GSD", "wordseg_kaist_ud"),
("UD_Korean-Kaist", "wordseg_kaist_ud"),
#("UD_Kurmanji-MG", ""),
("UD_Latin-ITTB", "lemma"),
("UD_Latin-PROIEL", "lemma"),
("UD_Latin-Perseus", "lemma"),
("UD_Latvian-LVTB", "lemma"),
("UD_Lithuanian-ALKSNIS", "lemma_spacylookup"),
("UD_Lithuanian-HSE", "lemma_spacylookup"),
("UD_Marathi-UFAL", "lemma"),
("UD_Norwegian-Bokmaal", "explain_document_md"),
("UD_Norwegian-Nynorsk", "entity_recognizer_md"),
("UD_Norwegian-NynorskLIA", "entity_recognizer_md"),
#("UD_Old_French-SRCMF", ""),
#("UD_Old_Russian-TOROT", ""),
("UD_Persian-Seraji", "lemma"),
("UD_Polish-LFG", "explain_document_md"),
("UD_Polish-PDB", "explain_document_md"),
("UD_Portuguese-Bosque", "explain_document_md"),
("UD_Portuguese-GSD", "explain_document_md"),
("UD_Romanian-Nonstandard", "lemma"),
("UD_Romanian-RRT", "lemma"),
("UD_Russian-GSD", "explain_document_md"),
("UD_Russian-SynTagRus", "explain_document_md"),
("UD_Russian-Taiga", "explain_document_md"),
#("UD_Scottish_Gaelic-ARCOSG", ""),
("UD_Serbian-SET", "lemma_spacylookup"),
#("UD_Simplified_Chinese-GSDSimp", "lemma"),
("UD_Slovak-SNK", "lemma"),
("UD_Slovenian-SSJ", "lemma"),
("UD_Slovenian-SST", "lemma"),
("UD_Spanish-AnCora", "explain_document_md"),
("UD_Spanish-GSD", "explain_document_md"),
("UD_Swedish-LinES", "explain_document_md"),
("UD_Swedish-Talbanken", "explain_document_md"),
("UD_Tamil-TTB", "lemma"),
#("UD_Telugu-MTG", ""),
("UD_Turkish-IMST", "lemma"),
("UD_Ukrainian-IU", "lemma"),
("UD_Urdu-UDTB", "lemma"),
#("UD_Uyghur-UDT", ""),
("UD_Vietnamese-VTB", "lemma"),]
###
results = []
failed_tests = []
base_dir = "Universal Dependencies 2.5/ud-treebanks-v2.5"
for corpus_name, model_tag in tqdm(corpora_and_model_names):
    corpus_dir = os.path.join(base_dir, corpus_name)
    for fname in os.listdir(corpus_dir):
        if fname.endswith("test.conllu"):
            lang_code = fname.split("_")[0]
            test_corpus_path = os.path.join(corpus_dir, fname)
            break
    print(f"Current corpus={corpus_name}, language_code={lang_code}")
    try:
        test_lang(lang_code, test_corpus_path, model_tag)
        sys_corpus_name = f"sys_test_{lang_code}_sparknlp.conllu"
        with open(test_corpus_path) as f:
            gold_ud = load_conllu(f)
        with open(sys_corpus_name) as f:
            system_ud = load_conllu(f)
        res = evaluate(gold_ud, system_ud)["Lemmas"]
        results.append((corpus_name, lang_code, res.precision, res.recall, res.f1, res.aligned_accuracy))
        print(results[-1])
        print("Finished successfully 😏")
    except Exception as e:
        print("Failed testing 😭")
        print(e)
        failed_tests.append((corpus_name, lang_code))
    finally:
        print("-"*60)
| # | UD Corpus Title | Language Code | Spark-NLP Model | Precision | Recall | F1 | Aligned Accuracy |
|---|---|---|---|---|---|---|---|
| 0 | UD_Afrikaans-AfriBooms | af | lemma | 0.902744 | 0.91853 | 0.910568 | 0.937152 |
| 1 | UD_Ancient_Greek-PROIEL | grc | lemma_spacylookup | 0.891693 | 0.891693 | 0.891693 | 0.891693 |
| 2 | UD_Ancient_Greek-Perseus | grc | lemma_spacylookup | 0.733116 | 0.71783 | 0.725392 | 0.749738 |
| 3 | UD_Arabic-PADT | ar | lemma | 0.581066 | 0.476012 | 0.523319 | 0.727243 |
| 4 | UD_Armenian-ArmTDP | hy | lemma | 0.595522 | 0.55091 | 0.572348 | 0.697891 |
| 5 | UD_Basque-BDT | eu | lemma | 0.791485 | 0.788627 | 0.790053 | 0.797329 |
| 6 | UD_Bulgarian-BTB | bg | lemma | 0.67109 | 0.672157 | 0.671623 | 0.683503 |
| 7 | UD_Catalan-AnCora | ca | lemma | 0.758982 | 0.721322 | 0.739673 | 0.802467 |
| 8 | UD_Chinese-GSD | zh | lemma | 0.0132325 | 0.000582751 | 0.00111634 | 1 |
| 9 | UD_Czech-CAC | cs | lemma | 0.733771 | 0.730528 | 0.732146 | 0.734586 |
| 10 | UD_Czech-CLTT | cs | lemma | 0.688137 | 0.72175 | 0.704543 | 0.755918 |
| 11 | UD_Czech-FicTree | cs | lemma | 0.746694 | 0.733553 | 0.740065 | 0.758339 |
| 12 | UD_Czech-PDT | cs | lemma | 0.757395 | 0.746024 | 0.751666 | 0.764454 |
| 13 | UD_Danish-DDT | da | explain_document_md | 0.711909 | 0.614886 | 0.65985 | 0.83976 |
| 14 | UD_Dutch-Alpino | nl | explain_document_md | 0.787347 | 0.710936 | 0.747193 | 0.878805 |
| 15 | UD_Dutch-LassySmall | nl | explain_document_md | 0.748243 | 0.654292 | 0.698121 | 0.85784 |
| 16 | UD_English-EWT | en | explain_document_dl | 0.846399 | 0.824713 | 0.835415 | 0.883129 |
| 17 | UD_English-GUM | en | explain_document_dl | 0.879155 | 0.865298 | 0.872171 | 0.897596 |
| 18 | UD_English-LinES | en | explain_document_dl | 0.886227 | 0.877001 | 0.88159 | 0.903164 |
| 19 | UD_English-ParTUT | en | explain_document_dl | 0.844113 | 0.830986 | 0.837498 | 0.853526 |
| 20 | UD_Estonian-EDT | et | lemma | 0.904108 | 0.903735 | 0.903921 | 0.918683 |
| 21 | UD_Estonian-EWT | et | lemma | 0.775 | 0.774385 | 0.774693 | 0.792935 |
| 22 | UD_Finnish-FTB | fi | explain_document_lg | 0.677279 | 0.676655 | 0.676967 | 0.678948 |
| 23 | UD_Finnish-TDT | fi | explain_document_lg | 0.590387 | 0.507167 | 0.545622 | 0.706746 |
| 24 | UD_French-GSD | fr | explain_document_md | 0.635399 | 0.519912 | 0.571883 | 0.79954 |
| 25 | UD_French-ParTUT | fr | explain_document_md | 0.625686 | 0.525932 | 0.571488 | 0.758029 |
| 26 | UD_French-Sequoia | fr | explain_document_md | 0.631503 | 0.519506 | 0.570056 | 0.791509 |
| 27 | UD_French-Spoken | fr | explain_document_md | 0.785086 | 0.774753 | 0.779886 | 0.796188 |
| 28 | UD_Galician-CTG | gl | lemma | 0.668232 | 0.602731 | 0.633794 | 0.752414 |
| 29 | UD_Galician-TreeGal | gl | lemma | 0.729266 | 0.66782 | 0.697192 | 0.807582 |
| 30 | UD_German-GSD | de | explain_document_md | 0.668825 | 0.563583 | 0.611711 | 0.806768 |
| 31 | UD_German-HDT | de | explain_document_md | 0.776414 | 0.777704 | 0.777059 | 0.778939 |
| 32 | UD_Greek-GDT | el | lemma | 0.809812 | 0.790386 | 0.799981 | 0.843838 |
| 33 | UD_Hebrew-HTB | he | lemma | 0.629953 | 0.434875 | 0.514544 | 0.964608 |
| 34 | UD_Hindi-HDTB | hi | lemma | 0.783361 | 0.787186 | 0.785269 | 0.790779 |
| 35 | UD_Hungarian-Szeged | hu | lemma | 0.765082 | 0.767132 | 0.766106 | 0.775745 |
| 36 | UD_Indonesian-GSD | id | lemma | 0.899462 | 0.894652 | 0.897051 | 0.906971 |
| 37 | UD_Irish-IDT | ga | lemma | 0.80888 | 0.808641 | 0.80876 | 0.830345 |
| 38 | UD_Italian-ISDT | it | explain_document_md | 0.6046 | 0.479409 | 0.534775 | 0.782146 |
| 39 | UD_Italian-ParTUT | it | explain_document_md | 0.621786 | 0.504945 | 0.557307 | 0.782128 |
| 40 | UD_Italian-PoSTWITA | it | explain_document_md | 0.605047 | 0.529918 | 0.564996 | 0.761716 |
| 41 | UD_Italian-TWITTIRO | it | explain_document_md | 0.5921 | 0.493616 | 0.538391 | 0.758781 |
| 42 | UD_Italian-VIT | it | explain_document_md | 0.619659 | 0.492226 | 0.54864 | 0.804064 |
| 43 | UD_Latin-ITTB | la | lemma | 0.647334 | 0.648918 | 0.648125 | 0.650706 |
| 44 | UD_Latin-PROIEL | la | lemma | 0.545493 | 0.545455 | 0.545474 | 0.546113 |
| 45 | UD_Latin-Perseus | la | lemma | 0.504245 | 0.504199 | 0.504222 | 0.504291 |
| 46 | UD_Latvian-LVTB | lv | lemma | 0.709491 | 0.702341 | 0.705898 | 0.725001 |
| 47 | UD_Lithuanian-ALKSNIS | lt | lemma_spacylookup | 0.693599 | 0.689379 | 0.691482 | 0.706444 |
| 48 | UD_Lithuanian-HSE | lt | lemma_spacylookup | 0.665678 | 0.634906 | 0.649928 | 0.695967 |
| 49 | UD_Marathi-UFAL | mr | lemma | 0.663014 | 0.587379 | 0.622909 | 0.761006 |
| 50 | UD_Norwegian-Bokmaal | no | explain_document_md | 0.815898 | 0.816225 | 0.816062 | 0.816389 |
| 51 | UD_Persian-Seraji | fa | lemma | 0.769021 | 0.738018 | 0.7532 | 0.80198 |
| 52 | UD_Polish-LFG | pl | explain_document_md | 0.628635 | 0.514338 | 0.565772 | 0.801998 |
| 53 | UD_Polish-PDB | pl | explain_document_md | 0.624492 | 0.525329 | 0.570635 | 0.75794 |
| 54 | UD_Portuguese-Bosque | pt | explain_document_md | 0.614403 | 0.49098 | 0.545801 | 0.80424 |
| 55 | UD_Portuguese-GSD | pt | explain_document_md | 0.721837 | 0.588773 | 0.64855 | 0.919157 |
| 56 | UD_Romanian-Nonstandard | ro | lemma | 0.580626 | 0.567235 | 0.573853 | 0.60645 |
| 57 | UD_Romanian-RRT | ro | lemma | 0.780046 | 0.767765 | 0.773857 | 0.800166 |
| 58 | UD_Russian-GSD | ru | explain_document_md | 0.603459 | 0.511814 | 0.553871 | 0.738998 |
| 59 | UD_Russian-SynTagRus | ru | explain_document_md | 0.659679 | 0.544452 | 0.596552 | 0.811278 |
| 60 | UD_Russian-Taiga | ru | explain_document_md | 0.596722 | 0.488481 | 0.537203 | 0.768819 |
| 61 | UD_Serbian-SET | sr | lemma_spacylookup | 0.449162 | 0.448297 | 0.448729 | 0.461552 |
| 62 | UD_Slovak-SNK | sk | lemma | 0.745795 | 0.742017 | 0.743902 | 0.749264 |
| 63 | UD_Slovenian-SSJ | sl | lemma | 0.773393 | 0.768559 | 0.770968 | 0.779973 |
| 64 | UD_Slovenian-SST | sl | lemma | 0.826261 | 0.826261 | 0.826261 | 0.826261 |
| 65 | UD_Spanish-AnCora | es | explain_document_md | 0.673841 | 0.594979 | 0.631959 | 0.767172 |
| 66 | UD_Spanish-GSD | es | explain_document_md | 0.584942 | 0.51275 | 0.546472 | 0.675115 |
| 67 | UD_Swedish-LinES | sv | explain_document_md | 0.626652 | 0.560904 | 0.591958 | 0.713313 |
| 68 | UD_Swedish-Talbanken | sv | explain_document_md | 0.692869 | 0.629386 | 0.659603 | 0.776332 |
| 69 | UD_Tamil-TTB | ta | lemma | 0.632361 | 0.577677 | 0.603783 | 0.738907 |
| 70 | UD_Turkish-IMST | tr | lemma | 0.799129 | 0.786619 | 0.792824 | 0.834109 |
| 71 | UD_Ukrainian-IU | uk | lemma | 0.792608 | 0.764314 | 0.778204 | 0.81584 |
| 72 | UD_Urdu-UDTB | ur | lemma | 0.526349 | 0.504593 | 0.515241 | 0.550066 |
| 73 | UD_Vietnamese-VTB | vi | lemma | 0.653358 | 0.788457 | 0.714578 | 1 |
maziyarpanahi commented 2 years ago

Hi @abdullah-alnahas

Thanks for sharing your code and results here. One very important point: the Tokenizer should be adjusted to produce tokens similar to the original datasets (the gold standard). Alternatively, bypass the Tokenizer entirely and use the tokens from the test dataset; that way you can align token by token, which gives you a lemma-by-lemma comparison (see the sketch below).

I am going to look into the alignments and keep this thread up to date. Once we complete this, I will include it in the documentation for future reference, so thanks again for your contribution.
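
A minimal sketch of the token-bypass idea, assuming gold tokens loaded with pyconll and Spark NLP's RegexTokenizer splitting on whitespace. This is an illustration, not an official recipe: the "xx" language code and the file path are placeholders, and gold tokens that themselves contain spaces would need extra handling.

import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import RegexTokenizer, LemmatizerModel
from pyspark.ml import Pipeline
import pyconll

spark = sparknlp.start()
data = pyconll.load_from_file("xx_test.conllu")  # hypothetical gold test file

# Rebuild each sentence from its gold tokens, separated by single spaces, so a
# whitespace tokenizer reproduces the gold tokenization one-to-one.
rows = [[" ".join(tok.form for tok in sentence)] for sentence in data]
df = spark.createDataFrame(rows).toDF("text")

document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
# Split strictly on whitespace instead of using the default Tokenizer rules.
gold_tokenizer = RegexTokenizer().setInputCols(["document"]).setOutputCol("token").setPattern("\\s+")
lemmatizer = LemmatizerModel.pretrained("lemma", "xx").setInputCols(["token"]).setOutputCol("lemma")

pipeline = Pipeline(stages=[document_assembler, gold_tokenizer, lemmatizer])
result = pipeline.fit(df).transform(df)

With the gold tokenization fixed like this, every system lemma lines up with exactly one gold lemma, so the aligned accuracy reduces to plain per-token accuracy and measures the lemmatizer alone rather than the tokenizer.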