Thanks for sharing your codes!
I'm trying to test my tokenizers with your codes.
In tokenizer_exploration_utils.py, it seems like you're calculating the same fertility twice in plot_fertility(). And I can find the same part inplot_proportion_continuation(), plot_proportion_unks().
In plot_fertility(), line 332-340 and 342-353 look like they're doing the same calculation as getting language, fertility, and model in a dataframe.
Also, the function only returns the second dataframe. Is there any difference between the two parts?
Thanks :)
def plot_fertility(language_ud_dicts):
sns.set(style="whitegrid")
width = 512.14963
sns.set(
rc={
"axes.spines.bottom": True,
"axes.spines.left": True,
"axes.spines.right": False,
"axes.spines.top": False,
"font.size": 12,
"axes.labelsize": 12,
"axes.grid": False,
"legend.fontsize": 10,
"ytick.left": True,
"xtick.major.size": 8,
"ytick.major.size": 8,
"pgf.texsystem": "lualatex",
"text.latex.preamble": r"\usepackage{xcolor}",
"text.usetex": True,
},
style="whitegrid",
)
colors = ["indianred", "skyblue", "dodgerblue", "royalblue", "navy"]
sns.set_palette(sns.color_palette(colors))
sns.set_context("notebook") # use notebook or talk
titles = ["Mono", "mBERT"]
for i, language_ud_dict in enumerate(language_ud_dicts):
languages = []
values = []
for k, v in language_ud_dict.items():
languages.append(r"\textsc{%s}" % k)
values.append(np.mean(v["split_lengths"]))
d = {"languages": languages, "fertility": values}
df = pd.DataFrame(data=d).sort_values(ascending=True, by="fertility")
d = {"Language": [], "Fertility": [], "Model": []}
for i, language_ud_dict in enumerate(language_ud_dicts):
languages = []
values = []
for k, v in language_ud_dict.items():
languages.append(r"\textsc{%s}" % k)
values.append(np.mean(v["split_lengths"]))
d["Language"] += languages
d["Fertility"] += values
d["Model"] += [titles[i] for _ in values]
df = pd.DataFrame(data=d).sort_values(ascending=True, by="Language")
ax2 = sns.catplot(
kind="bar", x="Language", y="Fertility", hue="Model", data=df, legend=False, height=5, aspect=2.1
)
ax2.set_xlabels("")
ax2.set_ylabels(fontsize=30)
ax2.set_xticklabels(fontsize=30)
ax2.set(yticks=[0.0, 0.5, 1.0, 1.5, 2.0])
ax2.set_yticklabels([0.0, 0.5, 1.0, 1.5, 2.0], fontsize=28)
ax2.savefig("fertility.pdf", bbox_inches="tight")
return df
Thanks for sharing your codes! I'm trying to test my tokenizers with your codes.
In tokenizer_exploration_utils.py, it seems like you're calculating the same fertility twice in
plot_fertility()
. And I can find the same part inplot_proportion_continuation()
,plot_proportion_unks()
.In
plot_fertility()
, line 332-340 and 342-353 look like they're doing the same calculation as getting language, fertility, and model in a dataframe.Also, the function only returns the second dataframe. Is there any difference between the two parts? Thanks :)