Currently, Farsi numbers and special characters are not supported. This PR adds support for them.
Also, these words were extracted from a Wikipedia corpus and then processed with code like the following, so the word list should be better suited for training purposes.
from pathlib import Path
# Load the cleaned Persian-Wikipedia dump and split it into individual lines.
source_path = Path('fawiki_cleaned_lines.txt')
my_file = source_path.read_text()
lines = my_file.splitlines()
import re
# Compiled once at module level so the pattern is not rebuilt on every call.
# Matches any character that is neither whitespace nor inside U+0600-U+06FF
# (the Unicode Arabic block, which contains the Persian letters).
_NON_PERSIAN_RE = re.compile(r'[^\u0600-\u06FF\s]')


def contains_more_than_2_non_persian(text, limit=2):
    """Return True if *text* contains more than *limit* non-Persian characters.

    Whitespace and characters in the Unicode Arabic block (U+0600-U+06FF)
    count as "Persian"; everything else (Latin letters, ASCII digits,
    punctuation, ...) counts against the limit.

    Args:
        text: The string to inspect.
        limit: Maximum number of non-Persian characters tolerated
            (default 2, matching the original hard-coded threshold).

    Returns:
        bool: True when strictly more than *limit* non-Persian characters
        are present.
    """
    return len(_NON_PERSIAN_RE.findall(text)) > limit
# Translation table deleting the Arabic diacritics U+064B-U+0652 (tanwin,
# fatha, damma, kasra, shadda, sukun) and the left-to-right mark U+200E.
# Built once; str.translate then removes them all in a single C-level pass.
# NOTE: the original chained-.replace() version had a broken "\" line
# continuation (text after the backslash on the same line), i.e. it did not
# parse; this is the repaired, equivalent implementation.
_DIACRITICS_TABLE = str.maketrans(dict.fromkeys(
    '\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652\u200e'))


def process(text):
    """Strip Arabic diacritics and the LTR mark from *text*.

    Args:
        text: Input word (str).

    Returns:
        str: *text* with U+064B-U+0652 and U+200E removed; all other
        characters are preserved unchanged.
    """
    return text.translate(_DIACRITICS_TABLE)
# Frequency count of every whitespace-separated token in the corpus.
# BUG FIX: the original set words[word] = 0 on the first occurrence, so
# every recorded count was one less than the true frequency; counts now
# correctly start at 1 via dict.get().
words = dict()
for line in lines:
    for word in line.split(' '):
        words[word] = words.get(word, 0) + 1

# Keep words that appear more than 3 times and look Persian (at most 2
# non-Persian characters), with diacritics stripped.
my_list = [
    process(word)
    for word, count in words.items()
    if count > 3 and not contains_more_than_2_non_persian(word)
]
I have also generated some Persian numbers and dates and added them to the dictionary.
Currently, Farsi numbers and special characters are not supported. This PR adds support for them.
Also, these words were extracted from a Wikipedia corpus and then processed with code like the above, so the word list should be better suited for training purposes.
I have also generated some Persian numbers and dates and added them to the dictionary.