bounswe / bounswe2024group5

This repo contains all the work done by Group 5 in the CMPE352 2024 Spring course.

Quiz Generation Proof of Concept #265

Open odenizddd opened 2 weeks ago

odenizddd commented 2 weeks ago

Description

Create a basic application to make sure that we can reliably generate quiz questions using one or more linked data resources for a given word.

Tasks

Estimated Time

7 days

Deadline

08.10.2024

Reviewer

@set120120 and @fahreddinozcan

ramazanoacar commented 1 week ago

This is a very basic form of getting the English translation of a Turkish word:

import requests
import json
import random

BABELNET_API_KEY = ":)"

# Function to get BabelNet translations
def get_babelnet_translations(word):
    url = "https://babelnet.io/v5/getSenses"
    params = {
        'lemma': word,
        'searchLang': 'TR',  # Searching in Turkish
        'targetLang': 'EN',  # Targeting English
        'key': BABELNET_API_KEY
    }

    response = requests.get(url, params=params)

    if response.status_code == 200:
        data = json.loads(response.text)
        if data:
            translations = []

            for sense in data:
                if sense['properties']['language'] == 'EN':  # Collect English translations
                    translation = sense['properties']['fullLemma']
                    if translation not in translations:  # Avoid duplicates
                        translations.append(translation)
                if len(translations) >= 4:  # Limit to 4 unique translations
                    break

            return translations  # Return the list of translations
    else:
        print(f"Error fetching data from BabelNet API: {response.status_code}")
    return []

# Function to check for unwanted punctuation in translations
def has_unwanted_punctuation(translation):
    unwanted_chars = [')', '(', ',', '.']
    return any(char in translation for char in unwanted_chars)

# Get a Turkish word from the user
word = input("Enter a Turkish word: ").strip()

# Get the translations from BabelNet
translations = get_babelnet_translations(word)

# Print the translations
if translations:
    print(f"\nTranslations for '{word}':")
    for i, translation in enumerate(translations, start=1):
        print(f"{i}. {translation}")
else:
    print(f"No translations found for the word '{word}' in BabelNet.")
ramazanoacar commented 1 week ago

And this one generates a multiple-choice question for a given English word:

import nltk
from nltk.corpus import wordnet as wn
# nltk.download('wordnet')  # Run once if the WordNet corpus is not yet installed
import requests
import json
import random

BABELNET_API_KEY = ":)"

# Function to get BabelNet translations
def get_babelnet_translations(word):
    url = "https://babelnet.io/v5/getSenses"
    params = {
        'lemma': word,
        'searchLang': 'EN',
        'targetLang': 'TR',
        'key': BABELNET_API_KEY
    }

    response = requests.get(url, params=params)

    if response.status_code == 200:
        data = json.loads(response.text)
        if data:
            translations = []
            count = 0
            for sense in data:
                if sense['properties']['language'] == 'TR':
                    count += 1
                    translations.append(sense['properties']['fullLemma'])
                if count == 100:
                    break

            # Process translations to create a dictionary
            translations_dict = {}
            for translation in translations:
                # Split on underscores and process
                translation_parts = [t.lower() for t in translation.split("_")]
                for t in translation_parts:
                    if t != word.lower() and (not has_unwanted_punctuation(t)):
                        translations_dict[t] = translations_dict.get(t, 0) + 1

            # Get the most common translation
            correct_translation = max(translations_dict, key=translations_dict.get, default="No translation found")
            return correct_translation
    else:
        print(f"Error fetching data from BabelNet API: {response.status_code}")
    return "No translation found"

# Function to check for unwanted punctuation in translations
def has_unwanted_punctuation(translation):
    unwanted_chars = [')', '(', ',', '.']
    return any(char in translation for char in unwanted_chars)

# Function to get children of the hypernym of the given word
def get_hyponyms_of_hypernym(word, correct_translation):
    syns = wn.synsets(word)
    hyponyms = []

    if syns:
        # Get hypernyms (parents)
        for hypernym in syns[0].hypernyms():
            # Get hyponyms (children)
            for hyponym in hypernym.hyponyms():
                # Get BabelNet translations
                translation = get_babelnet_translations(hyponym.lemmas()[0].name())
                # Append if it differs from the word and the correct translation, and has no unwanted punctuation
                translation_lower = translation.lower()
                if translation_lower != word.lower() and translation_lower != correct_translation and not has_unwanted_punctuation(translation):
                    hyponyms.append(translation_lower)

    return list(set(hyponyms))  # Return unique hyponyms

# Function to get synonyms or related words with BabelNet translations
def get_related_words(word):
    related_words = []
    synsets = wn.synsets(word)

    for syn in synsets:
        for lemma in syn.lemmas():
            related_word = lemma.name()
            if related_word.lower() != word.lower():
                translation = get_babelnet_translations(related_word)
                if translation != "No translation found" and translation.lower() != word.lower() and not has_unwanted_punctuation(translation):
                    related_words.append(translation)

    return list(set(related_words))  # Return unique related words

# Function to add random unrelated words using BabelNet translations
def add_random_unrelated_words(choices, num_needed):
    all_lemmas = list(wn.all_lemma_names())  # Convert to list
    choices_set = set(choice.lower() for choice in choices)  # Create a set of current choices
    added_count = 0  # Count of added unrelated words

    while added_count < num_needed:
        random_word = random.choice(all_lemmas)  # Choose from the list
        translation = get_babelnet_translations(random_word)
        # Add the translation only if it is valid, new, and not the input word itself
        if (translation != "No translation found" and
            translation.lower() not in choices_set and
            (not has_unwanted_punctuation(translation))
            and random_word.lower() != word.lower()):
            choices.append(translation)
            choices_set.add(translation.lower())  # Keep the set in sync to avoid duplicate picks
            added_count += 1  # Increment the count of added words

# Get an English word from the user
word = input("Enter an English word: ").strip()
syns = wn.synsets(word)

# Check if synsets are found in WordNet
if syns:
    # Get the correct translation from BabelNet
    correct_translation = get_babelnet_translations(word)
    # print(f"Correct translation: {correct_translation}")

    # Generate hyponyms of the hypernym
    hyponyms = get_hyponyms_of_hypernym(word, correct_translation)

    # Create choices for the multiple-choice question
    choices = hyponyms.copy()
    if correct_translation.lower() != "no translation found" and correct_translation.lower() not in [c.lower() for c in choices]:
        choices.append(correct_translation)

    # Ensure we have enough choices
    if len(choices) < 4:
        # Try to add related words
        related_words = get_related_words(word)
        choices.extend(related_words)

    # Check again if there are enough choices
    if len(choices) < 4:
        # Add random unrelated words using BabelNet translations
        add_random_unrelated_words(choices, 4)

    # Ensure we have at least 4 unique choices
    choices = list(set(choices))  # Remove duplicates
    if len(choices) < 4:
        print("Not enough unique choices generated.")
    else:
        # Randomly select 3 distractors, excluding the correct translation
        distractors = [c for c in choices if c.lower() != correct_translation.lower()]
        random_choices = random.sample(distractors, min(3, len(distractors)))
        random_choices.append(correct_translation)

        # Shuffle choices
        random.shuffle(random_choices)

        # Present the multiple-choice question
        print(f"\nWhich of the following is the correct translation of the word '{word}' in Turkish?")
        for i, choice in enumerate(random_choices, start=1):
            print(f"{i}. {choice}")

        # Check answer (you can later ask for input)
        print(f"\nCorrect answer is: {correct_translation}")
else:
    print(f"No synsets found for the word '{word}' in WordNet.")
odenizddd commented 1 week ago

Hi @ramazanoacar. How well do those scripts perform, in your opinion? Have you observed any issues?

ramazanoacar commented 1 week ago

Good night @odenizddd. Yes, I observed many issues initially, but I was able to overcome some of them. The main reason may stem from my not being able to pass the id between WordNet and BabelNet. Potentially because of this, I faced the issues below:

  1. There were lots of unrelated words and spam among a given word's translations, so I needed to add a very basic, straightforward validation mechanism for the correct translation.

  2. Also, sometimes it was not possible to overcome the spam because the same word occurred more often than the original translation. To work around some specific cases of this, I tried removing translated words that include ")", "(", etc.

  3. Now it works a little better, but it still has issues. I tried some basic words like dog, house, and home, and they worked fine. However, I hit a slight error with the word happy: the output was "mutluluk" instead of "mutlu". It still gave something related, but I couldn't overcome the issue with suffixes and similarity for now (see the sketch after the TL;DR).

TL;DR: It may cause some errors, but to some extent it satisfies our needs. It could be improved substantially if passing the id between WordNet and BabelNet is implemented correctly.
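
One rough idea for the suffix problem (a sketch only; is_suffix_variant is a hypothetical helper and the 0.75 threshold is an arbitrary guess): a plain string-similarity check can flag that two candidates such as "mutluluk" and "mutlu" probably share a root, so their counts could be merged in translations_dict instead of competing with each other.

import difflib

def is_suffix_variant(candidate, stem, threshold=0.75):
    # Treat the candidate as a variant of the stem if the two strings
    # share most of their character sequence (threshold chosen arbitrarily)
    ratio = difflib.SequenceMatcher(None, candidate.lower(), stem.lower()).ratio()
    return ratio >= threshold

print(is_suffix_variant("mutluluk", "mutlu"))  # True: likely the same root
print(is_suffix_variant("mutlu", "ev"))        # False: unrelated words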

odenizddd commented 1 week ago

I don't know whether the following code performs well across a wide range of words, but it does for the word 'happy', and it illustrates how I used the WordNet id to fetch translations from the BabelNet API.

Hope this helps @ramazanoacar .

from nltk.corpus import wordnet as wn
import requests

def getWordnetId(word):
    # Take the first (most common) synset for the word
    sense = wn.synsets(word)[0]
    # Build the id as "wn:" + 8-digit zero-padded synset offset + POS tag
    wordnet_id = str(sense.offset())
    wordnet_id = f"wn:{wordnet_id.zfill(8)}{sense.pos()}"
    return wordnet_id

def getTranslationsWithWordnetId(wordnet_id):

    API_KEY = ':)'

    url = "https://babelnet.io/v9/getSynset"

    params = {
        'id': wordnet_id,
        'key': API_KEY,
        'targetLang': 'TR'
    }

    response = requests.get(url, params=params)

    assert response.status_code == 200, f"Error: {response.text}"

    response = response.json()

    translations = set()

    for sense in response['senses']:
        # Keep only curated lemmas, skipping lower-quality automatic entries
        if sense['properties']['lemma']['type'] == 'HIGH_QUALITY':
            translations.add(sense['properties']['fullLemma'])

    return translations

print(getTranslationsWithWordnetId(getWordnetId('happy')))

Output is:

{'mutlu'}

ebrarkiziloglu commented 1 week ago

Here is the script I worked on to obtain the synonyms and similar words from Wordnet:

import json
import nltk
from nltk.corpus import wordnet as wn

words_to_check = ['win', 'dog', 'small', 'fast', 'happy', 'bad', 'rich', 'new', 'young', 'play']

most_common_words_in_english_according_to_polysemy = ['take', 'go', 'make', 'get', 'back', 'good', 'work', 'have', 'one', 'even', 'come', 'for', 'what', 'about', 'give', 'new']
# Polysemy is the capacity for a word to have multiple related meanings.

similarity_database = {}
definition_database = {}
for word in most_common_words_in_english_according_to_polysemy + words_to_check:
    similar_words = {}
    definition_database[word] = []

    for ss in wn.synsets(word):
        definition_database[word].append(ss.definition())
        for similar_word in ss.lemma_names():
            similar_word = similar_word.lower()
            if similar_word != word:
                similar_words[similar_word] = similar_words.get(similar_word, 0) + 1
        for sim in ss.similar_tos():
            similar_word = sim.name().split(".")[0].lower()
            if similar_word != word:
                similar_words[similar_word] = similar_words.get(similar_word, 0) + 1

    similarity_database[word] = dict(sorted(similar_words.items(), key=lambda item: item[1], reverse=True))

K = 7   # Number of definitions and similar words to keep per word
output = []

for word, similar_words in similarity_database.items():
    word_data = {
        "Word": word,
        "Definitions": definition_database[word][0: K],
        "Similar words": dict(list(similar_words.items())[0: K])
    }
    output.append(word_data)

json_output = json.dumps(output, indent=4)
with open('wordnet_output.json', 'w') as json_file:
    json_file.write(json_output)
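
For reference, each entry written to wordnet_output.json has the shape below (values elided here rather than taken from a real run):

[
    {
        "Word": "win",
        "Definitions": ["...", "..."],
        "Similar words": {"...": 2, "...": 1}
    }
]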
ramazanoacar commented 1 week ago

We discussed the PoC scripts provided above in our meeting on 06.10.2024. We also went over @HutkuC's implementation with an open-source dictionary. We decided that these are enough for the PoC; the whole group will decide on the final method of implementation in the upcoming lab sessions.

set120120 commented 1 week ago

We've talked with the whole group in the lab session. The necessary notes were taken. I think we can close this issue. @fahreddinozcan, what are your thoughts?