fani-lab / OpeNTF

Neural machine learning methods for the Team Formation problem.

CL visualization #215

Closed rezaBarzgar closed 1 year ago

rezaBarzgar commented 1 year ago

For the ECIR24 paper, we need to demonstrate how the model performs on easy and hard samples. In the context of team formation, an easy sample can be a member who appears in many teams, while a hard sample can be a member who appears in only a few teams. We need to plot the average loss of one popular and one non-popular member at each epoch: the x-axis represents the epochs, and the y-axis represents each expert's average loss over the teams they belong to. @MarcoKurepa, you need to write a code snippet that takes two expert IDs as inputs and plots the desired figure. We already have the popularity label for each expert; I'll send it to you. I want you to find the extreme cases so that the plot can clearly show the idea behind our work.
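For instance, a minimal sketch of picking the extreme cases, assuming the popularity file has `memberidx` and `occurrence` columns (the actual format of the file I'll send may differ):

```
import pandas as pd

# Minimal sketch; the file name and the column names 'memberidx' and
# 'occurrence' are assumptions about the popularity-label file.
labels = pd.read_csv('popularity.csv')

most_popular = labels.loc[labels['occurrence'].idxmax(), 'memberidx']
least_popular = labels.loc[labels['occurrence'].idxmin(), 'memberidx']
print(most_popular, least_popular)
```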

Here is the TODO list:

The output should be something like this:

CurriculumLearning-Page-1.drawio (figure: expected plot of average loss per epoch for a popular vs. a non-popular expert)

rezaBarzgar commented 1 year ago

@MarcoKurepa Please post your code snippets here as well. Also update the TODOs in the comment above. Thanks!

MarcoKurepa commented 1 year ago

I don't think I have the necessary permissions to update the task list. Here are the code snippets:

**DBLP Top 10 Popular and Non-Popular Experts**

```
import json
from collections import defaultdict

# Count how many publications (teams) each author (expert) appears in.
with open('../data/raw/dblp/toy.dblp.v12.json', 'r') as file:
    dblp_toy_data = json.load(file)

expert_popularity = defaultdict(int)
for publication in dblp_toy_data:
    authors = publication.get("authors", [])
    for author in authors:
        expert_id = author["id"]
        expert_popularity[expert_id] += 1

# Sort experts by occurrence count to find the extreme cases.
sorted_experts = sorted(expert_popularity.items(), key=lambda x: x[1], reverse=True)
top_10_experts = sorted_experts[:10]
bottom_10_experts = sorted_experts[-10:]

# Print the results
print("Top 10 Most Popular Experts:", top_10_experts)
print("Top 10 Least Popular Experts:", bottom_10_experts)
```

**GitHub Top 10 Popular and Non-Popular Experts**

```
import csv
import ast
import pickle
import os

csv.field_size_limit(2**31 - 1)  # maximum field-size limit for 32-bit Python

filename = '../data/raw/gith/data.csv'
pickle_file = 'expert_popularity.pkl'

if os.path.exists(pickle_file):
    # Resume from the saved counts instead of rescanning the CSV,
    # which would double-count rows.
    with open(pickle_file, 'rb') as f:
        expert_popularity = pickle.load(f)
else:
    expert_popularity = {}
    with open(filename, 'r', encoding='ISO-8859-1') as file:
        reader = csv.DictReader(file)
        for idx, row in enumerate(reader, 1):
            try:
                collabs = ast.literal_eval(row['collabs'])
                for collab in collabs:
                    if isinstance(collab, dict) and "login" in collab:
                        expert_id = collab["login"]
                        expert_popularity[expert_id] = expert_popularity.get(expert_id, 0) + 1
            except (ValueError, KeyError):
                # Skip rows where 'collabs' is malformed or missing
                pass
            if idx % 10000 == 0:
                # Checkpoint periodically since the file is large
                with open(pickle_file, 'wb') as f:
                    pickle.dump(expert_popularity, f)
    with open(pickle_file, 'wb') as f:
        pickle.dump(expert_popularity, f)

sorted_experts = sorted(expert_popularity.items(), key=lambda x: x[1], reverse=True)
top_10_experts = sorted_experts[:10]
bottom_10_experts = sorted_experts[-10:]

print("Top 10 Most Popular Experts:", top_10_experts)
print("Top 10 Least Popular Experts:", bottom_10_experts)
```

**Plot Based on Computed Average Loss**

```
import os
import pickle
import matplotlib.pyplot as plt
import numpy as np

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

def load_average_losses(output_path, foldidx):
    average_losses = []
    epoch = 0
    while True:
        file_name = f"fold_{foldidx}_average_losses_epoch_{epoch}.pkl"
        file_path = os.path.join(output_path, file_name)
        if not os.path.exists(file_path):
            print(f"Warning: Expected file {file_path} does not exist. Stopping epoch loading.")
            break
        with open(file_path, "rb") as f:
            average_losses_epoch = pickle.load(f)
            average_losses_epoch = {int(k.item()): v for k, v in average_losses_epoch.items()}
            average_losses.append(average_losses_epoch)
        epoch += 1
    return average_losses

def plot_expert_losses(output_path, expert_ids):
    for foldidx in range(3): 
        average_losses = load_average_losses(output_path, foldidx)
        epochs = list(range(len(average_losses)))

        plt.figure(figsize=(10, 6))
        for expert_id in expert_ids:
            # Calculate the mean of the arrays for each epoch
            losses = [np.mean(average_losses[epoch].get(expert_id, np.array([0]))) for epoch in epochs]
            plt.plot(epochs, losses, label=f'Expert {expert_id}')
            print(f"Losses for Expert {expert_id} in Fold {foldidx}: {losses}")

        plt.title(f'Fold {foldidx} - Average Loss per Expert per Epoch')
        plt.xlabel('Epochs')
        plt.ylabel('Average Loss')
        plt.legend()
        plt.savefig(os.path.join(output_path, f'fold_{foldidx}_loss_plot.png'))
        plt.close()

if __name__ == "__main__":
    output_path = '../output/average_losses/'
    expert_ids = [9, 6]  # Example expert IDs, adjust as needed
    plot_expert_losses(output_path, expert_ids)
```

**Code Snippet Added to Model File to Log Loss of Each Expert**

This snippet should be added after `y_ = self.forward(X)` in the main training loop under the `learn` method.


```
# Accumulate each example's loss under its index; `index`, `individual_losses`,
# and `losses_dict` come from the surrounding training loop.
for idx, individual_loss in zip(index, individual_losses):
    scalar_loss = torch.mean(individual_loss).cpu().detach().numpy()

    if idx not in losses_dict:
        losses_dict[idx] = []
    losses_dict[idx].append(scalar_loss)

# Average the collected losses per expert and dump them for this fold/epoch.
average_loss = {expert_id: sum(losses) / len(losses) for expert_id, losses in losses_dict.items()}

with open(f"./../output/average_losses/fold_{foldidx}_average_losses_epoch_{epoch}.pkl", "wb") as f:
    pickle.dump(average_loss, f)
```
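This assumes the criterion returns per-example (unreduced) losses. A minimal sketch of how `individual_losses` could be produced, assuming a BCE-style criterion and toy shapes (neither is necessarily what the model file actually uses):

```
import torch

# Sketch only: the criterion, shapes, and indices here are assumptions.
criterion = torch.nn.BCELoss(reduction='none')  # keep one loss value per example

y_ = torch.sigmoid(torch.randn(4, 10))    # stand-in for self.forward(X)
y = torch.randint(0, 2, (4, 10)).float()  # stand-in for the ground-truth member vectors
index = torch.arange(4)                   # stand-in for the batch's expert/row indices

individual_losses = criterion(y_, y)      # shape (4, 10): unreduced, one row per example
```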

I won't be available until late next week to continue working on this issue, so if you need the top/bottom 10 for IMDb and USPT before then, you can adapt these code snippets; the logic should be more or less the same. The plotting snippets should work out of the box for any dataset and model; we'll just need to rerun the experiments with the loss-logging snippet included.
rezaBarzgar commented 1 year ago

@MarcoKurepa Thanks for your help. I found an easier way to do this task; I've put the code here in case you're interested in reading it.


```
import pandas as pd
import matplotlib.pyplot as plt

def get_popular_nonpopular_ids(top=0):
    """
    Parameters
    ----------
    top : if 0 -> all experts, else top k popular/non-popular experts

    Returns
    -------
    (non-popular memberidx list, popular memberidx list)
    """
    ids = pd.read_csv('new_popularity.csv')
    # groupby on the boolean 'popularity' column yields exactly two (key, group) pairs
    non_popular_ids, popular_ids = ids.groupby(by=['popularity'])
    if top == 0:
        return non_popular_ids[1]['memberidx'].tolist(), popular_ids[1]['memberidx'].tolist()
    else:
        return non_popular_ids[1].sort_values('occurrence')['memberidx'].head(top).tolist(), \
               popular_ids[1].sort_values('occurrence')['memberidx'].tail(top).tolist()
def plot_popular_vs_nonpopular(popularity_loss_values: dict, path: str):
    """
    Parameters
    ----------
    popularity_loss_values : {'popular': [loss values of popular expert(s)],
                              'non-popular': [loss values of non-popular expert(s)]}
    path : output file path (without the .pdf extension)

    Returns
    -------
    saves and shows the plot
    """
    fig = plt.figure(figsize=(3, 3))
    plt.xticks(fontsize=8)
    plt.yticks(fontsize=8)

    # Extract data from the popularity_loss_values dictionary
    x = range(1, len(popularity_loss_values['popular']) + 1)
    y1 = popularity_loss_values['popular']
    y2 = popularity_loss_values['non-popular']

    # Title and axis labels
    plt.title('Popular vs. non-popular expert')
    plt.xlabel('epoch')
    plt.ylabel('avg. loss')

    # Plot the two loss curves
    plt.plot(x, y1, label='most popular expert', marker='o')
    plt.plot(x, y2, label='least popular expert', marker='x')

    # Add a legend to distinguish the lines
    plt.legend(fontsize="8")

    # Save and show the plot
    plt.grid(True)
    plt.savefig(f'{path}.pdf', format='pdf')
    plt.show()

def add_popularity_occurrence_number(vecs):
    """
    Parameters
    ----------
    vecs : {'id': numpy matrix,
            'skill': lil_matrix,
            'member': lil_matrix}

    Returns
    -------
    saves a new popularity file with the occurrence number for each expert
    """
    # ids = pd.read_csv('../data/popularity.csv')
    teamids, skillvecs, teamsvecs_members = vecs['id'], vecs['skill'], vecs['member']
    stats = {}
    stats['*nmembers'] = teamsvecs_members.shape[1]
    col_sums = teamsvecs_members.sum(axis=0)

    stats['nteams_candidate-idx'] = {k: v for k, v in enumerate(sorted(col_sums.A1.astype(int), reverse=True))}
    stats['*avg_nteams_member'] = col_sums.mean()
    threshold = 1 * stats['*avg_nteams_member']

    ids = pd.DataFrame(data=[True if threshold <= nteam_member else False for nteam_member in
                             col_sums.getA1()], columns=['popularity'])

    ids['occurrence'] = vecs['member'].sum(axis=0).tolist()[0]
    ids['memberidx'] = [i for i in range(vecs['member'].shape[1])]
    ids.to_csv('./new_popularity.csv', index=False)
```
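In case it helps to rerun this end to end, a hypothetical driver tying the three functions together; the pickle path and the `top` value below are assumptions, not fixed project paths:

```
import pickle

# Hypothetical glue code: the pickle path is an assumption.
with open('teamsvecs.pkl', 'rb') as f:
    vecs = pickle.load(f)

add_popularity_occurrence_number(vecs)                    # writes ./new_popularity.csv
non_popular, popular = get_popular_nonpopular_ids(top=1)  # extreme cases only

# popularity_loss_values would be built from the per-epoch loss pickles logged above:
# popularity_loss_values = {'popular': [...], 'non-popular': [...]}
# plot_popular_vs_nonpopular(popularity_loss_values, './popular_vs_nonpopular')
```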