Closed rezaBarzgar closed 1 year ago
@MarcoKurepa Please post your code snippets here as well. Also update the Todos in the comment above. thanks
I don't think I have the necessary permissions to update the task list. Here are the code snippets:
DBLP Top 10 Popular and Non-Popular Experts
from collections import defaultdict
import json  # json.load is used below, but the snippet never imported json

# Load the toy DBLP dump. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default encoding.
with open('../data/raw/dblp/toy.dblp.v12.json', 'r', encoding='utf-8') as file:
    dblp_toy_data = json.load(file)

# Popularity of an expert = number of author entries that carry the expert's id.
expert_popularity = defaultdict(int)
for publication in dblp_toy_data:
    # A publication without an "authors" key contributes nothing.
    for author in publication.get("authors", []):
        expert_popularity[author["id"]] += 1

# Most popular first; sorted() is stable, so ties keep their first-seen order.
sorted_experts = sorted(expert_popularity.items(), key=lambda x: x[1], reverse=True)
top_10_experts = sorted_experts[:10]
# NOTE: with fewer than 20 distinct experts the two slices overlap.
bottom_10_experts = sorted_experts[-10:]

# Print the results
print("Top 10 Most Popular Experts:", top_10_experts)
print("Top 10 Least Popular Experts:", bottom_10_experts)
**Github Top 10 Popular and Non-Popular Experts**
```import csv
import ast
import pickle
import os
csv.field_size_limit(2**31-1)  # maximum limit for 32-bit Python
filename = '../data/raw/gith/data.csv'
pickle_file = 'expert_popularity.pkl'

if os.path.exists(pickle_file):
    # Reuse the counts cached by a previous complete run.
    # SECURITY: pickle.load can execute arbitrary code; only load a cache file
    # that this script produced itself.
    with open(pickle_file, 'rb') as f:
        expert_popularity = pickle.load(f)
else:
    expert_popularity = {}
    skipped_rows = 0
    # newline='' is what the csv module expects, so that quoted fields with
    # embedded line breaks are parsed correctly.
    with open(filename, 'r', encoding='ISO-8859-1', newline='') as file:
        for row in csv.DictReader(file):
            try:
                collabs = ast.literal_eval(row['collabs'])
                for collab in collabs:
                    if isinstance(collab, dict) and "login" in collab:
                        expert_id = collab["login"]
                        expert_popularity[expert_id] = expert_popularity.get(expert_id, 0) + 1
            except (ValueError, SyntaxError, TypeError, KeyError):
                # 'collabs' is missing, is not a valid Python literal
                # (literal_eval raises SyntaxError/ValueError/TypeError), or is
                # a literal that cannot be iterated. Skip the row but count it.
                skipped_rows += 1
    if skipped_rows:
        print(f"Skipped {skipped_rows} rows with a missing or malformed 'collabs' field.")

    # Write the cache only once the whole CSV has been counted, and publish it
    # with an atomic rename. A partial dump under the final name would be
    # mistaken for a complete result by the os.path.exists check above.
    tmp_pickle_file = pickle_file + '.tmp'
    with open(tmp_pickle_file, 'wb') as f:
        pickle.dump(expert_popularity, f)
    os.replace(tmp_pickle_file, pickle_file)

# Most popular first; sorted() is stable, so ties keep their first-seen order.
sorted_experts = sorted(expert_popularity.items(), key=lambda x: x[1], reverse=True)
top_10_experts = sorted_experts[:10]
# NOTE: with fewer than 20 distinct experts the two slices overlap.
bottom_10_experts = sorted_experts[-10:]
print("Top 10 Most Popular Experts:", top_10_experts)
print("Top 10 Least Popular Experts:", bottom_10_experts)
**Plot Based on Computed Average Loss**
```import os
import pickle
import matplotlib.pyplot as plt
import numpy as np
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
def load_average_losses(output_path, foldidx):
    """Load the per-epoch average-loss dictionaries saved for one fold.

    Files named ``fold_{foldidx}_average_losses_epoch_{epoch}.pkl`` are read
    from ``output_path`` for epoch 0, 1, 2, ... until the first missing file.

    Parameters
    ----------
    output_path : directory that contains the pickle files
    foldidx : fold index used in the file names

    Returns
    -------
    list with one dict per loaded epoch, in epoch order. Keys are converted to
    plain ``int``; values are returned exactly as they were pickled. The list
    is empty when the epoch-0 file does not exist.
    """
    average_losses = []
    epoch = 0
    while True:
        file_name = f"fold_{foldidx}_average_losses_epoch_{epoch}.pkl"
        file_path = os.path.join(output_path, file_name)
        if not os.path.exists(file_path):
            # The number of epochs is not known up front, so the first missing
            # file is the normal way this loop ends.
            print(f"Warning: Expected file {file_path} does not exist. Stopping epoch loading.")
            break
        # SECURITY: pickle.load can execute arbitrary code; only point this at
        # files written by your own training run.
        with open(file_path, "rb") as f:
            raw_epoch_losses = pickle.load(f)
        # Keys may be scalar objects exposing .item() (e.g. NumPy scalars) or
        # plain ints; normalise both to int. If two keys map to the same int,
        # the later one wins.
        average_losses.append({
            int(key.item()) if hasattr(key, "item") else int(key): value
            for key, value in raw_epoch_losses.items()
        })
        epoch += 1
    return average_losses
def plot_expert_losses(output_path, expert_ids, n_folds=3):
    """Plot the average loss per epoch of the given experts, one figure per fold.

    For every fold index in ``range(n_folds)`` the per-epoch losses are loaded
    with ``load_average_losses``, one line per expert is drawn, and the figure
    is saved as ``fold_{foldidx}_loss_plot.png`` inside ``output_path``.

    Parameters
    ----------
    output_path : directory holding the pickled losses; the plots are saved there too
    expert_ids : iterable of expert ids to draw
    n_folds : number of folds to plot (defaults to 3, the previously hard-coded value)

    Returns
    -------
    None; the plots are written to disk and the plotted values are printed.
    """
    for foldidx in range(n_folds):
        average_losses = load_average_losses(output_path, foldidx)
        epochs = list(range(len(average_losses)))
        fig = plt.figure(figsize=(10, 6))
        try:
            for expert_id in expert_ids:
                # Calculate the mean of the stored values for each epoch.
                # NOTE(review): an expert missing from an epoch is plotted as
                # 0, which is indistinguishable from a genuine zero loss.
                losses = [
                    np.mean(epoch_losses.get(expert_id, np.array([0])))
                    for epoch_losses in average_losses
                ]
                plt.plot(epochs, losses, label=f'Expert {expert_id}')
                print(f"Losses for Expert {expert_id} in Fold {foldidx}: {losses}")
            plt.title(f'Fold {foldidx} - Average Loss per Expert per Epoch')
            plt.xlabel('Epochs')
            plt.ylabel('Average Loss')
            plt.legend()
            plt.savefig(os.path.join(output_path, f'fold_{foldidx}_loss_plot.png'))
        finally:
            # Release the figure even if plotting or saving fails, so that
            # figures do not pile up across folds.
            plt.close(fig)
if __name__ == "__main__":
    # Script entry point: plot the two example experts from the saved losses.
    # The expert IDs are examples; adjust them as needed.
    plot_expert_losses(
        output_path='../output/average_losses/',
        expert_ids=[9, 6],
    )
Code Snippet Added to Model File to Log Loss of Each Expert
This snippet should be added after `y_ = self.forward(X)` in the main training loop of the `learn` method.
# Collect one scalar loss per (idx, individual_loss) pair, average the collected
# values per key, and pickle the averages for the current fold and epoch.
# `index`, `individual_losses`, `losses_dict`, `foldidx` and `epoch` are not
# defined here; they must already exist in the surrounding training code.
for idx, individual_loss in zip(index, individual_losses):
# Reduce this entry to its mean and move it to the CPU as a detached NumPy value.
scalar_loss = torch.mean(individual_loss).cpu().detach().numpy()
# NOTE(review): if `idx` is a torch tensor, dict membership hashes by object
# identity, so equal ids would land under separate keys and never be averaged
# together -- TODO confirm the type of `index`.
if idx not in losses_dict:
losses_dict[idx] = []
losses_dict[idx].append(scalar_loss)
# Mean of everything collected so far under each key of losses_dict.
average_loss = {expert_id: sum(losses) / len(losses) for expert_id, losses in losses_dict.items()}
# One file per fold and epoch; the path is relative to the working directory,
# and open() will fail if that directory does not exist.
with open(f"./../output/average_losses/fold_{foldidx}_average_losses_epoch_{epoch}.pkl", "wb") as f:
pickle.dump(average_loss, f)```
I won't be available until late next week to continue working on this issue, so if you need the top/bottom 10 for IMDb and USPT before then, you can refer to these code snippets, as the code should be more or less the same. For the plots, these code snippets should work out of the box for any dataset or model; we'll just need to rerun the experiments with them included.
@MarcoKurepa Thanks for your help. I found an easier way to do this task. I've put the code here in case you are interested in reading it.
def get_popular_nonpopular_ids(top=0, path='new_popularity.csv'):
    """Return the member indices of the non-popular and the popular experts.

    Parameters
    ----------
    top : if 0 -> all experts, else the top k non-popular / popular experts
    path : CSV file to read; it must provide the columns 'popularity',
        'occurrence' and 'memberidx' (defaults to the previously hard-coded
        'new_popularity.csv')

    Returns
    -------
    (non_popular_ids, popular_ids) : two lists of 'memberidx' values.
        With top == 0 both lists keep the row order of the file. Otherwise the
        first list holds the `top` non-popular experts with the lowest
        'occurrence' and the second the `top` popular experts with the highest
        'occurrence', both in ascending order of 'occurrence'. A list is empty
        when the file contains no expert of that class.
    """
    ids = pd.read_csv(path)
    # Split with a boolean mask rather than unpacking a groupby: unpacking
    # raises ValueError whenever only one popularity class is present.
    is_popular = ids['popularity'].astype(bool)
    non_popular = ids[~is_popular]
    popular = ids[is_popular]
    if top == 0:
        return non_popular['memberidx'].tolist(), popular['memberidx'].tolist()
    return (
        non_popular.sort_values('occurrence')['memberidx'].head(top).tolist(),
        popular.sort_values('occurrence')['memberidx'].tail(top).tolist(),
    )
def plot_popular_vs_nonpopular(popularity_loss_values: dict, path: str):
    """Draw the per-epoch loss of a popular and a non-popular expert on one chart.

    Parameters
    ----------
    popularity_loss_values : dict with two entries,
        'popular'     -> loss values of the popular expert(s), one per epoch
        'non-popular' -> loss values of the non-popular expert(s), one per epoch
    path : output file name without extension; '.pdf' is appended

    Returns
    -------
    None; the chart is saved as a PDF and then shown.
    """
    # Small square canvas with small tick labels.
    plt.figure(figsize=(3, 3))
    plt.xticks(fontsize=8)
    plt.yticks(fontsize=8)

    # Epochs are numbered from 1; their count is taken from the 'popular' series.
    popular_losses = popularity_loss_values['popular']
    epoch_numbers = range(1, len(popular_losses) + 1)
    non_popular_losses = popularity_loss_values['non-popular']

    # Title and axis labels.
    plt.title('popular vs. non-Popular expert')
    plt.xlabel('epoch')
    plt.ylabel('ave. loss')

    # One line per expert class, told apart by marker and legend entry.
    series = (
        (popular_losses, 'most popular expert', 'o'),
        (non_popular_losses, 'most non-Popular expert', 'x'),
    )
    for loss_values, legend_label, marker_style in series:
        plt.plot(epoch_numbers, loss_values, label=legend_label, marker=marker_style)
    plt.legend(fontsize="8")

    plt.grid(True)
    # Save before showing.
    plt.savefig(f'{path}.pdf', format='pdf')
    plt.show()
def add_popularity_occurrence_number(vecs):
"""
Parameters:
----------
vecs = {'id': numpy matrix,
'skill': lil_matrix,
'member': lil_matrix
}
Returns
-------
save a new popularity file with occurrence number for each expert
"""
# ids = pd.read_csv('../data/popularity.csv')
teamids, skillvecs, teamsvecs_members = vecs['id'], vecs['skill'], vecs['member']
stats = {}
stats['*nmembers'] = teamsvecs_members.shape[1]
col_sums = teamsvecs_members.sum(axis=0)
stats['nteams_candidate-idx'] = {k: v for k, v in enumerate(sorted(col_sums.A1.astype(int), reverse=True))}
stats['*avg_nteams_member'] = col_sums.mean()
threshold = 1 * stats['*avg_nteams_member']
ids = pd.DataFrame(data=[True if threshold <= nteam_member else False for nteam_member in
col_sums.getA1()], columns=['popularity'])
ids['occurrence'] = vecs['member'].sum(axis=0).tolist()[0]
ids['memberidx'] = [i for i in range(vecs['member'].shape[1])]
ids.to_csv('./new_popularity.csv', index=False)```
For the ECIR24 paper, we need to demonstrate how the model performs on easy and hard samples. In the context of team formation, an easy sample can be a member who appears in many teams, while a hard sample can be a member who appears in only a few teams. We need to plot the average loss of one popular and one non-popular member in each epoch. The x-axis represents the epochs, and the y-axis represents the average loss over the teams that each expert belongs to. @MarcoKurepa, you need to write a code snippet that takes two expert IDs as inputs and plots the desired figure. We already have the popularity label for each expert; I'll send you that. I want you to find the extreme cases (the most popular and the least popular experts) so that the plot clearly shows the idea behind our work.
here is the TODO list:
the output should be something like this: