SforAiDl / genrl

A PyTorch reinforcement learning library for generalizable and reproducible algorithm implementations with an aim to improve accessibility in RL
https://genrl.readthedocs.io
MIT License

Evaluating performance of contextual bandit agents in examples #314

Open TMorville opened 4 years ago

TMorville commented 4 years ago

I have been playing around with the DCBTrainer and found some potential inconsistencies.

1) StatlogData example found here

from genrl.utils import StatlogDataBandit
from genrl.agents import NeuralLinearPosteriorAgent
from genrl.trainers import DCBTrainer

bandit = StatlogDataBandit(download=True)
agent = NeuralLinearPosteriorAgent(bandit)
context = bandit.reset()

# sanity-check a single interaction
action = agent.select_action(context)
new_context, reward = bandit.step(action)

trainer = DCBTrainer(agent, bandit)
trainer.train(timesteps=1000, batch_size=32)

and code to evaluate

import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score

def _evaluate(trainer, bandit):

    y_true = bandit.df.iloc[:, -1].to_numpy()

    # majority-class baseline: always predict the most frequent label
    class_distribution = bandit.df.iloc[:, -1].value_counts()
    most_freq_class = class_distribution.idxmax()
    baseline_accuracy = accuracy_score(y_true, np.full(len(bandit.df), most_freq_class)).round(2)

    # cast features directly to float tensors (going via LongTensor would truncate fractional values)
    tensor_matrix = torch.tensor(bandit.df.iloc[:, :-1].to_numpy(), dtype=torch.float)

    y_pred = []

    for i in tensor_matrix:
        y_pred.append(trainer.agent.select_action(i).item()) 

    print("Baseline accuracy score: {}%".format(baseline_accuracy))
    print("After {} steps accuracy is {}%".format(agent.t, accuracy_score(y_true, y_pred).round(2)))
    print("Classification report")    
    print(classification_report(y_true, y_pred))

    fig = plt.figure(figsize=(10, 4))
    plt.subplot(121)
    plt.plot(bandit.cum_reward_hist)
    plt.title("Cumulative reward")
    plt.subplot(122)
    plt.plot(bandit.cum_regret_hist)
    plt.title("Cumulative regret")
    plt.tight_layout()

    return y_true, y_pred

yt, yp = _evaluate(trainer, bandit)

Baseline accuracy score: 0.78
After 44501 steps accuracy is 0.78
Classification report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.78      0.99      0.88     34108
           2       0.00      0.00      0.00        37
           3       0.00      0.00      0.00       132
           4       0.14      0.00      0.00      6748
           5       0.17      0.00      0.00      2458
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00        11

    accuracy                           0.78     43500
   macro avg       0.14      0.12      0.11     43500
weighted avg       0.65      0.78      0.69     43500

2) WineDataBandit example found here

Define the bandit

from typing import Tuple

import pandas as pd
import torch

from genrl.utils.data_bandits.base import DataBasedBandit
from genrl.utils.data_bandits.utils import download_data

URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"

class WineDataBandit(DataBasedBandit):
    def __init__(self, **kwargs):
        super(WineDataBandit, self).__init__(**kwargs)

        path = kwargs.get("path", "./data/Wine/")
        download = kwargs.get("download", None)
        force_download = kwargs.get("force_download", None)
        url = kwargs.get("url", URL)

        if download:
            path = download_data(path, url, force_download)

        self._df = pd.read_csv(path, header=None)
        self.n_actions = len(self._df[0].unique())
        self.context_dim = self._df.shape[1] - 1
        self.len = len(self._df)

    def reset(self) -> torch.Tensor:
        self._reset()
        # shuffle the underlying frame so _compute_reward / _get_context see the
        # new order; expose it as self.df for external access
        self._df = self._df.sample(frac=1).reset_index(drop=True)
        self.df = self._df
        return self._get_context()

    def _compute_reward(self, action: int) -> Tuple[int, int]:
        label = self._df.iloc[self.idx, 0]
        r = int(label == (action + 1))
        return r, 1

    def _get_context(self) -> torch.Tensor:
        return torch.tensor(
            self._df.iloc[self.idx, 1:].values,
            device=self.device,
            dtype=torch.float,
        )

training

from genrl.agents import NeuralLinearPosteriorAgent
from genrl.trainers import DCBTrainer

bandit = WineDataBandit(path='/path/to/data')
agent = NeuralLinearPosteriorAgent(bandit)
context = bandit.reset()

# sanity-check a single interaction
action = agent.select_action(context)
new_context, reward = bandit.step(action)

trainer = DCBTrainer(agent, bandit)
trainer.train(timesteps=5000, batch_size=32)

and evaluation

import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score

def _evaluate(trainer, bandit):

    y_true = bandit.df.iloc[:, 0].to_numpy()

    # majority-class baseline: always predict the most frequent label
    class_distribution = bandit.df.iloc[:, 0].value_counts()
    most_freq_class = class_distribution.idxmax()
    baseline_accuracy = accuracy_score(y_true, np.full(len(bandit.df), most_freq_class)).round(2)

    # cast features directly to float tensors (going via LongTensor would truncate
    # the fractional wine features)
    tensor_matrix = torch.tensor(bandit.df.iloc[:, 1:].to_numpy(), dtype=torch.float)

    y_pred = []

    for i in tensor_matrix:
        # _compute_reward treats the correct label as (action + 1), so map
        # actions back to label space before comparing with y_true
        y_pred.append(trainer.agent.select_action(i).item() + 1)

    print("Baseline accuracy score: {}%".format(baseline_accuracy))
    print("After {} steps accuracy is {}%".format(agent.t, accuracy_score(y_true, y_pred).round(2)))
    print("Classification report")    
    print(classification_report(y_true, y_pred))

    fig = plt.figure(figsize=(10, 4))
    plt.subplot(121)
    plt.plot(bandit.cum_reward_hist)
    plt.title("Cumulative reward")
    plt.subplot(122)
    plt.plot(bandit.cum_regret_hist)
    plt.title("Cumulative regret")
    plt.tight_layout()

    return y_true, y_pred

yt, yp = _evaluate(trainer, bandit)

Baseline accuracy score: 0.4
After 5357 steps accuracy is 0.0
Classification report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00      59.0
           2       0.00      0.00      0.00      71.0
           3       0.00      0.00      0.00      48.0

    accuracy                           0.00     178.0
   macro avg       0.00      0.00      0.00     178.0
weighted avg       0.00      0.00      0.00     178.0

For both cases (and the third Titanic case referenced in #301), both reward and regret increase during training, which could indicate that no actual learning is happening and that the growth in reward comes purely from random guessing.
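As a quick sanity check (a sketch, reusing the cum_reward_hist and n_actions attributes from the snippets above): a uniform-random policy picks the correct arm with probability 1 / n_actions at every step, regardless of the label distribution, so its expected cumulative reward grows linearly at that rate. If the agent's curve tracks that line, the growth is consistent with guessing.

import numpy as np
import matplotlib.pyplot as plt

# Expected cumulative reward of uniformly random actions:
# P(action == label) = 1 / n_actions at every step.
steps = np.arange(1, len(bandit.cum_reward_hist) + 1)
random_baseline = steps / bandit.n_actions

plt.plot(bandit.cum_reward_hist, label="agent")
plt.plot(random_baseline, "--", label="uniform random")
plt.xlabel("timestep")
plt.ylabel("cumulative reward")
plt.legend()
plt.show()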

Note that for the Statlog data the label is the last column, while in the wine data it is the first column.
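To keep the two evaluations consistent, the label position could be passed in rather than hard-coded; a minimal sketch (this helper is hypothetical, not part of genrl):

import torch

def split_features_labels(df, label_col):
    """Split a data-bandit DataFrame into (contexts, labels).
    label_col is the positional index of the label column:
    -1 for Statlog (last column), 0 for the wine data (first column)."""
    y_true = df.iloc[:, label_col].to_numpy()
    features = df.drop(columns=df.columns[label_col]).to_numpy()
    return torch.tensor(features, dtype=torch.float), y_true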

sampreet-arthi commented 3 years ago

Is this still an issue?

TMorville commented 3 years ago

It is. However, I think my baseline might be wrong.

I think the relevant baseline to compare against should be a Bayesian regression trained directly on the data, rather than the output of the neural network. Do you agree?
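Concretely, something along these lines (a sketch; scikit-learn's LogisticRegression stands in here for the Bayesian regression, and the column layout assumes the wine data above):

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Train directly on (context, label) pairs as a supervised reference point.
X = bandit.df.iloc[:, 1:].to_numpy()  # wine layout: label in the first column
y = bandit.df.iloc[:, 0].to_numpy()

clf = LogisticRegression(max_iter=1000)
scores = cross_val_score(clf, X, y, cv=5)
print("Supervised baseline accuracy: {:.2f} +/- {:.2f}".format(scores.mean(), scores.std()))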