ConvLab / ConvLab-3

Apache License 2.0
107 stars 30 forks source link

[Maintenance] Outdated end2end evaluation script #186

Closed JamesCao2048 closed 8 months ago

JamesCao2048 commented 10 months ago

Describe the feature A clear and concise description of what the feature is. The end2end evaluation scripts in examples/agent_examples/ seem to have been copied directly from ConvLab2 and therefore have some problems. For example,

  1. The RulePolicy seems not to be supported in ConvLab3 now.
  2. The default dataset of PPOPolicy in test_BERTNLU-RuleDST-PPOPolicy-TemplateNLG.py is Multiwoz, which is not supported in ConvLab3.

There is a Getting_Started.ipynb notebook in tutorials/Getting_Started.ipynb, but it only provides an interactive test, not an end2end test. I added a user pipeline agent to it as shown below,

from convlab.base_models.t5.nlu import T5NLU
from convlab.base_models.t5.dst import T5DST
from convlab.base_models.t5.nlg import T5NLG
from convlab.nlu.jointBERT.multiwoz import BERTNLU
from convlab.policy.vector.vector_nodes import VectorNodes
from convlab.policy.rule.multiwoz import RulePolicy
from convlab.policy.vtrace_DPT import VTRACE
from convlab.dialog_agent import PipelineAgent, BiSession
from convlab.evaluator.multiwoz_eval import MultiWozEvaluator
from convlab.nlg.template.multiwoz import TemplateNLG
from pprint import pprint
import random
import numpy as np
import torch
from convlab.util.analysis_tool.analyzer import Analyzer
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
from convlab.policy.tus.multiwoz.TUS import UserPolicy
from convlab.dst.rule.multiwoz.usr_dst import UserRuleDST
import json

def set_seed(r_seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        r_seed: Seed value handed unchanged to ``random.seed``,
            ``np.random.seed`` and ``torch.manual_seed``, in that order.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(r_seed)

def test_end2end(seed=20200202, n_dialogues=1000):
    """Run an end-to-end evaluation of a system agent against a simulated user.

    The system agent is assembled from T5NLU, T5DST, a VTRACE policy (with a
    VectorNodes vectorizer) and T5NLG. The user agent is assembled from
    BERTNLU, no DST, a TUS ``UserPolicy`` and TemplateNLG. Both are handed to
    ``Analyzer.comprehensive_analyze``.

    Args:
        seed: Passed to ``set_seed`` right before the analysis starts and
            embedded in the model name given to the analyzer.
        n_dialogues: Passed to the analyzer as ``total_dialog``.
    """
    # go to README.md of each model for more information
    sys_nlu = T5NLU(speaker='user', context_window_size=0, model_name_or_path='../../models/t5-small-nlu-multiwoz21')
    sys_dst = T5DST(dataset_name='multiwoz21', speaker='user', context_window_size=100, model_name_or_path='../../models/t5-small-dst-multiwoz21')
    # Download pre-trained DDPT model
    # ! wget https://huggingface.co/ConvLab/ddpt-policy-multiwoz21/resolve/main/supervised.pol.mdl --directory-prefix="convlab/policy/vtrace_DPT"
    vectorizer = VectorNodes(dataset_name='multiwoz21',
                            use_masking=True,
                            manually_add_entity_names=True,
                            seed=0,
                            filter_state=True)
    sys_policy = VTRACE(is_train=False,
                seed=0,
                vectorizer=vectorizer,
                load_path="convlab/policy/vtrace_DPT/supervised")
    sys_nlg = T5NLG(speaker='system', context_window_size=0, model_name_or_path='../../models/t5-small-nlg-multiwoz21')
    # assemble
    sys_agent = PipelineAgent(sys_nlu, sys_dst, sys_policy, sys_nlg, name='sys')

    # specify the user config
    user_config_path = "/mnt/sda/cjm/ConvLab-3/convlab/policy/tus/multiwoz/exp/default.json"
    user_mode = ""
    # BERT nlu trained on sys utterance
    user_nlu = BERTNLU(mode='sys', config_file='multiwoz_sys_context.json',
                       model_file='/mnt/sda/cjm/ConvLab-3/models/bert_multiwoz_sys_context.zip')
    user_dst = None
    # load the TUS user-policy configuration; the context manager closes the
    # file handle instead of leaving it to the garbage collector
    with open(user_config_path) as config_file:
        user_config = json.load(config_file)

    # an empty user_mode leaves the configured model name untouched
    if user_mode:
        user_config["model_name"] = f"{user_config['model_name']}-{user_mode}"
    user_policy = UserPolicy(user_config)
    # user_policy =  RulePolicy(character='usr')
    # template NLG
    user_nlg = TemplateNLG(is_user=True)
    # assemble
    user_agent = PipelineAgent(
        user_nlu, user_dst, user_policy, user_nlg, name='user')

    analyzer = Analyzer(user_agent=user_agent, dataset='multiwoz')

    set_seed(seed)
    # NOTE(review): this label does not describe the components assembled
    # above (T5 NLU/DST/NLG + VTRACE system, TUS user); it is kept unchanged
    # because it is passed to the analyzer as model_name.
    name=f'BERTNLU-RuleDST-RulePolicy-TemplateNLG-Seed{seed}'
    analyzer.comprehensive_analyze(sys_agent=sys_agent, model_name=name, total_dialog=n_dialogues)

if __name__ == '__main__':
    # Command-line entry point: read the seed and the dialogue count, then
    # run the end-to-end evaluation with them.
    cli = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    cli.add_argument('--seed', type=int, default=20200202, help='Seed')
    cli.add_argument('--n_dialogues', type=int, default=1000,
                     help='Number of eval dialogues')
    options = cli.parse_args()

    test_end2end(seed=options.seed, n_dialogues=options.n_dialogues)

but got this error:

  File "/mnt/sda/cjm/ConvLab-3/examples/cjm_examples/getting_started.py", line 86, in <module>
    test_end2end(seed=args.seed, n_dialogues=args.n_dialogues)
  File "/mnt/sda/cjm/ConvLab-3/examples/cjm_examples/getting_started.py", line 77, in test_end2end
    analyzer.comprehensive_analyze(sys_agent=sys_agent, model_name=name, total_dialog=n_dialogues)
  File "/mnt/sda/cjm/ConvLab-3/convlab/util/analysis_tool/analyzer.py", line 119, in comprehensive_analyze
    sys_response, user_response, session_over, reward = sess.next_turn(
  File "/mnt/sda/cjm/ConvLab-3/convlab/dialog_agent/session.py", line 122, in next_turn
    user_response = self.next_response(last_observation)
  File "/mnt/sda/cjm/ConvLab-3/convlab/dialog_agent/session.py", line 96, in next_response
    response = next_agent.response(observation)
  File "/mnt/sda/cjm/ConvLab-3/convlab/dialog_agent/agent.py", line 176, in response
    self.output_action = deepcopy(self.policy.predict(state))
  File "/mnt/sda/cjm/ConvLab-3/convlab/policy/tus/multiwoz/TUS.py", line 420, in predict
    return self.policy.predict(state)
  File "/mnt/sda/cjm/ConvLab-3/convlab/policy/tus/multiwoz/TUS.py", line 81, in predict
    sys_dialog_act = state["system_action"]
TypeError: list indices must be integers or slices, not str

If I change the user policy to RulePolicy, I get this error:

Traceback (most recent call last):
  File "/mnt/sda/cjm/ConvLab-3/examples/cjm_examples/getting_started.py", line 86, in <module>
    test_end2end(seed=args.seed, n_dialogues=args.n_dialogues)
  File "/mnt/sda/cjm/ConvLab-3/examples/cjm_examples/getting_started.py", line 77, in test_end2end
    analyzer.comprehensive_analyze(sys_agent=sys_agent, model_name=name, total_dialog=n_dialogues)
  File "/mnt/sda/cjm/ConvLab-3/convlab/util/analysis_tool/analyzer.py", line 119, in comprehensive_analyze
    sys_response, user_response, session_over, reward = sess.next_turn(
  File "/mnt/sda/cjm/ConvLab-3/convlab/dialog_agent/session.py", line 124, in next_turn
    self.evaluator.add_sys_da(self.user_agent.get_in_da_eval(), self.sys_agent.dst.state['belief_state'])
  File "/mnt/sda/cjm/ConvLab-3/convlab/evaluator/multiwoz_eval.py", line 192, in add_sys_da
    if not self.booked[domain] and re.match(r'^\d{8}$', value) and \
KeyError: 'booking'

Expected behavior End2End tests adapted to ConvLab3 are needed.

Additional context Add any other context about the feature here.

zqwerty commented 9 months ago

Sorry for the late reply. Yes, the RulePolicy is not supported in ConvLab-3 because of the ontology mismatch between ConvLab-2 and ConvLab-3. Have you seen this issue (#152) for end-to-end testing of TUS?

JamesCao2048 commented 8 months ago

Thanks, this addresses my problem.