[Maintenance] Outdated end2end evaluation script

Describe the feature: A clear and concise description of what the feature is. The end2end evaluation scripts in examples/agent_examples/ seem to have been copied directly from ConvLab2 and therefore have several problems. For example:

The RulePolicy seems not to be supported in ConvLab3 now.
The default dataset of PPOPolicy in test_BERTNLU-RuleDST-PPOPolicy-TemplateNLG.py is Multiwoz, which is not supported in ConvLab3.

There is a Getting_Started.ipynb notebook at tutorials/Getting_Started.ipynb, but it only provides an interactive test, not an end2end test. I added a user pipeline agent to it, as shown below:

from convlab.base_models.t5.nlu import T5NLU
from convlab.base_models.t5.dst import T5DST
from convlab.base_models.t5.nlg import T5NLG
from convlab.nlu.jointBERT.multiwoz import BERTNLU
from convlab.policy.vector.vector_nodes import VectorNodes
from convlab.policy.rule.multiwoz import RulePolicy
from convlab.policy.vtrace_DPT import VTRACE
from convlab.dialog_agent import PipelineAgent, BiSession
from convlab.evaluator.multiwoz_eval import MultiWozEvaluator
from convlab.nlg.template.multiwoz import TemplateNLG
from pprint import pprint
import random
import numpy as np
import torch
from convlab.util.analysis_tool.analyzer import Analyzer
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
from convlab.policy.tus.multiwoz.TUS import UserPolicy
from convlab.dst.rule.multiwoz.usr_dst import UserRuleDST
import json

def set_seed(r_seed):
    """Seed the `random`, NumPy and PyTorch generators with the same value.

    Args:
        r_seed: Seed handed unchanged to each of the three seeding functions.
    """
    # Same order as before: Python's RNG, then NumPy, then PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(r_seed)

def test_end2end(seed=20200202, n_dialogues=1000):
    """Run an end-to-end analysis of a system agent against a simulated user.

    The system agent is assembled as T5NLU -> T5DST -> VTRACE -> T5NLG. The
    user agent is assembled as BERTNLU -> (no DST) -> TUS ``UserPolicy`` ->
    TemplateNLG. Both are passed to ``Analyzer.comprehensive_analyze``.

    Args:
        seed: Passed to ``set_seed`` after all models are built and just
            before the analysis starts; also embedded in the run name.
        n_dialogues: Number of dialogues, forwarded as ``total_dialog``.
    """
    # go to README.md of each model for more information
    sys_nlu = T5NLU(speaker='user', context_window_size=0, model_name_or_path='../../models/t5-small-nlu-multiwoz21')
    sys_dst = T5DST(dataset_name='multiwoz21', speaker='user', context_window_size=100, model_name_or_path='../../models/t5-small-dst-multiwoz21')
    # Download pre-trained DDPT model
    # ! wget https://huggingface.co/ConvLab/ddpt-policy-multiwoz21/resolve/main/supervised.pol.mdl --directory-prefix="convlab/policy/vtrace_DPT"
    vectorizer = VectorNodes(dataset_name='multiwoz21',
                             use_masking=True,
                             manually_add_entity_names=True,
                             seed=0,
                             filter_state=True)
    sys_policy = VTRACE(is_train=False,
                        seed=0,
                        vectorizer=vectorizer,
                        load_path="convlab/policy/vtrace_DPT/supervised")
    sys_nlg = T5NLG(speaker='system', context_window_size=0, model_name_or_path='../../models/t5-small-nlg-multiwoz21')
    # assemble
    sys_agent = PipelineAgent(sys_nlu, sys_dst, sys_policy, sys_nlg, name='sys')

    # specify the user config
    user_config_path = "/mnt/sda/cjm/ConvLab-3/convlab/policy/tus/multiwoz/exp/default.json"
    user_mode = ""
    # BERT nlu trained on sys utterance
    user_nlu = BERTNLU(mode='sys', config_file='multiwoz_sys_context.json',
                       model_file='/mnt/sda/cjm/ConvLab-3/models/bert_multiwoz_sys_context.zip')
    # NOTE(review): with no user DST, the policy presumably receives the raw
    # NLU output (a list) instead of a state dict. TUS reads
    # state["system_action"], so a user DST (e.g. the imported UserRuleDST)
    # is probably required here — TODO confirm against the TUS README.
    user_dst = None

    # Read the TUS config through a context manager so the file handle is
    # closed deterministically instead of being left to the garbage collector.
    with open(user_config_path, encoding='utf-8') as config_file:
        user_config = json.load(config_file)

    if user_mode:
        user_config["model_name"] = f"{user_config['model_name']}-{user_mode}"
    user_policy = UserPolicy(user_config)
    # user_policy =  RulePolicy(character='usr')
    # template NLG
    user_nlg = TemplateNLG(is_user=True)
    # assemble
    user_agent = PipelineAgent(
        user_nlu, user_dst, user_policy, user_nlg, name='user')

    analyzer = Analyzer(user_agent=user_agent, dataset='multiwoz')

    set_seed(seed)
    # NOTE(review): this label names BERTNLU/RuleDST/RulePolicy/TemplateNLG,
    # which does not match the system agent assembled above; kept unchanged
    # so result naming stays the same.
    name = f'BERTNLU-RuleDST-RulePolicy-TemplateNLG-Seed{seed}'
    analyzer.comprehensive_analyze(sys_agent=sys_agent, model_name=name, total_dialog=n_dialogues)

if __name__ == '__main__':
    # Command-line interface; the formatter shows default values in --help.
    cli = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    cli.add_argument('--seed', type=int, default=20200202, help='Seed')
    cli.add_argument('--n_dialogues', type=int, default=1000,
                     help='Number of eval dialogues')
    options = cli.parse_args()

    # Forward the parsed options to the evaluation entry point.
    test_end2end(seed=options.seed, n_dialogues=options.n_dialogues)

but I got this error:

  File "/mnt/sda/cjm/ConvLab-3/examples/cjm_examples/getting_started.py", line 86, in <module>
    test_end2end(seed=args.seed, n_dialogues=args.n_dialogues)
  File "/mnt/sda/cjm/ConvLab-3/examples/cjm_examples/getting_started.py", line 77, in test_end2end
    analyzer.comprehensive_analyze(sys_agent=sys_agent, model_name=name, total_dialog=n_dialogues)
  File "/mnt/sda/cjm/ConvLab-3/convlab/util/analysis_tool/analyzer.py", line 119, in comprehensive_analyze
    sys_response, user_response, session_over, reward = sess.next_turn(
  File "/mnt/sda/cjm/ConvLab-3/convlab/dialog_agent/session.py", line 122, in next_turn
    user_response = self.next_response(last_observation)
  File "/mnt/sda/cjm/ConvLab-3/convlab/dialog_agent/session.py", line 96, in next_response
    response = next_agent.response(observation)
  File "/mnt/sda/cjm/ConvLab-3/convlab/dialog_agent/agent.py", line 176, in response
    self.output_action = deepcopy(self.policy.predict(state))
  File "/mnt/sda/cjm/ConvLab-3/convlab/policy/tus/multiwoz/TUS.py", line 420, in predict
    return self.policy.predict(state)
  File "/mnt/sda/cjm/ConvLab-3/convlab/policy/tus/multiwoz/TUS.py", line 81, in predict
    sys_dialog_act = state["system_action"]
TypeError: list indices must be integers or slices, not str

If I change the user policy to RulePolicy, I get this error instead:

Traceback (most recent call last):
  File "/mnt/sda/cjm/ConvLab-3/examples/cjm_examples/getting_started.py", line 86, in <module>
    test_end2end(seed=args.seed, n_dialogues=args.n_dialogues)
  File "/mnt/sda/cjm/ConvLab-3/examples/cjm_examples/getting_started.py", line 77, in test_end2end
    analyzer.comprehensive_analyze(sys_agent=sys_agent, model_name=name, total_dialog=n_dialogues)
  File "/mnt/sda/cjm/ConvLab-3/convlab/util/analysis_tool/analyzer.py", line 119, in comprehensive_analyze
    sys_response, user_response, session_over, reward = sess.next_turn(
  File "/mnt/sda/cjm/ConvLab-3/convlab/dialog_agent/session.py", line 124, in next_turn
    self.evaluator.add_sys_da(self.user_agent.get_in_da_eval(), self.sys_agent.dst.state['belief_state'])
  File "/mnt/sda/cjm/ConvLab-3/convlab/evaluator/multiwoz_eval.py", line 192, in add_sys_da
    if not self.booked[domain] and re.match(r'^\d{8}$', value) and \
KeyError: 'booking'

Expected behavior: End2End tests adapted to ConvLab3 are needed.

Additional context Add any other context about the feature here.

ConvLab / ConvLab-3

[Maintenance] Outdated end2end evaluation script #186