stanfordnlp / dspy

DSPy: The framework for programming—not prompting—foundation models
https://dspy-docs.vercel.app/
MIT License
16.87k stars 1.3k forks

[Suggestion] Consider enhancing the Evaluate class to support callable objects as metrics #429

Open lzjever opened 7 months ago

lzjever commented 7 months ago

Could we change the relevant line of code in the Evaluate class to the version below:


        # Rename the 'correct' column to the name of the metric function
        # metric_name = metric.__name__
        assert callable(metric)
        metric_name = metric.__name__ if isinstance(metric, types.FunctionType) else metric.__class__.__name__

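For what it's worth, a slightly more general variant (just a sketch, not necessarily the exact lines the patch should use) could fall back to the class name whenever the callable has no __name__ attribute, which also covers functools.partial objects in addition to callable instances:


        assert callable(metric), "metric must be callable"
        # Functions and lambdas carry __name__; callable instances and functools.partial objects do not.
        metric_name = getattr(metric, "__name__", None) or metric.__class__.__name__
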
With that change in place, we could write code like this:


import re
import json5

from dspy.evaluate import Evaluate


class APISelectorAssessor:
    def __init__(self, tolerate_num_FP=4):
        self.error_preds = []
        self.tolerate_num_FP = tolerate_num_FP

    def save_error_preds(self, fn='error_preds.json'):
        # Dump the mispredicted examples for later inspection.
        with open(fn, 'w', encoding='utf-8') as f:
            json5.dump(self.error_preds, f, indent=4, ensure_ascii=False)

    def __call__(self, example, pred, trace=None, frac=1.0):
        def extract_list_from_string(s):
            match = re.search(r"\[([^\]]*)\]", s)
            if match:
                list_str = match.group(1)
                list_str = '[%s]' % list_str
                return list_str
            else:
                # No bracketed list found; return an empty JSON list string so json5 can still parse it.
                return '[]'
        try:
            clean_list_text = extract_list_from_string(pred.selected_api_list)
            pred_selected_api_list = json5.loads(clean_list_text)

            if example.selected_api in pred_selected_api_list and (len(pred_selected_api_list) <= (self.tolerate_num_FP + 1)):
                rv = 1  # 1 - (len(pred_selected_api_list) - 1) / 5
                return rv
            else:
                self.error_preds.append({'question': example.question, 'selected_api': example.selected_api, 'pred': str(pred)})
                return 0.0
        except Exception as e:
            print(e)
            self.error_preds.append({'question': example.question, 'selected_api': example.selected_api, 'pred': str(pred)})
            return 0.0

assessor = APISelectorAssessor()
evaluate_api_selector = Evaluate(devset=dataset_api_calling_gt_sample[13:18], metric=assessor, num_threads=1, display_progress=True, display_table=0)
evaluate_api_selector(compiled_prompt_opt)
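
In the meantime, a workaround that should already work with the current Evaluate, assuming the only blocker is the metric.__name__ lookup (a sketch; the wrapper name below is made up), is to hide the callable instance behind a plain function so that __name__ exists:


def api_selector_metric(*args, **kwargs):
    # Thin wrapper whose only job is to give the metric a __name__;
    # it forwards everything to the stateful assessor defined above.
    return assessor(*args, **kwargs)

evaluate_api_selector = Evaluate(devset=dataset_api_calling_gt_sample[13:18], metric=api_selector_metric, num_threads=1, display_progress=True, display_table=0)
evaluate_api_selector(compiled_prompt_opt)
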
okhat commented 6 months ago

@lzjever Yes, 100%. Do you want to make this PR? If it's just changing 4-5 lines, I'll merge it quickly.