diff --git a/bigcode_eval/tasks/custom_metrics/multiple_metrics/containerized_eval.py b/bigcode_eval/tasks/custom_metrics/multiple_metrics/containerized_eval.py
index ab0ac8c1e..8df32e62a 100644
--- a/bigcode_eval/tasks/custom_metrics/multiple_metrics/containerized_eval.py
+++ b/bigcode_eval/tasks/custom_metrics/multiple_metrics/containerized_eval.py
@@ -36,6 +36,7 @@
     "jl": (eval_julia.eval_script, ".jl"),
     "ts": (eval_ts.eval_script, ".ts"),
     "go": (eval_go.eval_script, ".go"),
+    "go_test.go": (eval_go.eval_script, "_test.go"),
     "pl": (eval_pl.eval_script, ".pl"),
     "sh": (eval_sh.eval_script, ".sh"),
     "scala": (eval_scala.eval_script, ".scala"),
diff --git a/bigcode_eval/tasks/multiple.py b/bigcode_eval/tasks/multiple.py
index 7f514426d..ee43f526d 100644
--- a/bigcode_eval/tasks/multiple.py
+++ b/bigcode_eval/tasks/multiple.py
@@ -138,7 +138,7 @@ def process_results(self, generations, references):
         """
         # get prompts and problem names
         prompts_names = [
-            {"prompt": doc["prompt"], "name": doc["name"]}
+            {"prompt": doc["prompt"], "name": doc["name"], "lang": doc["language"]}
             for i, doc in enumerate(self.get_dataset())
             if i < len(generations)
         ]
@@ -150,7 +150,7 @@ def process_results(self, generations, references):
         ):
             problem = {
                 "name": prompt_name["name"],
-                "language": self.language,
+                "language": prompt_name["lang"],
                 "prompt": prompt_name["prompt"],
                 "completions": generation,
                 "tests": reference,
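
Note (not part of the diff): a minimal sketch of how the dispatch table above is typically consumed, to show why the new "go_test.go" key carries the "_test.go" suffix rather than a plain extension. The Go toolchain only treats files ending in _test.go as test files, so the completion must be written out under that suffix before the evaluator runs. The names EVAL_SCRIPT and eval_string_script are assumptions based on the surrounding code in containerized_eval.py, not verbatim from this diff.

    # Sketch, assuming the dict above is named EVAL_SCRIPT and that a helper
    # like eval_string_script(language, program) exists in containerized_eval.py.
    import tempfile

    def eval_string_script(language, program):
        # "go_test.go" -> (eval_go.eval_script, "_test.go"): Go only
        # recognizes *_test.go files as tests, hence the dedicated suffix.
        eval_fn, file_ext = EVAL_SCRIPT[language]
        with tempfile.NamedTemporaryFile(suffix=file_ext, delete=True) as f:
            f.write(program.encode("utf-8"))
            f.flush()
            return eval_fn(f.name)  # run the language-specific evaluator on the file

The multiple.py change follows the same idea from the other side: by carrying doc["language"] through as prompt_name["lang"], each problem dict records its own language, so a single task instance can score a dataset that mixes languages instead of assuming one self.language for every problem.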