
Add batch evaluation support when batch_size > 1 #36

Open · wants to merge 13 commits into base: main
3 changes: 2 additions & 1 deletion README.md
@@ -68,14 +68,15 @@ accelerate launch main.py \
--temperature <TEMPERATURE> \
--do_sample True \
--n_samples 100 \
--num_return_sequences 20 \
--batch_size 10 \
--allow_code_execution=False
```
* `limit` represents the number of problems to solve, if it's not provided all problems in the benchmark are selected.
* `allow_code_execution` is for executing the generated code: read the displayed warning before setting it to `True`.

Some tasks don't require code execution such as
`codexglue_code_to_text-<LANGUAGE>`/`codexglue_code_to_text-python-left`/`conala`/`concode` that use BLEU evaluation. In addition, we generate one candidate solution for each problem in these tasks, so use `n_samples=1` and `batch_size=1`. (Note that `batch_size` should always be equal or less than `n_samples`).
`codexglue_code_to_text-<LANGUAGE>`/`codexglue_code_to_text-python-left`/`conala`/`concode` that use BLEU evaluation. In addition, we generate one candidate solution for each problem in these tasks, so use `n_samples=1` and `num_return_sequences=1`. (Note that `num_return_sequences` should always be less than or equal to `n_samples`.)
* For APPS tasks, you can use `n_samples=1` for strict and average accuracies (from the original APPS paper) and `n_samples>1` for pass@k.

### Generation only
4 changes: 3 additions & 1 deletion docs/README.md
@@ -43,6 +43,7 @@ accelerate launch main.py \
--tasks humaneval \
--temperature 0.2 \
--n_samples 200 \
--num_return_sequences 20 \
--batch_size 10 \
--allow_code_execution=False
```
@@ -70,6 +71,7 @@ accelerate launch main.py \
--tasks mbpp \
--temperature 0.1 \
--n_samples 15 \
--num_return_sequences 15 \
--batch_size 10 \
--allow_code_execution=False \
```
@@ -139,7 +141,7 @@ accelerate launch main.py \
--tasks apps-introductory \
--n_samples 1 \
--temperature 0.1 \
--batch_size 1 \
--batch_size 5 \
--allow_code_execution=False
```
We expect a model [finetuned](https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main/finetuning/APPS) on the train split of APPS.
6 changes: 6 additions & 0 deletions lm_eval/arguments.py
@@ -41,3 +41,9 @@ class EvalArguments:
seed: Optional[int] = field(
default=0, metadata={"help": "Random seed used for evaluation."}
)
num_return_sequences: Optional[int] = field(
default=1,
metadata={
"help":"The number of independently computed return sequences for each element in the batch"
}
)
Comment on lines +44 to +49

Contributor:
Why do we need this argument in addition to n_samples? Aren't they kind of the same?

Collaborator (Author):

The n_samples argument captures the overall number of samples to be generated per prompt/task, while num_return_sequences is the number of samples generated in one single pass.

There can be scenarios where n_samples > num_return_sequences, for example when n_samples completions do not fit in memory. In that case, the task/prompt is repeated (multiple passes) to reach the overall n_samples (as implemented here).

For example, to calculate pass@100 I might need n_samples to be 100, and due to memory limits I can set num_return_sequences to 10, so the task is repeated 10 times to reach the n_samples count of 100.
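To make the arithmetic concrete, here is a minimal sketch of the relationship described above (plain Python; variable names are illustrative rather than copied from the harness):

```python
# Each prompt is queued n_copies times; every pass through generate() yields
# num_return_sequences completions, so the totals line up with n_samples.
n_samples = 100            # completions wanted per prompt, e.g. for pass@100
num_return_sequences = 10  # completions that fit in memory per pass

n_copies = n_samples // num_return_sequences  # 10 passes over the same prompt
assert n_copies * num_return_sequences == n_samples
```

This mirrors the `n_copies = args.n_samples // args.num_return_sequences` line in the `lm_eval/generation.py` diff below.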

Contributor:

But shouldn't the batch_size be responsible for handling the memory limitations? Can't we use it to infer num_return_sequences?

IIURC it means that n_samples=16 batch_size=16 num_return_sequences=1 is the same as n_samples=16 batch_size=1 num_return_sequences=16, right?

Collaborator (Author):

I agree that both settings are the same for the case you have shown, but I am not quite getting how we can infer num_return_sequences from batch_size. Can you please explain? Thanks

Contributor:

IIURC batch_size is used to pick batch_size new items, so I think sth like:

if batch_size < n_samples:
    # Memory requirement will be the same as batch_size, but we only pick 1 new item (i.e. batch_size=1)
    num_return_sequences = batch_size
    batch_size = 1
else:
    # If n_samples == 1, just pick batch_size new items; if n_samples > 1, somewhere in-between
    num_return_sequences = n_samples
    # Round down, such that we always have <= batch_size items in one go
    batch_size = batch_size // num_return_sequences
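
For what it's worth, a hedged sketch of the same inference wrapped into a helper (the function name and return shape are illustrative, not an existing API in the harness):

```python
def infer_generation_settings(n_samples: int, batch_size: int) -> dict:
    """Sketch of deriving num_return_sequences from batch_size as proposed above."""
    if batch_size < n_samples:
        # Memory stays at batch_size sequences, but only one new prompt per step.
        return {"num_return_sequences": batch_size, "batch_size": 1}
    # Otherwise each prompt gets all n_samples in one pass; round down so the
    # effective number of sequences per step never exceeds batch_size.
    return {"num_return_sequences": n_samples, "batch_size": batch_size // n_samples}

# A couple of concrete inputs:
print(infer_generation_settings(n_samples=100, batch_size=10))  # {'num_return_sequences': 10, 'batch_size': 1}
print(infer_generation_settings(n_samples=1, batch_size=10))    # {'num_return_sequences': 1, 'batch_size': 10}
```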

6 changes: 3 additions & 3 deletions lm_eval/generation.py
@@ -62,7 +62,7 @@ def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks,

if accelerator.is_main_process:
print(f"number of problems for this task is {n_tasks}")
n_copies = args.n_samples // args.batch_size
n_copies = args.n_samples // args.num_return_sequences

ds_tokenized = TokenizedDataset(
task,
@@ -76,7 +76,7 @@ def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks,
)

# do not confuse args.batch_size, which is actually the num_return_sequences
ds_loader = DataLoader(ds_tokenized, batch_size=1)
ds_loader = DataLoader(ds_tokenized, batch_size=args.batch_size)

model, ds_loader = accelerator.prepare(model, ds_loader)
generations = complete_code(
@@ -86,7 +86,7 @@ def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks,
tokenizer,
ds_loader,
n_tasks=n_tasks,
batch_size=args.batch_size,
num_return_sequences=args.num_return_sequences,
prefix=args.prefix,
postprocess=args.postprocess,
**gen_kwargs,
19 changes: 13 additions & 6 deletions lm_eval/utils.py
@@ -49,7 +49,7 @@ def __iter__(self):
if self.n_copies == 1 and self.n_tasks % self.num_devices != 0:
self.n_copies = 2
warnings.warn(
"n_copies (n_samples/batch_size) was changed from 1 to 2 because n_tasks isn't proportional to num devices"
"n_copies (n_samples/num_return_sequences) was changed from 1 to 2 because n_tasks isn't proportional to num devices"
)

for sample in range(self.n_tasks):
@@ -58,6 +58,7 @@ def __iter__(self):
"ids": outputs.input_ids[sample],
"task_id": sample,
"input_len": outputs.attention_mask[sample].sum(),
"attention_mask": outputs.attention_mask[sample],
}


@@ -68,7 +69,7 @@ def complete_code(
tokenizer,
dataloader,
n_tasks,
batch_size=20,
num_return_sequences=20,
prefix="",
postprocess=True,
**gen_kwargs,
@@ -84,13 +85,19 @@
with torch.no_grad():
if task.stop_words:
gen_kwargs["stopping_criteria"][0].start_length = batch["ids"].shape[-1]

if batch["ids"].shape[0]==1:
batch["ids"] = batch["ids"][:,:batch["input_len"]]
batch["attention_mask"] = batch["attention_mask"][:,:batch["input_len"]]

generated_tokens = accelerator.unwrap_model(model).generate(
input_ids=batch["ids"][:, : batch["input_len"]],
num_return_sequences=batch_size,
input_ids=batch["ids"],
attention_mask=batch["attention_mask"],
num_return_sequences=num_return_sequences,
**gen_kwargs,
)
# each task is generated batch_size times
generated_tasks = batch["task_id"].repeat(batch_size)
# each task is generated num_return_sequences times
generated_tasks = batch["task_id"].repeat(num_return_sequences)
generated_tokens = accelerator.pad_across_processes(
generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
)
9 changes: 9 additions & 0 deletions main.py
@@ -151,6 +151,15 @@ def main():
print("bos_token used as eos_token")
else:
raise ValueError("No eos_token or bos_token found")

if args.n_samples < args.num_return_sequences:
raise ValueError("n_samples should always be equal or greater than num_return_sequences ")

# When padding_side = "right", padding tokens are considered during decoding,
# so we set it to "left" to ignore padding tokens while decoding, as per
# https://github.com/huggingface/transformers/pull/7552
if args.batch_size > 1:
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
evaluator = Evaluator(accelerator, model, tokenizer, args)

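The padding change above follows huggingface/transformers#7552. As a standalone illustration (not part of this PR; the checkpoint name is only an example), left padding keeps the prompt flush against the newly generated tokens for decoder-only models:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Decoder-only models continue from the last input token, so pad on the left
# and pass the attention mask so pad tokens are ignored during generation.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

prompts = ["def add(a, b):", "def fibonacci(n):"]
batch = tokenizer(prompts, return_tensors="pt", padding=True)

out = model.generate(
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
    max_new_tokens=32,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.batch_decode(out, skip_special_tokens=True))
```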
22 changes: 22 additions & 0 deletions tests/test_generation_evaluation.py
@@ -86,3 +86,25 @@ def test_evaluation():
results = evaluator.evaluate(task)
assert results == {"pass@1": 0.25}
print("passed eval")

def test_multi_batch_generation():
args.n_samples = 1
args.batch_size = 2
args.limit = 2
args.do_sample = False
args.generation_only = True
args.generations_path = None
# Increasing the max_length to accommodate pad tokens
# in the final generation
args.max_length_generation=356
tokenizer.padding_side = "left"
evaluator = Evaluator(accelerator, model, tokenizer, args)
for task in TASKS:
print(f"testing task {task}")
generations, references = evaluator.generate_text(task)
true_gens, true_refs = load_generation_examples(task)
# capping the generation to the max length of true gens
for idx,tg in enumerate(true_gens):
generations[idx][0] = generations[idx][0][:len(tg[0])]
assert generations == true_gens
assert references == true_refs