Merge branch 'main' into fix-checkpoint-loading
maxreciprocate committed Jul 24, 2023
2 parents fb4ea44 + e36fe9d commit d9aa8d2
Showing 21 changed files with 1,369 additions and 432 deletions.
1 change: 0 additions & 1 deletion .github/workflows/build.yml
@@ -18,7 +18,6 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: 3.8
cache: 'pip'

- name: Install dependencies
run: |
26 changes: 21 additions & 5 deletions README.md
@@ -11,7 +11,7 @@ The following RL algorithms are currently implemented:

| Algorithm | Accelerate Trainer | NeMo Trainer |
|-------------------------------------------------------------------------------|:------------------:|:-------------:|
| [Proximal Policy Optimization (PPO)](https://arxiv.org/pdf/1909.08593.pdf)     |         ✅         |      ❌       |
| [Proximal Policy Optimization (PPO)](https://arxiv.org/pdf/1909.08593.pdf)     |         ✅         |      ✅       |
| [Implicit Language Q-Learning (ILQL)](https://sea-snell.github.io/ILQL_site/)  |         ✅         |      ✅       |

📖 **[Documentation](https://trlX.readthedocs.io)**
@@ -47,6 +47,8 @@ You can train a model using a reward function or a reward-labeled dataset.
trainer = trlx.train('gpt2', reward_fn=lambda samples, **kwargs: [sample.count('cats') for sample in samples])
```
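Beyond `samples`, trlx also passes the matching `prompts` and `outputs` (and, in the dense-reward example added in this commit, a `tokenizer`) to the reward function as keyword arguments. A minimal sketch of a reward function that scores only the generated continuation (the helper below is illustrative, not from the repo):

```python
from typing import List

def reward_fn(samples: List[str], prompts: List[str], outputs: List[str], **kwargs) -> List[float]:
    # Score only the generated continuation, not the prompt
    return [float(output.count('cats')) for output in outputs]

trainer = trlx.train('gpt2', reward_fn=reward_fn)
```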

For **reward model** training refer to our [autocrit](https://github.com/CarperAI/autocrit) library.

#### Using a reward-labeled dataset

```python
@@ -68,14 +70,28 @@ trainer.generate(**tokenizer('Q: Who rules the world? A:', return_tensors='pt'), do_sample=True)
#### Configure Hyperparameters

```python
from trlx.data.default_configs import default_ppo_config, TrainConfig
from trlx.data.default_configs import default_ppo_config

config = default_ppo_config()
config.model.model_path = 'EleutherAI/gpt-neox-20b'
config.train.seq_length = 32
config.train.batch_size = 16
config.tokenizer.tokenizer_path = 'EleutherAI/gpt-neox-20b'
config.train.seq_length = 2048

trainer = trlx.train(config=config, reward_fn=lambda samples, **kwargs: [float(int(sample)) for sample in samples])
trainer = trlx.train(config=config, reward_fn=lambda samples, **kwargs: [len(sample) for sample in samples])
```
To reduce memory usage (if you're running into CUDA out-of-memory errors), start with the lowest settings for the following hyperparameters and increase them gradually:
```python
# micro batch size per GPU
config.train.batch_size = 1
# freeze all transformer layers
config.model.num_layers_unfrozen = 0
# maximum sample length; prompts or samples longer than this will be truncated
config.train.seq_length = 128

# micro batch size for sampling (specific for PPO)
config.method.chunk_size = 1
# use an additional Q-head (specific for ILQL)
config.method.two_qs = False
```
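Putting the low-memory settings together, a sketch of a minimal PPO run (the model name and the reward function are placeholders):

```python
import trlx
from trlx.data.default_configs import default_ppo_config

config = default_ppo_config()
config.model.model_path = 'gpt2'              # placeholder model
config.tokenizer.tokenizer_path = 'gpt2'
config.train.batch_size = 1                   # micro batch size per GPU
config.train.seq_length = 128                 # truncate longer prompts/samples
config.model.num_layers_unfrozen = 0          # freeze all transformer layers
config.method.chunk_size = 1                  # micro batch size for PPO sampling

trainer = trlx.train(config=config, reward_fn=lambda samples, **kwargs: [len(sample) for sample in samples])
```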

#### Save the resulting model as a Hugging Face pretrained language model (ready to upload to the Hub!)
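(The code under this heading is collapsed in this diff view; as a rough sketch, and assuming the trainer exposes the Hugging Face-style `save_pretrained` method, the save step looks like:)

```python
trainer.save_pretrained('/path/to/output/folder/')
```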
75 changes: 75 additions & 0 deletions examples/ppo_dense_sentiments.py
@@ -0,0 +1,75 @@
# Tunes a pretrained model on the IMDB dataset with a dense (per-token) sentiment reward:
# reviews are rewarded for starting negative and turning positive
import json
import os
import sys
from typing import List

import torch
from datasets import load_dataset
from transformers import pipeline

import trlx
from trlx.data.default_configs import TRLConfig, default_ppo_config


def get_positive_score(scores):
"Extract value associated with a positive sentiment from pipeline's output"
return dict(map(lambda x: tuple(x.values()), scores))["POSITIVE"]


def get_negative_score(scores):
    "Extract value associated with a negative sentiment from pipeline's output"
    return dict(map(lambda x: tuple(x.values()), scores))["NEGATIVE"]


def main(hparams={}):
# Merge sweep config with default config if given
config = TRLConfig.update(default_ppo_config().to_dict(), hparams)

if torch.cuda.is_available():
device = int(os.environ.get("LOCAL_RANK", 0))
else:
device = -1

sentiment_fn = pipeline(
"sentiment-analysis",
"lvwerra/distilbert-imdb",
top_k=2,
truncation=True,
batch_size=256,
device=device,
)

def dense_reward_fn(samples: List[str], prompts: List[str], outputs: List[str], tokenizer, **kwargs) -> List[float]:
# Reward reviews that start negative and turn positive
# Reward functions should never receive padded text except for a single EOS at the end
# Reward function should return token rewards for just the response
first_halves = [".".join(sample.split(".")[: len(sample.split(".")) // 2]) for sample in samples]
negative_first_halves = list(map(get_negative_score, sentiment_fn(first_halves)))
second_halves = [".".join(sample.split(".")[len(sample.split(".")) // 2 :]) for sample in samples]
positive_second_halves = list(map(get_positive_score, sentiment_fn(second_halves)))
text_scores = [[f, s] for f, s in zip(negative_first_halves, positive_second_halves)]
tok_scores = []
for sample, prompt, response, text_score in zip(samples, prompts, outputs, text_scores):
toks = tokenizer(response).input_ids
tok_score = [0] * len(toks)
tok_score[len(tok_score) // 2] = text_score[0]
tok_score[-1] = text_score[1]
tok_scores.append(tok_score)
return tok_scores

# Take a few words from the movie reviews as prompts
imdb = load_dataset("imdb", split="train+test")
prompts = [" ".join(review.split()[:4]) for review in imdb["text"]]

trlx.train(
reward_fn=dense_reward_fn,
prompts=prompts,
eval_prompts=["I don't know much about Hungarian underground"] * 256,
config=config,
)


if __name__ == "__main__":
hparams = {} if len(sys.argv) == 1 else json.loads(sys.argv[1])
main(hparams)
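To make the shape of the dense reward concrete, here is a standalone illustration with made-up numbers: the reward vector has one entry per response token, all zeros except the middle token (which carries the negative-sentiment score of the first half) and the last token (which carries the positive-sentiment score of the second half).

```python
# Illustrative only: a 6-token response with text_score = [0.9, 0.8]
toks = ["The", "film", "starts", "badly", "but", "improves"]
tok_score = [0.0] * len(toks)
tok_score[len(tok_score) // 2] = 0.9  # negative score of the first half, placed at the middle token
tok_score[-1] = 0.8                   # positive score of the second half, placed at the last token
print(tok_score)                      # [0.0, 0.0, 0.0, 0.9, 0.0, 0.8]
```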
67 changes: 67 additions & 0 deletions examples/ppo_sentiments_peft.py
@@ -0,0 +1,67 @@
# Generates positive movie reviews by tuning a pretrained model on the IMDB dataset
# with a sentiment reward function, using a LoRA adapter (peft) for parameter-efficient fine-tuning
import json
import os
import sys
from typing import List

import torch
from datasets import load_dataset
from peft import LoraConfig
from peft.utils.config import TaskType
from transformers import pipeline

import trlx
from trlx.data.default_configs import TRLConfig, default_ppo_config


def get_positive_score(scores):
"Extract value associated with a positive sentiment from pipeline's output"
return dict(map(lambda x: tuple(x.values()), scores))["POSITIVE"]


def main(hparams={}):
# Merge sweep config with default config if given
config = TRLConfig.update(default_ppo_config().to_dict(), hparams)

if torch.cuda.is_available():
device = int(os.environ.get("LOCAL_RANK", 0))
else:
device = -1

sentiment_fn = pipeline(
"sentiment-analysis",
"lvwerra/distilbert-imdb",
top_k=2,
truncation=True,
batch_size=256,
device=device,
)

# Just insert your peft config here (the type must be an instance of peft.PeftConfig or a dict).
config.model.peft_config = LoraConfig(
r=8,
task_type=TaskType.CAUSAL_LM,
lora_alpha=32,
lora_dropout=0.1,
)

def reward_fn(samples: List[str], **kwargs) -> List[float]:
sentiments = list(map(get_positive_score, sentiment_fn(samples)))
return sentiments

# Take a few words from the movie reviews as prompts
imdb = load_dataset("imdb", split="train+test")
prompts = [" ".join(review.split()[:4]) for review in imdb["text"]]

trlx.train(
reward_fn=reward_fn,
prompts=prompts,
eval_prompts=["I don't know much about Hungarian underground"] * 256,
config=config,
)


if __name__ == "__main__":
hparams = {} if len(sys.argv) == 1 else json.loads(sys.argv[1])
main(hparams)
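Since the comment above notes that `peft_config` may also be a plain dict, an equivalent dict-based configuration would look roughly like the sketch below; the keys follow peft's `LoraConfig` fields, and how trlx parses the dict is an assumption here.

```python
config.model.peft_config = {
    "peft_type": "LORA",       # assumed discriminator key used by peft to select LoraConfig
    "task_type": "CAUSAL_LM",
    "r": 8,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
}
```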
4 changes: 2 additions & 2 deletions examples/randomwalks/ppo_randomwalks.py
@@ -59,12 +59,12 @@ def main(hparams={}):
trlx.train(
# An "optimality" reward function is used, with scores in [0,1]
# depending on how close the path is to the shortest possible path.
reward_fn=lambda samples, prompts, outputs: metric_fn(samples)["optimality"],
reward_fn=lambda samples, **kwargs: metric_fn(samples)["optimality"],
# The prompts are simply the first nodes (represented as letters) to
# start from.
prompts=prompts,
eval_prompts=prompts,
metric_fn=lambda samples, prompts, outputs: metric_fn(samples),
metric_fn=lambda samples, **kwargs: metric_fn(samples),
config=config,
)

1 change: 1 addition & 0 deletions requirements.txt
@@ -43,6 +43,7 @@ numpy==1.24.3
packaging==23.1
pandas==2.0.1
pathtools==0.1.2
peft==0.3.0
pkgutil_resolve_name==1.3.10
platformdirs==3.5.0
protobuf==4.22.3
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,7 +1,7 @@
[metadata]
name = trlx
author = Alex Havrilla
version = 0.6.0
version = 0.7.0
url = https://github.com/CarperAI/trlx
description = A repo for distributed training of language models with Reinforcement Learning via Human Feedback (RLHF)
long_description = file: README.md
