Merge branch 'main' into fix-checkpoint-loading
maxreciprocate committed Jul 24, 2023
2 parents fb4ea44 + e36fe9d commit d9aa8d2
Showing 21 changed files with 1,369 additions and 432 deletions.
1 change: 0 additions & 1 deletion .github/workflows/build.yml
@@ -18,7 +18,6 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: 3.8
cache: 'pip'

- name: Install dependencies
run: |
26 changes: 21 additions & 5 deletions README.md
@@ -11,7 +11,7 @@ The following RL algorithms are currently implemented:

| Algorithm | Accelerate Trainer | NeMo Trainer |
|-------------------------------------------------------------------------------|:------------------:|:-------------:|
| [Proximal Policy Optimization (PPO)](https://arxiv.org/pdf/1909.08593.pdf)     |         ✅         |      ❌       |
| [Proximal Policy Optimization (PPO)](https://arxiv.org/pdf/1909.08593.pdf)     |         ✅         |      ✅       |
| [Implicit Language Q-Learning (ILQL)](https://sea-snell.github.io/ILQL_site/)  |         ✅         |      ✅       |

📖 **[Documentation](https://trlX.readthedocs.io)**
@@ -47,6 +47,8 @@ You can train a model using a reward function or a reward-labeled dataset.
trainer = trlx.train('gpt2', reward_fn=lambda samples, **kwargs: [sample.count('cats') for sample in samples])
```
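Beyond `samples`, trlx also passes the matching `prompts` and `outputs` (and, in the dense-reward example added in this commit, a `tokenizer`) to the reward function as keyword arguments. A minimal sketch of a reward function that scores only the generated continuation (the helper below is illustrative, not from the repo):

```python
from typing import List

def reward_fn(samples: List[str], prompts: List[str], outputs: List[str], **kwargs) -> List[float]:
    # Score only the generated continuation, not the prompt
    return [float(output.count('cats')) for output in outputs]

trainer = trlx.train('gpt2', reward_fn=reward_fn)
```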

For **reward model** training refer to our [autocrit](https://github.com/CarperAI/autocrit) library.

#### Using a reward-labeled dataset

```python
@@ -68,14 +70,28 @@ trainer.generate(**tokenizer('Q: Who rules the world? A:', return_tensors='pt'), do_sample=True)
#### Configure Hyperparameters

```python
from trlx.data.default_configs import default_ppo_config, TrainConfig
from trlx.data.default_configs import default_ppo_config

config = default_ppo_config()
config.model.model_path = 'EleutherAI/gpt-neox-20b'
config.train.seq_length = 32
config.train.batch_size = 16
config.tokenizer.tokenizer_path = 'EleutherAI/gpt-neox-20b'
config.train.seq_length = 2048

trainer = trlx.train(config=config, reward_fn=lambda samples, **kwargs: [float(int(sample)) for sample in samples])
trainer = trlx.train(config=config, reward_fn=lambda samples, **kwargs: [len(sample) for sample in samples])
```
To reduce memory usage (if you're running into CUDA out-of-memory errors), start with the lowest settings for the following hyperparameters and increase them gradually:
```python
# micro batch size per GPU
config.train.batch_size = 1
# freeze all transformer layers
config.model.num_layers_unfrozen = 0
# maximum sample length; prompts or samples longer than this will be truncated
config.train.seq_length = 128

# micro batch size for sampling (specific for PPO)
config.method.chunk_size = 1
# use an additional Q-head (specific for ILQL)
config.method.two_qs = False
```
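Putting the low-memory settings together, a sketch of a minimal PPO run (the model name and the reward function are placeholders):

```python
import trlx
from trlx.data.default_configs import default_ppo_config

config = default_ppo_config()
config.model.model_path = 'gpt2'              # placeholder model
config.tokenizer.tokenizer_path = 'gpt2'
config.train.batch_size = 1                   # micro batch size per GPU
config.train.seq_length = 128                 # truncate longer prompts/samples
config.model.num_layers_unfrozen = 0          # freeze all transformer layers
config.method.chunk_size = 1                  # micro batch size for PPO sampling

trainer = trlx.train(config=config, reward_fn=lambda samples, **kwargs: [len(sample) for sample in samples])
```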

#### Save the resulting model as a Hugging Face pretrained language model (ready to upload to the Hub!)
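(The code under this heading is collapsed in this diff view; as a rough sketch, and assuming the trainer exposes the Hugging Face-style `save_pretrained` method, the save step looks like:)

```python
trainer.save_pretrained('/path/to/output/folder/')
```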
75 changes: 75 additions & 0 deletions examples/ppo_dense_sentiments.py
@@ -0,0 +1,75 @@
# Tunes a pretrained model on the IMDB dataset with a dense (per-token) sentiment reward:
# reviews are rewarded for starting negative and turning positive
import json
import os
import sys
from typing import List

import torch
from datasets import load_dataset
from transformers import pipeline

import trlx
from trlx.data.default_configs import TRLConfig, default_ppo_config


def get_positive_score(scores):
"Extract value associated with a positive sentiment from pipeline's output"
return dict(map(lambda x: tuple(x.values()), scores))["POSITIVE"]


def get_negative_score(scores):
    "Extract value associated with a negative sentiment from pipeline's output"
    return dict(map(lambda x: tuple(x.values()), scores))["NEGATIVE"]


def main(hparams={}):
# Merge sweep config with default config if given
config = TRLConfig.update(default_ppo_config().to_dict(), hparams)

if torch.cuda.is_available():
device = int(os.environ.get("LOCAL_RANK", 0))
else:
device = -1

sentiment_fn = pipeline(
"sentiment-analysis",
"lvwerra/distilbert-imdb",
top_k=2,
truncation=True,
batch_size=256,
device=device,
)

def dense_reward_fn(samples: List[str], prompts: List[str], outputs: List[str], tokenizer, **kwargs) -> List[float]:
# Reward reviews that start negative and turn positive
# Reward functions should never receive padded text except for a single EOS at the end
# Reward function should return token rewards for just the response
first_halves = [".".join(sample.split(".")[: len(sample.split(".")) // 2]) for sample in samples]
negative_first_halves = list(map(get_negative_score, sentiment_fn(first_halves)))
second_halves = [".".join(sample.split(".")[len(sample.split(".")) // 2 :]) for sample in samples]
positive_second_halves = list(map(get_positive_score, sentiment_fn(second_halves)))
text_scores = [[f, s] for f, s in zip(negative_first_halves, positive_second_halves)]
tok_scores = []
for sample, prompt, response, text_score in zip(samples, prompts, outputs, text_scores):
toks = tokenizer(response).input_ids
tok_score = [0] * len(toks)
tok_score[len(tok_score) // 2] = text_score[0]
tok_score[-1] = text_score[1]
tok_scores.append(tok_score)
return tok_scores

# Take a few words from the movie reviews as prompts
imdb = load_dataset("imdb", split="train+test")
prompts = [" ".join(review.split()[:4]) for review in imdb["text"]]

trlx.train(
reward_fn=dense_reward_fn,
prompts=prompts,
eval_prompts=["I don't know much about Hungarian underground"] * 256,
config=config,
)


if __name__ == "__main__":
hparams = {} if len(sys.argv) == 1 else json.loads(sys.argv[1])
main(hparams)
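To make the shape of the dense reward concrete, here is a standalone illustration with made-up numbers: the reward vector has one entry per response token, all zeros except the middle token (which carries the negative-sentiment score of the first half) and the last token (which carries the positive-sentiment score of the second half).

```python
# Illustrative only: a 6-token response with text_score = [0.9, 0.8]
toks = ["The", "film", "starts", "badly", "but", "improves"]
tok_score = [0.0] * len(toks)
tok_score[len(tok_score) // 2] = 0.9  # negative score of the first half, placed at the middle token
tok_score[-1] = 0.8                   # positive score of the second half, placed at the last token
print(tok_score)                      # [0.0, 0.0, 0.0, 0.9, 0.0, 0.8]
```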
67 changes: 67 additions & 0 deletions examples/ppo_sentiments_peft.py
@@ -0,0 +1,67 @@
# Generates positive movie reviews by tuning a pretrained model on the IMDB dataset
# with a sentiment reward function, using a LoRA adapter (peft) for parameter-efficient fine-tuning
import json
import os
import sys
from typing import List

import torch
from datasets import load_dataset
from peft import LoraConfig
from peft.utils.config import TaskType
from transformers import pipeline

import trlx
from trlx.data.default_configs import TRLConfig, default_ppo_config


def get_positive_score(scores):
"Extract value associated with a positive sentiment from pipeline's output"
return dict(map(lambda x: tuple(x.values()), scores))["POSITIVE"]


def main(hparams={}):
# Merge sweep config with default config if given
config = TRLConfig.update(default_ppo_config().to_dict(), hparams)

if torch.cuda.is_available():
device = int(os.environ.get("LOCAL_RANK", 0))
else:
device = -1

sentiment_fn = pipeline(
"sentiment-analysis",
"lvwerra/distilbert-imdb",
top_k=2,
truncation=True,
batch_size=256,
device=device,
)

# Just insert your peft config here (the type must be an instance of peft.PeftConfig or a dict).
config.model.peft_config = LoraConfig(
r=8,
task_type=TaskType.CAUSAL_LM,
lora_alpha=32,
lora_dropout=0.1,
)

def reward_fn(samples: List[str], **kwargs) -> List[float]:
sentiments = list(map(get_positive_score, sentiment_fn(samples)))
return sentiments

# Take a few words from the movie reviews as prompts
imdb = load_dataset("imdb", split="train+test")
prompts = [" ".join(review.split()[:4]) for review in imdb["text"]]

trlx.train(
reward_fn=reward_fn,
prompts=prompts,
eval_prompts=["I don't know much about Hungarian underground"] * 256,
config=config,
)


if __name__ == "__main__":
hparams = {} if len(sys.argv) == 1 else json.loads(sys.argv[1])
main(hparams)
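Since the comment above notes that `peft_config` may also be a plain dict, an equivalent dict-based configuration would look roughly like the sketch below; the keys follow peft's `LoraConfig` fields, and how trlx parses the dict is an assumption here.

```python
config.model.peft_config = {
    "peft_type": "LORA",       # assumed discriminator key used by peft to select LoraConfig
    "task_type": "CAUSAL_LM",
    "r": 8,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
}
```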
4 changes: 2 additions & 2 deletions examples/randomwalks/ppo_randomwalks.py
@@ -59,12 +59,12 @@ def main(hparams={}):
trlx.train(
# An "optimality" reward function is used, with scores in [0,1]
# depending on how close the path is to the shortest possible path.
reward_fn=lambda samples, prompts, outputs: metric_fn(samples)["optimality"],
reward_fn=lambda samples, **kwargs: metric_fn(samples)["optimality"],
# The prompts are simply the first nodes (represented as letters) to
# start from.
prompts=prompts,
eval_prompts=prompts,
metric_fn=lambda samples, prompts, outputs: metric_fn(samples),
metric_fn=lambda samples, **kwargs: metric_fn(samples),
config=config,
)

1 change: 1 addition & 0 deletions requirements.txt
@@ -43,6 +43,7 @@ numpy==1.24.3
packaging==23.1
pandas==2.0.1
pathtools==0.1.2
peft==0.3.0
pkgutil_resolve_name==1.3.10
platformdirs==3.5.0
protobuf==4.22.3
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,7 +1,7 @@
[metadata]
name = trlx
author = Alex Havrilla
version = 0.6.0
version = 0.7.0
url = https://github.com/CarperAI/trlx
description = A repo for distributed training of language models with Reinforcement Learning via Human Feedback (RLHF)
long_description = file: README.md
