Speculative decoding using hf #3

Open · wants to merge 3 commits into base: speculative-decoding
9 changes: 3 additions & 6 deletions server/text_generation_server/models/paged_causal_lm.py
@@ -321,12 +321,10 @@ def __init__(
         from fms_extras.utils.cache.paged import PagedKVCacheManager

         if SPECULATOR_PATH is not None:
-            from fms_extras.models.speculator import MLPSpeculator
+            from fms_extras.models.hf.modeling_mlp_speculator import MLPSpeculatorPreTrainedModel
             print(f"Speculation will be enabled up to batch size {SPECULATOR_MAX_BATCH_SIZE}")
-            self.speculator = MLPSpeculator(model_config.hidden_size, vocab_size=model_config.vocab_size, n_predict=3).to(device=self.device, dtype=dtype)
-            self.speculator.load_state_dict(
-                torch.load(SPECULATOR_PATH, map_location=self.device)["model_state"]
-            )
+            self.speculator = MLPSpeculatorPreTrainedModel.from_pretrained(SPECULATOR_PATH)
+            self.speculator.to(device=self.device, dtype=dtype)
         else:
             self.speculator = None

@@ -340,7 +338,6 @@ def __init__(
             device=self.device,
         )
-

     @property
     def batch_type(self) -> Type[PagedCausalLMBatch]:
         return self._batch_type
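For context, a minimal standalone sketch of the new loading path, not part of the diff itself: the helper name load_speculator is hypothetical, and speculator_path, device, and dtype stand in for SPECULATOR_PATH, self.device, and dtype from the hunk above. The point of the change is that from_pretrained reads the speculator's own config from the checkpoint directory, so hidden size, vocab size, and n_predict no longer have to be hard-coded at the call site.

# Illustration only (not code from this PR): HF-style speculator loading.
from fms_extras.models.hf.modeling_mlp_speculator import MLPSpeculatorPreTrainedModel


def load_speculator(speculator_path, device, dtype):
    # from_pretrained picks up the speculator config stored alongside the weights,
    # so model dimensions and n_predict come from the checkpoint, not from arguments.
    speculator = MLPSpeculatorPreTrainedModel.from_pretrained(speculator_path)
    return speculator.to(device=device, dtype=dtype)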
4 changes: 2 additions & 2 deletions server/text_generation_server/utils/paged.py
@@ -144,8 +144,8 @@ def prepare_inputs_with_speculation(
     n_adds = speculator.n_predict + 1

     #hard-code some values
-    top_k = 5
-    threshes=[5, 3, 2]
+    top_k = speculator.config.n_candidates
+    threshes = speculator.config.top_k_tokens_per_head
     flatting=True

     # create candidate sequences
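With this hunk, the shape of the candidate tree is read from the speculator checkpoint instead of being fixed in the server code. A hedged sketch of where the values now come from, assuming (as the diff does) that the speculator config exposes n_candidates and top_k_tokens_per_head; the helper speculation_params is hypothetical:

# Illustration only (not code from this PR): speculation parameters sourced from the config.
def speculation_params(speculator):
    n_adds = speculator.n_predict + 1                    # tokens produced per step (speculated tokens plus one)
    top_k = speculator.config.n_candidates               # was hard-coded as top_k = 5
    threshes = speculator.config.top_k_tokens_per_head   # was hard-coded as threshes = [5, 3, 2]
    return n_adds, top_k, threshes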