Output MoE merges as single dtype
cg123 committed Dec 30, 2023
1 parent 503e740 commit 58d9b97
Showing 2 changed files with 22 additions and 5 deletions.
26 changes: 21 additions & 5 deletions mergekit/scripts/mixtral_moe.py
@@ -17,7 +17,7 @@
 from typing_extensions import Annotated
 
 import mergekit.architecture
-from mergekit.common import ModelReference
+from mergekit.common import ModelReference, dtype_from_name
 from mergekit.io import LazyTensorLoader, TensorWriter
 
 # Create a Mixtral MoE from a set of equally-sized Mistral (or Llama) models.
@@ -44,6 +44,7 @@ class MistralMOEConfig(BaseModel):
     # "hidden" uses hidden state vectors for the given prompts for each layer
     # "cheap_embed" uses the average of token embeddings for the prompts, same for each layer
     # "random" is random
+    dtype: Optional[str] = None
 
 
 def get_hidden_states(
@@ -184,10 +185,21 @@ def build(
     base_loader = loaders.get(base_model)
     writer = TensorWriter(out_path=out_path)
 
+    if config.dtype:
+        out_dtype = dtype_from_name(config.dtype)
+    elif base_cfg.torch_dtype:
+        out_dtype = dtype_from_name(base_cfg.torch_dtype)
+    else:
+        out_dtype = None
+
     print("Copying parameters...")
     MISTRAL_INFO = mergekit.architecture.MISTRAL_INFO
     for tensor_name in MISTRAL_INFO.pre_weight_names + MISTRAL_INFO.post_weight_names:
-        writer.save_tensor(tensor_name, base_loader.get_tensor(tensor_name))
+        tensor = base_loader.get_tensor(tensor_name)
+        if not out_dtype:
+            # All else has failed, take the first dtype we see
+            out_dtype = tensor.dtype
+        writer.save_tensor(tensor_name, tensor.to(dtype=out_dtype))
 
     for name_format in tqdm.tqdm(MISTRAL_INFO.layer_weight_formats()):
         for layer_idx in range(base_cfg.num_hidden_layers):
@@ -208,9 +220,13 @@ def build(
                     tensor = expert_loader.get_tensor(tensor_name)
                     if expert.noise_scale:
                         tensor += torch.randn_like(tensor) * expert.noise_scale
-                    writer.save_tensor(expert_name, tensor, clone=True)
+                    writer.save_tensor(
+                        expert_name, tensor.to(dtype=out_dtype), clone=True
+                    )
                 continue
-            writer.save_tensor(tensor_name, base_loader.get_tensor(tensor_name))
+            writer.save_tensor(
+                tensor_name, base_loader.get_tensor(tensor_name).to(dtype=out_dtype)
+            )
 
     tokenizer = transformers.AutoTokenizer.from_pretrained(base_model.path)
     tokenizer.padding_side = "left"
@@ -232,7 +248,7 @@ def build(
     for layer_idx in range(base_cfg.num_hidden_layers):
         writer.save_tensor(
             f"model.layers.{layer_idx}.block_sparse_moe.gate.weight",
-            gate_vecs[layer_idx, :, :].contiguous(),
+            gate_vecs[layer_idx, :, :].contiguous().to(dtype=out_dtype),
         )
     writer.finalize()
     print("Saving tokenizer...")
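
The fallback order this change introduces is: an explicit `dtype` in the merge config, then the base model's `torch_dtype`, then the dtype of the first tensor copied from the base model. Below is a minimal standalone sketch of that cascade; the `dtype_from_name` stand-in is an assumption about `mergekit.common.dtype_from_name`, treated here as a simple name-to-`torch.dtype` lookup.

```python
# Sketch of the dtype-resolution cascade; `dtype_from_name` is a local
# stand-in, not mergekit's actual helper.
from typing import Optional

import torch


def dtype_from_name(name: str) -> torch.dtype:
    return {
        "float32": torch.float32,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
    }[name]


def resolve_out_dtype(
    config_dtype: Optional[str],
    base_torch_dtype: Optional[str],
    first_tensor: torch.Tensor,
) -> torch.dtype:
    if config_dtype:  # 1. explicit `dtype:` in the merge config
        return dtype_from_name(config_dtype)
    if base_torch_dtype:  # 2. the base model's configured torch_dtype
        return dtype_from_name(base_torch_dtype)
    return first_tensor.dtype  # 3. fall back to the first tensor copied


print(resolve_out_dtype(None, "float16", torch.zeros(1)))  # torch.float16
```
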
1 change: 1 addition & 0 deletions moe.md
@@ -5,6 +5,7 @@
 ```yml
 base_model: path/to/self_attn_donor
 gate_mode: hidden # one of "hidden", "cheap_embed", or "random"
+dtype: bfloat16 # output dtype (float32, float16, or bfloat16)
 experts:
   - source_model: expert_model_1
     positive_prompts:
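
With `dtype: bfloat16` in the config, every tensor the script writes should land in that dtype. A quick post-merge sanity check, assuming the output was written as safetensors shards into a hypothetical `./merged-moe` directory:

```python
# Sketch: confirm all tensors in the merged output share one dtype.
# "./merged-moe" is a hypothetical output path, not part of this commit.
import glob

from safetensors import safe_open

dtypes = set()
for shard in glob.glob("./merged-moe/*.safetensors"):
    with safe_open(shard, framework="pt") as f:
        for name in f.keys():
            dtypes.add(f.get_tensor(name).dtype)

print(dtypes)  # expect a single entry, e.g. {torch.bfloat16}
```
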
