microsoft · pclucas14 · Aug 29, 2024 · Aug 29, 2024 · Aug 29, 2024 · Aug 30, 2024
diff --git a/mttl/models/containers/selectors/poly_selector.py b/mttl/models/containers/selectors/poly_selector.py
@@ -256,3 +256,36 @@ def on_add_expert(
             for name in self.module_logits_dict.keys():
                 self.module_logits_dict[name].data = torch.ones(1).to(self.device)
                 self.module_logits_dict[name].data /= len(self.module_logits_dict)
+
+@dataclass
+class VectorSelectorConfig(SelectorConfig):  # config class registered for "vector_router"
+    task_names: List[str] = None  # default None; not read in VectorSelector's visible methods (TODO confirm use)
+
+
+@Selector.register("vector_router", VectorSelectorConfig)
+class VectorSelector(Selector):  # selector whose weights are supplied by the caller
+    """
+    User can specify a distribution over the skills.
+    """
+
+    def __init__(self, *args, **kwargs):  # arguments are forwarded unchanged to the parent
+        super().__init__(*args, **kwargs)
+        # `dummy` is only read in forward() (via `.device`) to pick the target device.
+        self.register_buffer("dummy", torch.ones(1))
+
+    @forward_with_cache
+    def forward(self, input, **kwargs) -> ExpertsSplitsAndWeightsSelectorOutput:
+        # `input` and `kwargs` are not read in this body; weights come from routing_infos.
+        routing_infos = self.routing_infos
+        assert (
+            routing_infos.skill_mixing_coefs is not None
+        ), "No skill mixing coefs found"  # NOTE(review): assert is skipped under `python -O`
+        # Move the supplied coefficients to the device of the `dummy` buffer.
+        mixing_coefs = routing_infos.skill_mixing_coefs.to(self.dummy.device)
+        # 1-D input gets a leading size-1 dim (presumably the batch axis; TODO confirm).
+        if mixing_coefs.ndim == 1:
+            mixing_coefs = mixing_coefs.unsqueeze(0)
+        # Coefficients are returned as given (no normalization is applied here).
+        return ExpertsSplitsAndWeightsSelectorOutput(
+            SelectorOutput.ALL_EXPERTS, mixing_coefs
+        )
diff --git a/mttl/models/expert_context.py b/mttl/models/expert_context.py
@@ -78,3 +78,28 @@ def wrapper_func(model, **kwargs):
             return results
 
         return wrapper_func
+
+    @classmethod
+    def wrap_with_context(cls, f):
+        """
+         Decorator method that wraps a general function of a model class
+        (We may want to wrap other methods than just forward and generate).
+        Use `create_context` whenever possible
+        """
+        from mttl.models.modifiers.routing import RoutingInfo  # function-local import (presumably avoids a circular import; TODO confirm)
+        # The wrapper runs `f` inside `cls(model, routing_info)` used as a context manager.
+        @functools.wraps(f)
+        def wrapper_func(model, *args, **kwargs):
+            # Batch must be the first positional arg: args[0] is passed to RoutingInfo.from_batch.
+            return_context = kwargs.pop("return_context", False)  # consumed here, not forwarded to f
+            with cls(model, RoutingInfo.from_batch(args[0])) as context:
+                results = f(model, *args, **kwargs)
+                if return_context:  # opt-in: return (results, dict of routing_infos / routing_gates)
+                    context_returns = {
+                        "routing_infos": context.routing_infos,
+                        "routing_gates": context.routing_gates,
+                    }
+                    return results, context_returns
+            return results  # default path: only f's own result
+
+        return wrapper_func
diff --git a/mttl/models/modifiers/routing.py b/mttl/models/modifiers/routing.py
@@ -20,6 +20,7 @@ class RoutingInfo:
     packed_seq_lens: List[int] = None
     seq_lens: List[int] = None
     packed_attn_mask: torch.Tensor = None
+    skill_mixing_coefs: torch.Tensor = None
 
     @classmethod
     def pop_elements(cls, batch, keep=None):
@@ -59,6 +60,7 @@ def from_batch(cls, batch: dict, **kwargs):
             packed_seq_lens=batch.get("packed_seq_lens", None),
             seq_lens=batch.get("seq_lens", None),
             packed_attn_mask=batch.get("packed_attn_mask", None),
+            skill_mixing_coefs=batch.get("skill_mixing_coefs", None),
             **kwargs,
         )
         return ri