Merge pull request #9 from parklab/dev

Version 0.3.2
parklab · Jan 3, 2024 · 71104f4 · 71104f4
2 parents 88cd6e8 + d4bd71f
commit 71104f4
Show file tree

Hide file tree

Showing 12 changed files with 218 additions and 156 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ---
 ---
 
+## 0.3.2 - 2024-01
+### Fixed
+  - Support fixing the model variance of (multimodal) CorrNMF models during training
+
 ## 0.3.1 - 2023-12
 ### Fixed
   - Improve signature, history and embedding plots
@@ -16,7 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
   - Support a sample-weighted KL-divergence loss in KL-NMF
   - Support a sample-weighted sparsity regularization in KL-NMF
-  - Support fixing signature and sample biases in (multimodal) CorrNMF during inference
+  - Support fixing signature and sample biases of (multimodal) CorrNMF models during training
 
 ## 0.2.1 - 2023-10
 ### Fixed
@@ -25,7 +29,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## 0.2.0 - 2023-10
 ### Added
-  - Support fixing arbitrary many a priori known signatures during inference
+  - Support fixing arbitrarily many a priori known signatures during model training
   - Improved performance with just-in-time compiled update rules
 
 ## 0.1.0 - 2023-10

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "salamander-learn"
-version = "0.3.1"
+version = "0.3.2"
 description = "Salamander is a non-negative matrix factorization framework for signature analysis"
 license = "MIT"
 authors = ["Benedikt Geiger"]

diff --git a/src/salamander/__init__.py b/src/salamander/__init__.py
@@ -8,7 +8,7 @@
 from .nmf_framework.mvnmf import MvNMF
 from .plot import set_salamander_style
 
-__version__ = "0.3.1"
+__version__ = "0.3.2"
 __all__ = ["CorrNMFDet", "KLNMF", "MvNMF", "MultimodalCorrNMF"]
 
 

diff --git a/src/salamander/nmf_framework/corrnmf.py b/src/salamander/nmf_framework/corrnmf.py
@@ -16,11 +16,10 @@
 class CorrNMF(SignatureNMF):
     r"""
     The abstract class CorrNMF unifies the structure of deterministic and
-    stochastic correlated NMF (CorrNMF) with and without given signatures.
-    Both variants of CorrNMF have an identical generative model and objective function.
-    The model parameters are the sample biases \alpha, variance \sigma^2,
-    signature matrix W and the auxiliary parameters p.
-    The latent variables are the signature embeddings L and the sample embeddings U.
+    stochastic algorithms to fit the parameters of correlated NMF (CorrNMF).
+
+    The model parameters are the signature and sample biases, the variance, and the
+    signature matrix. The latent variables are the signature and sample embeddings.
 
     Overview:
 
@@ -48,12 +47,10 @@ class CorrNMF(SignatureNMF):
             update a single sample embedding u
 
         - fit:
-            Run CorrNMF for a given mutation count data. Every
-            fit method should also implement a version that allows fixing
-            arbitrary many a priori known signatures.
+            Run CorrNMF for given mutation count data.
 
 
-    The following attributes are implemented in the abstract class CNMF:
+    The following attributes are implemented in CorrNMF:
 
         - signatures: pd.DataFrame
             The signature matrix including mutation type names and signature names
@@ -73,10 +70,7 @@ class CorrNMF(SignatureNMF):
             The number of parameters fitted in CorrNMF
 
         - objective: str
-            "minimize" or "maximize". Whether the NMF algorithm maximizes
-            or minimizes the objective function. Some algorithms maximize a likelihood,
-            others minimize a distance. The distinction is useful for filtering NMF
-            runs based on the fitted objective function value.
+            "minimize" or "maximize". CorrNMF maximizes the objective function.
 
         - corr_signatures: pd.DataFrame
             The signature correlation matrix induced by the signature embeddings
@@ -85,7 +79,7 @@ class CorrNMF(SignatureNMF):
             The sample correlation matrix induced by the sample embeddings
 
 
-    The following methods are implemented in the abstract class CorrNMF:
+    The following methods are implemented in CorrNMF:
 
         - objective_function:
             The evidence lower bound (ELBO) of the log-likelihood.
@@ -103,12 +97,12 @@ class CorrNMF(SignatureNMF):
             Initialize all model parameters and latent variables depending on the
             initialization method chosen
 
-        - _get_embedding_annotations:
-            A helper function to concatenate signature and sample names
+        - _get_embedding_data:
+            A helper function for the embedding plot that returns the signature
+            and sample embeddings
 
-        - plot_embeddings:
-            Plot signature or sample embeddings in 2D using PCA, tSNE or UMAP.
-            The respective plotting functions are implemented in the plot.py module
+        - _get_default_embedding_annotations:
+            A helper function for the embedding plot that returns the signature names
 
     More specific docstrings are written for the respective attributes and methods.
     """
@@ -202,26 +196,6 @@ def exposures(self) -> pd.DataFrame:
         )
         return exposures
 
-    @property
-    def _n_parameters(self):
-        """
-        There are n_features * n_signatures parameters corresponding to
-        the signature matrix, each embedding corresponds to dim_embeddings parameters,
-        and each signature & sample has a bias.
-        Finally, the model variance is a single positive real number.
-
-        Note: We do not include the number of auxiliary parameters p.
-        """
-        n_parameters_signatures = self.n_features * self.n_signatures
-        n_parameters_embeddings = self.dim_embeddings * (
-            self.n_signatures + self.n_samples
-        )
-        n_parameters_biases = self.n_samples + self.n_signatures
-        n_parameters_exposures = n_parameters_embeddings + n_parameters_biases
-        n_parameters = n_parameters_signatures + n_parameters_exposures + 1
-
-        return n_parameters
-
     @property
     def reconstruction_error(self):
         return kl_divergence(self.X, self.W, self.exposures.values)
@@ -290,6 +264,24 @@ def _surrogate_objective_function(self, penalize_sample_embeddings=True) -> floa
     def loglikelihood(self):
         return self.objective_function()
 
+    @property
+    def _n_parameters(self):
+        """
+        There are n_features * n_signatures parameters corresponding to
+        the signature matrix, each embedding corresponds to dim_embeddings parameters,
+        and each signature & sample has a real-valued bias.
+        Finally, the model variance is a single positive real number.
+        """
+        n_parameters_signatures = self.n_features * self.n_signatures
+        n_parameters_embeddings = self.dim_embeddings * (
+            self.n_signatures + self.n_samples
+        )
+        n_parameters_biases = self.n_samples + self.n_signatures
+        n_parameters_exposures = n_parameters_embeddings + n_parameters_biases
+        n_parameters = n_parameters_signatures + n_parameters_exposures + 1
+
+        return n_parameters
+
     @abstractmethod
     def _update_alpha(self):
         pass
@@ -363,6 +355,7 @@ def _check_given_parameters(
         given_signature_embeddings,
         given_sample_biases,
         given_sample_embeddings,
+        given_variance,
     ):
         if given_signatures is not None:
             self._check_given_signatures(given_signatures)
@@ -389,18 +382,24 @@ def _check_given_parameters(
                 given_sample_embeddings, self.n_samples, "given_sample_embeddings"
             )
 
+        if given_variance is not None:
+            type_checker("given_variance", given_variance, [float, int])
+            if given_variance <= 0.0:
+                raise ValueError("The variance has to be a positive real number.")
+
     def _initialize(
         self,
         given_signatures=None,
         given_signature_biases=None,
         given_signature_embeddings=None,
         given_sample_biases=None,
         given_sample_embeddings=None,
+        given_variance=None,
         init_kwargs=None,
     ):
         """
         Initialize the signature matrix W, sample biases alpha, signature biases beta,
-        the squared variance, and the signature and sample embeddings.
+        the variance, and the signature and sample embeddings.
         The signatures or signature embeddings can also be provided by the user.
 
         Parameters
@@ -425,6 +424,9 @@ def _initialize(
         given_sample_embeddings : np.ndarray, default=None
             A priori known sample embeddings of shape (dim_embeddings, n_samples).
 
+        given_variance : float, default=None
+            A priori known model variance of the embeddings.
+
         init_kwargs : dict
             Any further keyword arguments to pass to the initialization method.
             This includes, for example, a possible 'seed' keyword argument
@@ -436,6 +438,7 @@ def _initialize(
             given_signature_embeddings,
             given_sample_biases,
             given_sample_embeddings,
+            given_variance,
         )
 
         if given_signatures is not None:
@@ -447,7 +450,11 @@ def _initialize(
         self.W, _, self.signature_names = initialize(
             self.X, self.n_signatures, self.init_method, given_signatures, **init_kwargs
         )
-        self.sigma_sq = 1.0
+
+        if given_variance is None:
+            self.sigma_sq = 1.0
+        else:
+            self.sigma_sq = float(given_variance)
 
         if given_signature_biases is None:
             self.beta = np.zeros(self.n_signatures)
@@ -457,7 +464,7 @@ def _initialize(
         if given_signature_embeddings is None:
             self.L = np.random.multivariate_normal(
                 np.zeros(self.dim_embeddings),
-                np.identity(self.dim_embeddings),
+                self.sigma_sq * np.identity(self.dim_embeddings),
                 size=self.n_signatures,
             ).T
         else:
@@ -471,7 +478,7 @@ def _initialize(
         if given_sample_embeddings is None:
             self.U = np.random.multivariate_normal(
                 np.zeros(self.dim_embeddings),
-                np.identity(self.dim_embeddings),
+                self.sigma_sq * np.identity(self.dim_embeddings),
                 size=self.n_samples,
             ).T
         else:

diff --git a/src/salamander/nmf_framework/corrnmf_det.py b/src/salamander/nmf_framework/corrnmf_det.py
@@ -20,17 +20,16 @@ class CorrNMFDet(CorrNMF):
     "Bayesian Nonnegative Matrix Factorization with Stochastic Variational
     Inference" by Paisley et al.
 
-    The following methods are implemented to match the structure
-    of the abstract class CorrNMF:
+    The following methods are implemented to match the structure of CorrNMF:
 
         - _update_alpha:
-            update the sample exposure biases \alpha
+            update the sample biases \alpha
 
         - _update_beta:
             update the signature biases \beta
 
         - _update_sigma_sq:
-            update the variance \sigma^2 assumed in the generative model
+            update the variance \sigma^2
 
         - _update_W:
             update the signature matrix W
@@ -47,8 +46,7 @@ class CorrNMFDet(CorrNMF):
     The following method is implemented to match the structure of SignatureNMF:
 
         - fit:
-            Perform CorrNMF for the given mutation count data or
-            for given signatures and mutation count data
+            Inference of the CorrNMF parameters for given mutation count data
     """
 
     def _update_alpha(self, given_sample_biases=None):
@@ -61,10 +59,11 @@ def _update_beta(self, p, given_signature_biases=None):
                 self.X, p, self.alpha, self.L, self.U
             )
 
-    def _update_sigma_sq(self):
-        embeddings = np.concatenate([self.L, self.U], axis=1)
-        self.sigma_sq = np.mean(embeddings**2)
-        self.sigma_sq = np.clip(self.sigma_sq, EPSILON, None)
+    def _update_sigma_sq(self, given_variance=None):
+        if given_variance is None:
+            embeddings = np.concatenate([self.L, self.U], axis=1)
+            self.sigma_sq = np.mean(embeddings**2)
+            self.sigma_sq = np.clip(self.sigma_sq, EPSILON, None)
 
     def _update_W(self):
         self.W = update_W(
@@ -193,6 +192,7 @@ def fit(
         given_signature_embeddings=None,
         given_sample_biases=None,
         given_sample_embeddings=None,
+        given_variance=None,
         init_kwargs=None,
         history=False,
         verbose=0,
@@ -219,9 +219,12 @@ def fit(
             Known sample biases of shape (n_samples,) that will be fixed
             during model fitting.
 
-        given_sample_embeddings: np.ndarray, default=None
+        given_sample_embeddings : np.ndarray, default=None
             Known sample embeddings that will be fixed during model fitting.
 
+        given_variance : float, default=None
+            Known model variance that will be fixed during model fitting.
+
         init_kwargs: dict
             Any further keyword arguments to be passed to the initialization method.
             This includes, for example, a possible 'seed' keyword argument
@@ -246,6 +249,7 @@ def fit(
             given_signature_embeddings=given_signature_embeddings,
             given_sample_biases=given_sample_biases,
             given_sample_embeddings=given_sample_embeddings,
+            given_variance=given_variance,
             init_kwargs=init_kwargs,
         )
         of_values = [self.objective_function()]
@@ -262,7 +266,7 @@ def fit(
             p = self._update_p()
             self._update_beta(p, given_signature_biases)
             self._update_LU(p, given_signature_embeddings, given_sample_embeddings)
-            self._update_sigma_sq()
+            self._update_sigma_sq(given_variance)
 
             if self.n_given_signatures < self.n_signatures:
                 self._update_W()

diff --git a/src/salamander/nmf_framework/klnmf.py b/src/salamander/nmf_framework/klnmf.py
@@ -11,9 +11,11 @@
 class KLNMF(NMF):
     """
     Decompose a mutation count matrix X into the product of a signature
-    matrix W and an exposure matrix H by minimizing the generalized
-    Kullback-Leibler (KL) divergence under the constraint of having
-    normalized signatures.
+    matrix W and an exposure matrix H by minimizing the weighted
+    generalized Kullback-Leibler (KL) divergence under the constraint of
+    having normalized signatures.
+    The implementation supports a sparsity-inducing l_half penalty on the
+    exposures.
 
     Parameters
     ----------