Commit dbecc06
Add the function to calculate unweighted entropy similarity.
YuanyueLi committed Jun 25, 2024
1 parent 0ec97f4 commit dbecc06
Showing 6 changed files with 95 additions and 14 deletions.
31 changes: 25 additions & 6 deletions ms_entropy/entropy_search/flash_entropy_search.py
@@ -9,19 +9,38 @@


 class FlashEntropySearch:
-    def __init__(self, max_ms2_tolerance_in_da=0.024, mz_index_step=0.0001, low_memory=False, path_data=None):
+    def __init__(
+        self,
+        max_ms2_tolerance_in_da=0.024,
+        mz_index_step=0.0001,
+        low_memory=False,
+        path_data=None,
+        intensity_weight="entropy",
+    ):
         """
         Initialize the EntropySearch class.
         :param max_ms2_tolerance_in_da: The maximum MS2 tolerance in Da.
         :param mz_index_step: The step size for the m/z index.
         :param low_memory: The memory usage mode, can be 0, 1, or 2. 0 means normal mode, 1 means low memory mode, and 2 means medium memory mode.
         :param path_data: The path to save the index data.
+        :param intensity_weight: The weight for the intensity in the entropy calculation, can be "entropy" or None. Default is "entropy".
+            - None: The intensities are not weighted, and the unweighted entropy similarity is calculated.
+            - "entropy": The intensities are weighted by the spectral entropy, and the entropy similarity is calculated.
         """
         self.precursor_mz_array = np.zeros(0, dtype=np.float32)
         self.low_memory = low_memory
-        if low_memory==1:
+        if low_memory == 1:
             self.entropy_search = FlashEntropySearchCoreLowMemory(
-                path_data=path_data, max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step
+                path_data=path_data, max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step, intensity_weight=intensity_weight
             )
-        elif low_memory==2:
+        elif low_memory == 2:
             self.entropy_search = FlashEntropySearchCoreMediumMemory(
-                path_data=path_data, max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step
+                path_data=path_data, max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step, intensity_weight=intensity_weight
             )
         else:
-            self.entropy_search = FlashEntropySearchCore(path_data=path_data, max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step)
+            self.entropy_search = FlashEntropySearchCore(
+                path_data=path_data, max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step, intensity_weight=intensity_weight
+            )

     def identity_search(self, precursor_mz, peaks, ms1_tolerance_in_da, ms2_tolerance_in_da, target="cpu", output_matched_peak_number=False, **kwargs):
         """
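With this change, passing intensity_weight=None when constructing FlashEntropySearch switches the whole index to unweighted scoring. A minimal usage sketch, assuming FlashEntropySearch is importable from the ms_entropy package top level (spectra and tolerances are borrowed from the new tests added below):

import numpy as np
from ms_entropy import FlashEntropySearch

# Build the index with unweighted scoring instead of the default entropy weighting.
flash_entropy = FlashEntropySearch(intensity_weight=None)
flash_entropy.build_index([
    {"id": "Demo spectrum 1", "precursor_mz": 150.0,
     "peaks": np.array([[100.0, 1.0], [101.0, 10.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32)},
    {"id": "Demo spectrum 2", "precursor_mz": 220.0,
     "peaks": np.array([[200.0, 1.0], [101.0, 1.0], [202.0, 1.0], [204.0, 1.0], [205.0, 1.0]], dtype=np.float32)},
])

# Queries are cleaned exactly as in the weighted case; only the scoring changes.
query_peaks = flash_entropy.clean_spectrum_for_search(
    precursor_mz=150.0,
    peaks=np.array([[100.0, 1.0], [101.0, 10.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32),
)
similarity = flash_entropy.open_search(peaks=query_peaks, ms2_tolerance_in_da=0.02)
# similarity holds one unweighted entropy similarity per library spectrum.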
20 changes: 17 additions & 3 deletions ms_entropy/entropy_search/flash_entropy_search_core.py
@@ -9,17 +9,26 @@


 class FlashEntropySearchCore:
-    def __init__(self, path_data=None, max_ms2_tolerance_in_da=0.024, mz_index_step=0.0001) -> None:
+    def __init__(
+        self,
+        path_data=None,
+        max_ms2_tolerance_in_da=0.024,
+        mz_index_step=0.0001,
+        intensity_weight="entropy",  # "entropy" or None
+    ) -> None:
         """
         Initialize the EntropySearch class.
         :param path_array: The path array of the index files.
         :param max_ms2_tolerance_in_da: The maximum MS2 tolerance used when searching the MS/MS spectra, in Dalton. Default is 0.024.
         :param mz_index_step: The step size of the m/z index, in Dalton. Default is 0.0001.
             The smaller the step size, the faster the search, but the larger the index size and longer the index building time.
+        :param intensity_weight: The weight of the intensity, can be "entropy" or None. If set to "entropy", the intensity will be weighted by the entropy.
+            If set to None, the intensity will not be weighted, which is equivalent to the unweighted entropy similarity.
         """
         self.mz_index_step = mz_index_step
         self._init_for_multiprocessing = False
         self.max_ms2_tolerance_in_da = max_ms2_tolerance_in_da
+        self.intensity_weight = intensity_weight

         self.total_spectra_num = 0
         self.total_peaks_num = 0
@@ -511,8 +520,13 @@ def _preprocess_peaks(self, peaks):
"""
Preprocess the peaks.
"""
peaks_clean = np.asarray(apply_weight_to_intensity(peaks))
peaks_clean[:, 1] /= 2
if self.intensity_weight == "entropy":
peaks_clean = np.asarray(apply_weight_to_intensity(peaks))
peaks_clean[:, 1] /= 2
elif self.intensity_weight is None:
peaks_clean = peaks.copy()
peaks_clean[:, 1] /= 2

return peaks_clean

def _score_peaks_with_cpu(self, intensity_query, intensity_library):
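With the default "entropy" weight, _preprocess_peaks still reweights the intensities with apply_weight_to_intensity before halving them; with None it skips the reweighting and only applies the same halving, so the downstream peak scoring yields the unweighted entropy similarity. For reference, a back-of-the-envelope sketch of that unweighted score for two spectra already aligned on a common m/z grid (illustrative only, not the library's internal scoring path):

import numpy as np

def unweighted_entropy_similarity(intensity_a, intensity_b):
    # Intensities of both spectra on the same m/z grid (zeros where a spectrum has no peak).
    a = np.asarray(intensity_a, dtype=np.float64)
    b = np.asarray(intensity_b, dtype=np.float64)
    a, b = a / a.sum(), b / b.sum()  # normalize each spectrum to a sum of 1
    ab = (a + b) / 2                 # merged spectrum, each side contributing half

    def shannon_entropy(x):
        x = x[x > 0]
        return -np.sum(x * np.log(x))

    # 1 - (2*S_AB - S_A - S_B) / ln(4): 1 for identical spectra, 0 for fully disjoint ones.
    return 1 - (2 * shannon_entropy(ab) - shannon_entropy(a) - shannon_entropy(b)) / np.log(4)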
@@ -6,8 +6,8 @@


 class FlashEntropySearchCoreLowMemory(FlashEntropySearchCore):
-    def __init__(self, path_data, max_ms2_tolerance_in_da=0.024, mz_index_step=0.0001) -> None:
-        super().__init__(max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step)
+    def __init__(self, path_data, max_ms2_tolerance_in_da=0.024, mz_index_step=0.0001, intensity_weight="entropy") -> None:
+        super().__init__(max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step, intensity_weight=intensity_weight)
         self.path_data = Path(str(path_data))
         self.path_data.mkdir(parents=True, exist_ok=True)
         self.index_file = []
@@ -6,8 +6,8 @@


 class FlashEntropySearchCoreMediumMemory(FlashEntropySearchCore):
-    def __init__(self, path_data, max_ms2_tolerance_in_da=0.024, mz_index_step=0.0001) -> None:
-        super().__init__(max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step)
+    def __init__(self, path_data, max_ms2_tolerance_in_da=0.024, mz_index_step=0.0001, intensity_weight="entropy") -> None:
+        super().__init__(max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step, intensity_weight=intensity_weight)
         self.path_data = Path(str(path_data))
         self.path_data.mkdir(parents=True, exist_ok=True)

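The low-memory and medium-memory cores simply forward the new keyword to FlashEntropySearchCore, so unweighted search also works with an on-disk index; a minimal sketch (the path_data value here is an arbitrary, hypothetical directory):

from ms_entropy import FlashEntropySearch

flash_entropy = FlashEntropySearch(low_memory=1, path_data="./entropy_index", intensity_weight=None)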
2 changes: 1 addition & 1 deletion ms_entropy/version.py
@@ -1 +1 @@
-__version__ = '1.2.2'
+__version__ = '1.3.0'
48 changes: 48 additions & 0 deletions tests/test_entropy_search.py
@@ -77,6 +77,54 @@ def test_identity_search(self):
         np.testing.assert_almost_equal(matched_peaks, [4, 0, 0, 0], decimal=5)


+class TestUnweightedFlashEntropySearchWithCpu(unittest.TestCase):
+    def setUp(self):
+        spectral_library = [
+            {"id": "Demo spectrum 1", "precursor_mz": 150.0, "peaks": np.array([[100.0, 1.0], [101.0, 10.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32)},
+            {
+                "id": "Demo spectrum 2",
+                "precursor_mz": 220.0,
+                "peaks": np.array([[200.0, 1.0], [101.0, 1.0], [202.0, 1.0], [204.0, 1.0], [205.0, 1.0]], dtype=np.float32),
+            },
+            {
+                "id": "Demo spectrum 3",
+                "precursor_mz": 250.0,
+                "peaks": np.array([[100.0, 1.0], [201.0, 1.0], [202.0, 1.0], [104.0, 1.0], [105.0, 1.0]], dtype=np.float32),
+            },
+            {"id": "Demo spectrum 4", "precursor_mz": 350.0, "peaks": [[100.0, 1.0], [101.0, 1.0], [302.0, 1.0], [104.0, 1.0], [105.0, 1.0]]},
+        ]
+        query_spectrum = {"precursor_mz": 150.0, "peaks": np.array([[100.0, 1.0], [101.0, 10.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32)}
+
+        self.flash_entropy = FlashEntropySearch(intensity_weight=None)
+        self.flash_entropy.build_index(spectral_library)
+        query_spectrum["peaks"] = self.flash_entropy.clean_spectrum_for_search(precursor_mz=query_spectrum["precursor_mz"], peaks=query_spectrum["peaks"])
+        self.query_spectrum = query_spectrum
+
+    def test_open_search(self):
+        similarity = self.flash_entropy.open_search(peaks=self.query_spectrum["peaks"], ms2_tolerance_in_da=0.02)
+        np.testing.assert_almost_equal(similarity, [1.0, 0.356, 0.118, 0.474], decimal=3)
+        similarity, matched_peaks = self.flash_entropy.open_search(
+            peaks=self.query_spectrum["peaks"], ms2_tolerance_in_da=0.02, output_matched_peak_number=True
+        )
+        np.testing.assert_almost_equal(similarity, [1.0, 0.356, 0.118, 0.474], decimal=3)
+        np.testing.assert_almost_equal(matched_peaks, [4, 1, 1, 2], decimal=5)
+
+    def test_identity_search(self):
+        similarity = self.flash_entropy.identity_search(
+            precursor_mz=self.query_spectrum["precursor_mz"], peaks=self.query_spectrum["peaks"], ms1_tolerance_in_da=0.01, ms2_tolerance_in_da=0.02
+        )
+        np.testing.assert_almost_equal(similarity, [1.0, 0.0, 0.0, 0.0], decimal=5)
+        similarity, matched_peaks = self.flash_entropy.identity_search(
+            precursor_mz=self.query_spectrum["precursor_mz"],
+            peaks=self.query_spectrum["peaks"],
+            ms1_tolerance_in_da=0.01,
+            ms2_tolerance_in_da=0.02,
+            output_matched_peak_number=True,
+        )
+        np.testing.assert_almost_equal(similarity, [1.0, 0.0, 0.0, 0.0], decimal=5)
+        np.testing.assert_almost_equal(matched_peaks, [4, 0, 0, 0], decimal=5)
+
+
 class TestFlashEntropySearchWithCpuLowMemory(TestFlashEntropySearchWithCpu):
     def setUp(self):
         spectral_library = [
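As a sanity check, the expected open-search score of 0.356 for "Demo spectrum 2" can be reproduced with the unweighted formula sketched above: the cleaned query has intensities 10/13 at m/z 101 and 1/13 at each of m/z 100, 102 and 103 (entropy about 0.794), the library spectrum has five equal peaks (entropy ln 5, about 1.609), their half-and-half merge has entropy about 1.648, and 1 - (2*1.648 - 0.794 - 1.609)/ln 4 gives about 0.356.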
