Skip to content

Commit

Permalink
Add the function for medium memory.
Browse files Browse the repository at this point in the history
Close opened file in low memory mode.
  • Loading branch information
YuanyueLi committed May 22, 2024
1 parent 7be7594 commit 0390ac0
Show file tree
Hide file tree
Showing 8 changed files with 187 additions and 11 deletions.
11 changes: 8 additions & 3 deletions docs/source/entropy_search_advanced_usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@ Run Flash entropy search with limited memory

This method is useful when you are dealing with a very large spectral library and your computer's memory is limited.

To achieve this, while constructing the ``FlashEntropySearch`` object, you need to set the ``path_data`` parameter to the path of the index file, and set the ``low_memory`` parameter to ``True``. Then read the pre-built index file by calling the ``read`` method. After that, the rest of the code is the same as usual.
To achieve this, while constructing the ``FlashEntropySearch`` object, you need to set the ``path_data`` parameter to the path of the index file, and set the ``low_memory`` parameter to ``1`` or ``2``. Then read the pre-built index file by calling the ``read`` method. After that, the rest of the code is the same as usual.

The ``low_memory`` parameter has three values:
- False or 0: Normal mode. All the index will be loaded into memory, this is the default mode and the fastest mode.
- True or 1: Low memory mode. Only load the necessary data into memory; this mode needs the least memory, but the search speed will be the slowest, as it reads all the data from the disk every time.
- 2: Medium memory mode using memmap. This mode is similar to mode 1, but it uses ``numpy.memmap`` to map the index file into memory, which makes it faster than mode 1 when enough memory is available for the page cache.

.. code-block:: python
Expand All @@ -17,14 +22,14 @@ To achieve this, while constructing the ``FlashEntropySearch`` object, you need
# Instead of using this:
# entropy_search = FlashEntropySearch()
# Use this:
entropy_search = FlashEntropySearch(path_data='path/to/library/index', low_memory=True)
entropy_search = FlashEntropySearch(path_data='path/to/library/index', low_memory=1)
entropy_search.read()
# Then the rest of the code is the same as usual.
# entropy_search.search(...)
# ...... (the rest of the code is the same as usual)
The index built in normal mode and low memory mode is identical. If you use our ``write`` and ``read`` methods to save and load the index, you can use the index in normal mode and low memory mode interchangeably. For example, you can build the index in normal mode, save it to disk with the ``write`` method. After that, you can initialize the ``FlashEntropySearch`` object with ``path_data`` parameter which points to the index file, and set ``low_memory`` parameter to ``True``, then call the ``read`` method to load the index, and proceed with the search as usual.
The index built in normal mode and low memory mode is identical. If you use our ``write`` and ``read`` methods to save and load the index, you can use the index in normal mode and low memory mode interchangeably. For example, you can build the index in normal mode, save it to disk with the ``write`` method. After that, you can initialize the ``FlashEntropySearch`` object with ``path_data`` parameter which points to the index file, and set ``low_memory`` parameter to ``1``, then call the ``read`` method to load the index, and proceed with the search as usual.


Run Flash entropy search with multiple cores
Expand Down
6 changes: 2 additions & 4 deletions ms_entropy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
calculate_unweighted_entropy_similarity,
apply_weight_to_intensity,
)
from .file_io import (
read_one_spectrum, standardize_spectrum
)
from .entropy_search import FlashEntropySearch, FlashEntropySearchCore, FlashEntropySearchCoreLowMemory
from .file_io import read_one_spectrum, standardize_spectrum
from .entropy_search import FlashEntropySearch, FlashEntropySearchCore, FlashEntropySearchCoreLowMemory, FlashEntropySearchCoreMediumMemory
from .version import __version__
3 changes: 2 additions & 1 deletion ms_entropy/entropy_search/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .flash_entropy_search import FlashEntropySearch
from .flash_entropy_search_core import FlashEntropySearchCore
from .flash_entropy_search_core_low_memory import FlashEntropySearchCoreLowMemory
from .flash_entropy_search_core_low_memory import FlashEntropySearchCoreLowMemory
from .flash_entropy_search_core_medium_memory import FlashEntropySearchCoreMediumMemory
7 changes: 6 additions & 1 deletion ms_entropy/entropy_search/flash_entropy_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,22 @@
from pathlib import Path
from .flash_entropy_search_core import FlashEntropySearchCore
from .flash_entropy_search_core_low_memory import FlashEntropySearchCoreLowMemory
from .flash_entropy_search_core_medium_memory import FlashEntropySearchCoreMediumMemory
from ..spectra import clean_spectrum


class FlashEntropySearch:
def __init__(self, max_ms2_tolerance_in_da=0.024, mz_index_step=0.0001, low_memory=False, path_data=None):
    """Create a Flash entropy search engine.

    :param max_ms2_tolerance_in_da: maximum MS2 tolerance (in Da) supported by the index.
    :param mz_index_step: m/z step used for the fast-access index.
    :param low_memory: 0/False = fully in-memory core (default, fastest);
        1/True = low-memory core; 2 = medium-memory (memmap-backed) core.
    :param path_data: directory holding (or receiving) the on-disk index.
    """
    self.precursor_mz_array = np.zeros(0, dtype=np.float32)
    self.low_memory = low_memory
    # Select the search-core implementation matching the requested memory mode.
    if low_memory == 1:
        core_class = FlashEntropySearchCoreLowMemory
    elif low_memory == 2:
        core_class = FlashEntropySearchCoreMediumMemory
    else:
        core_class = FlashEntropySearchCore
    self.entropy_search = core_class(path_data=path_data, max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ def __init__(self, path_data, max_ms2_tolerance_in_da=0.024, mz_index_step=0.000
self.path_data.mkdir(parents=True, exist_ok=True)
self.index_file = []

def __del__(self):
    # Close any index files that are still open in low-memory mode.
    # Use getattr with a default because __del__ can run even when __init__
    # failed before ``index_file`` was assigned; an AttributeError raised here
    # would surface as an "exception ignored in __del__" warning.
    for file in getattr(self, "index_file", []):
        file.close()

def _generate_index_from_peak_data(self, peak_data, max_indexed_mz, append):
total_peaks_num = peak_data.shape[0]

Expand Down
101 changes: 101 additions & 0 deletions ms_entropy/entropy_search/flash_entropy_search_core_medium_memory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/usr/bin/env python3
import json
import numpy as np
from pathlib import Path
from .flash_entropy_search_core import FlashEntropySearchCore


class FlashEntropySearchCoreMediumMemory(FlashEntropySearchCore):
    """Entropy-search core that keeps the index on disk and accesses it with
    ``numpy.memmap`` instead of loading every array into RAM.

    The index layout on disk is identical to the other cores; only the access
    strategy differs, so indexes written by one core can be read by another.
    """

    def __init__(self, path_data, max_ms2_tolerance_in_da=0.024, mz_index_step=0.0001) -> None:
        """
        :param path_data: directory used to store the on-disk index files
            (created if it does not exist).
        :param max_ms2_tolerance_in_da: maximum MS2 tolerance (in Da) supported by the index.
        :param mz_index_step: m/z step of the fast-access index arrays.
        """
        super().__init__(max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step)
        self.path_data = Path(path_data)
        self.path_data.mkdir(parents=True, exist_ok=True)

    def _generate_index_from_peak_data(self, peak_data, max_indexed_mz, append):
        """Build the on-disk index from the structured ``peak_data`` array,
        then memory-map it back in via :meth:`read`.

        :param peak_data: structured array with "ion_mz", "nl_mass",
            "intensity", "spec_idx" and "peak_idx" fields; sorted in place.
        :param max_indexed_mz: upper m/z bound for the fast-access index.
        :param append: accepted for signature compatibility with the base
            class; not used by this implementation.
        :return: the freshly memory-mapped index (``self.index``).
        """
        total_peaks_num = peak_data.shape[0]

        ############## Step 1: Build the index by sorting with product-ion m/z. ##############
        peak_data.sort(order="ion_mz")

        # Record the m/z, intensity, and spectrum index information for product ions.
        (peak_data["ion_mz"]).tofile(self.path_data / "all_ions_mz.npy")
        (peak_data["intensity"]).tofile(self.path_data / "all_ions_intensity.npy")
        (peak_data["spec_idx"]).tofile(self.path_data / "all_ions_spec_idx.npy")

        # Assign the index of the product ions.
        # NOTE(review): this reads self.total_peaks_num, which is assumed to
        # equal peak_data.shape[0] (set by the caller) — confirm upstream.
        peak_data["peak_idx"] = np.arange(0, self.total_peaks_num, dtype=np.uint64)

        ############## Step 2: Build the fast-access index for the ion's m/z. ##############
        all_ions_mz = np.memmap(self.path_data / "all_ions_mz.npy", dtype=np.float32, mode="r", shape=(total_peaks_num,))
        max_mz = min(np.max(all_ions_mz), max_indexed_mz)
        search_array = np.arange(0.0, max_mz, self.mz_index_step)
        all_ions_mz_idx_start = np.searchsorted(all_ions_mz, search_array, side="left").astype(np.int64)
        all_ions_mz_idx_start.tofile(self.path_data / "all_ions_mz_idx_start.npy")

        ############## Step 3: Build the index by sorting with neutral loss mass. ##############
        peak_data.sort(order="nl_mass")

        # Record the m/z, intensity, spectrum index, and product ions index information for neutral loss ions.
        (peak_data["nl_mass"]).tofile(self.path_data / "all_nl_mass.npy")
        (peak_data["intensity"]).tofile(self.path_data / "all_nl_intensity.npy")
        (peak_data["spec_idx"]).tofile(self.path_data / "all_nl_spec_idx.npy")
        (peak_data["peak_idx"]).tofile(self.path_data / "all_ions_idx_for_nl.npy")

        # Build the fast-access index for the neutral loss mass.
        all_nl_mass = np.memmap(self.path_data / "all_nl_mass.npy", dtype=np.float32, mode="r", shape=(total_peaks_num,))
        max_mz = min(np.max(all_nl_mass), max_indexed_mz)
        search_array = np.arange(0.0, max_mz, self.mz_index_step)
        all_nl_mass_idx_start = np.searchsorted(all_nl_mass, search_array, side="left").astype(np.int64)
        all_nl_mass_idx_start.tofile(self.path_data / "all_nl_mass_idx_start.npy")

        ############## Step 4: Save the metadata and memory-map the index. ##############
        self.write()
        self.read()
        return self.index

    def read(self, path_data=None):
        """
        Read (memory-map) the index from disk.

        :param path_data: optional override for the index directory.
        :return: True on success, False if the index could not be loaded.
        """
        if path_data is not None:
            self.path_data = Path(path_data)

        try:
            self.index = []
            for name in self.index_names:
                self.index.append(np.memmap(self.path_data / f"{name}.npy", dtype=self.index_dtypes[name], mode="r"))

            with open(self.path_data / "information.json", "r") as f:
                information = json.load(f)
            self.mz_index_step = information["mz_index_step"]
            self.total_spectra_num = information["total_spectra_num"]
            self.total_peaks_num = information["total_peaks_num"]
            self.max_ms2_tolerance_in_da = information["max_ms2_tolerance_in_da"]
            return True
        except (OSError, ValueError, KeyError):
            # Missing or corrupt index files / metadata (OSError covers
            # FileNotFoundError; ValueError covers bad memmaps and JSON
            # decode errors): report failure instead of raising.
            return False

    def write(self, path_data=None):
        """
        Write the index metadata to disk.

        The index arrays themselves are written by
        :meth:`_generate_index_from_peak_data`; this method only saves
        ``information.json``.

        :param path_data: if given, must match ``self.path_data``.
        :raises ValueError: if ``path_data`` differs from ``self.path_data``.
        """
        if path_data is not None and Path(path_data) != self.path_data:
            # Raise a real exception instead of using ``assert`` so the check
            # is not stripped under ``python -O``.
            raise ValueError("The path_data is not the same as the path_data in the class.")

        information = {
            "mz_index_step": float(self.mz_index_step),
            "total_spectra_num": int(self.total_spectra_num),
            "total_peaks_num": int(self.total_peaks_num),
            "max_ms2_tolerance_in_da": float(self.max_ms2_tolerance_in_da),
        }
        # Use a context manager so the file handle is closed promptly.
        with open(self.path_data / "information.json", "w") as f:
            json.dump(information, f)
2 changes: 1 addition & 1 deletion ms_entropy/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.1.3'
__version__ = '1.2.0'
64 changes: 63 additions & 1 deletion tests/test_entropy_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def test_open_search(self):
np.testing.assert_almost_equal(similarity, [1.0, 0.22299, 0.22299, 0.44598], decimal=5)
similarity, matched_peaks = self.flash_entropy.open_search(
peaks=self.query_spectrum["peaks"], ms2_tolerance_in_da=0.02, output_matched_peak_number=True
)
)
np.testing.assert_almost_equal(similarity, [1.0, 0.22299, 0.22299, 0.44598], decimal=5)
np.testing.assert_almost_equal(matched_peaks, [4, 1, 1, 2], decimal=5)

Expand All @@ -77,6 +77,68 @@ def test_identity_search(self):
np.testing.assert_almost_equal(matched_peaks, [4, 0, 0, 0], decimal=5)


class TestFlashEntropySearchWithCpuLowMemory(TestFlashEntropySearchWithCpu):
    """Re-run the CPU test suite against an index reloaded in low-memory mode
    (``low_memory=1``)."""

    def setUp(self):
        """Build a normal-mode index, write it to a temporary directory, then
        reload it with a low-memory search engine."""
        import shutil  # local import: only needed for temp-dir cleanup

        spectral_library = [
            {"id": "Demo spectrum 1", "precursor_mz": 150.0, "peaks": np.array([[100.0, 1.0], [101.0, 1.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32)},
            {
                "id": "Demo spectrum 2",
                "precursor_mz": 220.0,
                "peaks": np.array([[200.0, 1.0], [101.0, 1.0], [202.0, 1.0], [204.0, 1.0], [205.0, 1.0]], dtype=np.float32),
            },
            {
                "id": "Demo spectrum 3",
                "precursor_mz": 250.0,
                "peaks": np.array([[100.0, 1.0], [201.0, 1.0], [202.0, 1.0], [104.0, 1.0], [105.0, 1.0]], dtype=np.float32),
            },
            # Intentionally a plain list (not ndarray) to cover that input form.
            {"id": "Demo spectrum 4", "precursor_mz": 350.0, "peaks": [[100.0, 1.0], [101.0, 1.0], [302.0, 1.0], [104.0, 1.0], [105.0, 1.0]]},
        ]
        query_spectrum = {"precursor_mz": 150.0, "peaks": np.array([[100.0, 1.0], [101.0, 1.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32)}

        self.flash_entropy = FlashEntropySearch()
        self.flash_entropy.build_index(spectral_library)
        path_test = tempfile.mkdtemp()
        # Remove the temporary index directory after the test instead of
        # leaking it; ignore_errors covers files still memory-mapped/open.
        self.addCleanup(shutil.rmtree, path_test, ignore_errors=True)
        self.flash_entropy.write(path_test)
        self.flash_entropy = FlashEntropySearch(low_memory=1)
        self.flash_entropy.read(path_test)
        query_spectrum["peaks"] = self.flash_entropy.clean_spectrum_for_search(precursor_mz=query_spectrum["precursor_mz"], peaks=query_spectrum["peaks"])
        self.query_spectrum = query_spectrum

    def test_read_and_write(self):
        # The base-class read/write round-trip test does not apply here: the
        # index is already written and re-read from disk in setUp.
        pass


class TestFlashEntropySearchWithCpuMediumMemory(TestFlashEntropySearchWithCpu):
    """Re-run the CPU test suite against an index reloaded in medium-memory
    (memmap) mode (``low_memory=2``)."""

    def setUp(self):
        """Build a normal-mode index, write it to a temporary directory, then
        reload it with a medium-memory (memmap-backed) search engine."""
        import shutil  # local import: only needed for temp-dir cleanup

        spectral_library = [
            {"id": "Demo spectrum 1", "precursor_mz": 150.0, "peaks": np.array([[100.0, 1.0], [101.0, 1.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32)},
            {
                "id": "Demo spectrum 2",
                "precursor_mz": 220.0,
                "peaks": np.array([[200.0, 1.0], [101.0, 1.0], [202.0, 1.0], [204.0, 1.0], [205.0, 1.0]], dtype=np.float32),
            },
            {
                "id": "Demo spectrum 3",
                "precursor_mz": 250.0,
                "peaks": np.array([[100.0, 1.0], [201.0, 1.0], [202.0, 1.0], [104.0, 1.0], [105.0, 1.0]], dtype=np.float32),
            },
            # Intentionally a plain list (not ndarray) to cover that input form.
            {"id": "Demo spectrum 4", "precursor_mz": 350.0, "peaks": [[100.0, 1.0], [101.0, 1.0], [302.0, 1.0], [104.0, 1.0], [105.0, 1.0]]},
        ]
        query_spectrum = {"precursor_mz": 150.0, "peaks": np.array([[100.0, 1.0], [101.0, 1.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32)}

        self.flash_entropy = FlashEntropySearch()
        self.flash_entropy.build_index(spectral_library)
        path_test = tempfile.mkdtemp()
        # Remove the temporary index directory after the test instead of
        # leaking it; ignore_errors covers files still memory-mapped/open.
        self.addCleanup(shutil.rmtree, path_test, ignore_errors=True)
        self.flash_entropy.write(path_test)
        self.flash_entropy = FlashEntropySearch(low_memory=2)
        self.flash_entropy.read(path_test)
        query_spectrum["peaks"] = self.flash_entropy.clean_spectrum_for_search(precursor_mz=query_spectrum["precursor_mz"], peaks=query_spectrum["peaks"])
        self.query_spectrum = query_spectrum

    def test_read_and_write(self):
        # The base-class read/write round-trip test does not apply here: the
        # index is already written and re-read from disk in setUp.
        pass


# class TestFlashEntropySearchWithGpu(TestFlashEntropySearchWithCpu):
# def test_hybrid_search(self):
# similarity = self.flash_entropy.hybrid_search(precursor_mz=self.query_spectrum['precursor_mz'],
Expand Down

0 comments on commit 0390ac0

Please sign in to comment.