diff --git a/docs/source/entropy_search_advanced_usage.rst b/docs/source/entropy_search_advanced_usage.rst index 3796141..6dc8d6f 100644 --- a/docs/source/entropy_search_advanced_usage.rst +++ b/docs/source/entropy_search_advanced_usage.rst @@ -8,7 +8,12 @@ Run Flash entropy search with limited memory This method is useful when you are dealing with a very large spectral library and your computer's memory is limited. -To achieve this, while constructing the ``FlashEntropySearch`` object, you need to set the ``path_data`` parameter to the path of the index file, and set the ``low_memory`` parameter to ``True``. Then read the pre-built index file by calling the ``read`` method. After that, the rest of the code is the same as usual. +To achieve this, while constructing the ``FlashEntropySearch`` object, you need to set the ``path_data`` parameter to the path of the index file, and set the ``low_memory`` parameter to ``1`` or ``2``. Then read the pre-built index file by calling the ``read`` method. After that, the rest of the code is the same as usual. + +The ``low_memory`` parameter has three values: + - False or 0: Normal mode. All the index will be loaded into memory, this is the default mode and the fastest mode. + - True or 1: Low memory mode. Only load the necessary data into memory, this mode needs the lowest memory, but the search speed will be the slowest, as it will read all the data from the disk every time. + - 2: Low memory mode using memmap. This mode is similar to mode 1, but it will use the ``numpy.memmap`` to map the index file to memory, which will be faster than mode 1 if the memory is not too small. .. 
code-block:: python @@ -17,14 +22,14 @@ To achieve this, while constructing the ``FlashEntropySearch`` object, you need # Instead of using this: # entropy_search = FlashEntropySearch() # Use this: - entropy_search = FlashEntropySearch(path_data='path/to/library/index', low_memory=True) + entropy_search = FlashEntropySearch(path_data='path/to/library/index', low_memory=1) entropy_search.read() # Then the reset of the code is the same as usual. # entropy_search.search(...) # ...... (the reset of the code is the same as usual) -The index built in normal mode and low memory mode is identical. If you use our ``write`` and ``read`` methods to save and load the index, you can use the index in normal mode and low memory mode interchangeably. For example, you can build the index in normal mode, save it to disk with the ``write`` method. After that, you can initialize the ``FlashEntropySearch`` object with ``path_data`` parameter which points to the index file, and set ``low_memory`` parameter to ``True``, then call the ``read`` method to load the index, and proceed with the search as usual. +The index built in normal mode and low memory mode is identical. If you use our ``write`` and ``read`` methods to save and load the index, you can use the index in normal mode and low memory mode interchangeably. For example, you can build the index in normal mode, save it to disk with the ``write`` method. After that, you can initialize the ``FlashEntropySearch`` object with ``path_data`` parameter which points to the index file, and set ``low_memory`` parameter to ``1``, then call the ``read`` method to load the index, and proceed with the search as usual. 
Run Flash entropy search with multiple cores diff --git a/ms_entropy/__init__.py b/ms_entropy/__init__.py index 3726c30..03e55ed 100644 --- a/ms_entropy/__init__.py +++ b/ms_entropy/__init__.py @@ -5,8 +5,6 @@ calculate_unweighted_entropy_similarity, apply_weight_to_intensity, ) -from .file_io import ( - read_one_spectrum, standardize_spectrum -) -from .entropy_search import FlashEntropySearch, FlashEntropySearchCore, FlashEntropySearchCoreLowMemory +from .file_io import read_one_spectrum, standardize_spectrum +from .entropy_search import FlashEntropySearch, FlashEntropySearchCore, FlashEntropySearchCoreLowMemory, FlashEntropySearchCoreMediumMemory from .version import __version__ diff --git a/ms_entropy/entropy_search/__init__.py b/ms_entropy/entropy_search/__init__.py index b2a1318..4e7c7fa 100644 --- a/ms_entropy/entropy_search/__init__.py +++ b/ms_entropy/entropy_search/__init__.py @@ -1,3 +1,4 @@ from .flash_entropy_search import FlashEntropySearch from .flash_entropy_search_core import FlashEntropySearchCore -from .flash_entropy_search_core_low_memory import FlashEntropySearchCoreLowMemory \ No newline at end of file +from .flash_entropy_search_core_low_memory import FlashEntropySearchCoreLowMemory +from .flash_entropy_search_core_medium_memory import FlashEntropySearchCoreMediumMemory \ No newline at end of file diff --git a/ms_entropy/entropy_search/flash_entropy_search.py b/ms_entropy/entropy_search/flash_entropy_search.py index 1e82d49..7969dc8 100644 --- a/ms_entropy/entropy_search/flash_entropy_search.py +++ b/ms_entropy/entropy_search/flash_entropy_search.py @@ -4,6 +4,7 @@ from pathlib import Path from .flash_entropy_search_core import FlashEntropySearchCore from .flash_entropy_search_core_low_memory import FlashEntropySearchCoreLowMemory +from .flash_entropy_search_core_medium_memory import FlashEntropySearchCoreMediumMemory from ..spectra import clean_spectrum @@ -11,10 +12,14 @@ class FlashEntropySearch: def __init__(self, 
max_ms2_tolerance_in_da=0.024, mz_index_step=0.0001, low_memory=False, path_data=None): self.precursor_mz_array = np.zeros(0, dtype=np.float32) self.low_memory = low_memory - if low_memory: + if low_memory==1: self.entropy_search = FlashEntropySearchCoreLowMemory( path_data=path_data, max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step ) + elif low_memory==2: + self.entropy_search = FlashEntropySearchCoreMediumMemory( + path_data=path_data, max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step + ) else: self.entropy_search = FlashEntropySearchCore(path_data=path_data, max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step) diff --git a/ms_entropy/entropy_search/flash_entropy_search_core_low_memory.py b/ms_entropy/entropy_search/flash_entropy_search_core_low_memory.py index bf488bd..bb1b9d5 100644 --- a/ms_entropy/entropy_search/flash_entropy_search_core_low_memory.py +++ b/ms_entropy/entropy_search/flash_entropy_search_core_low_memory.py @@ -12,6 +12,10 @@ def __init__(self, path_data, max_ms2_tolerance_in_da=0.024, mz_index_step=0.000 self.path_data.mkdir(parents=True, exist_ok=True) self.index_file = [] + def __del__(self): + for file in self.index_file: + file.close() + def _generate_index_from_peak_data(self, peak_data, max_indexed_mz, append): total_peaks_num = peak_data.shape[0] diff --git a/ms_entropy/entropy_search/flash_entropy_search_core_medium_memory.py b/ms_entropy/entropy_search/flash_entropy_search_core_medium_memory.py new file mode 100644 index 0000000..d92a5fe --- /dev/null +++ b/ms_entropy/entropy_search/flash_entropy_search_core_medium_memory.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +import json +import numpy as np +from pathlib import Path +from .flash_entropy_search_core import FlashEntropySearchCore + + +class FlashEntropySearchCoreMediumMemory(FlashEntropySearchCore): + def __init__(self, path_data, max_ms2_tolerance_in_da=0.024, mz_index_step=0.0001) -> None: + 
super().__init__(max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step) + self.path_data = Path(str(path_data)) + self.path_data.mkdir(parents=True, exist_ok=True) + + def _generate_index_from_peak_data(self, peak_data, max_indexed_mz, append): + total_peaks_num = peak_data.shape[0] + + # Sort with precursor m/z. + peak_data.sort(order="ion_mz") + + # Record the m/z, intensity, and spectrum index information for product ions. + (peak_data["ion_mz"]).tofile(self.path_data / "all_ions_mz.npy") + (peak_data["intensity"]).tofile(self.path_data / "all_ions_intensity.npy") + (peak_data["spec_idx"]).tofile(self.path_data / "all_ions_spec_idx.npy") + + # all_ions_mz = self._convert_view_to_array(peak_data.view(np.float32).reshape(total_peaks_num, -1)[:, 0], np.float32, "all_ions_mz") + # all_ions_intensity = self._convert_view_to_array(peak_data.view(np.float32).reshape(total_peaks_num, -1)[:, 2], np.float32, "all_ions_intensity") + # all_ions_spec_idx = self._convert_view_to_array(peak_data.view(np.uint32).reshape(total_peaks_num, -1)[:, 3], np.uint32, "all_ions_spec_idx") + + # Assign the index of the product ions. + peak_data["peak_idx"] = np.arange(0, self.total_peaks_num, dtype=np.uint64) + + # Build index for fast access to the ion's m/z. + all_ions_mz = np.memmap(self.path_data / "all_ions_mz.npy", dtype=np.float32, mode="r", shape=(total_peaks_num,)) + max_mz = min(np.max(all_ions_mz), max_indexed_mz) + search_array = np.arange(0.0, max_mz, self.mz_index_step) + all_ions_mz_idx_start = np.searchsorted(all_ions_mz, search_array, side="left").astype(np.int64) + all_ions_mz_idx_start.tofile(self.path_data / "all_ions_mz_idx_start.npy") + + ############## Step 3: Build the index by sort with neutral loss mass. ############## + # Sort with the neutral loss mass. + peak_data.sort(order="nl_mass") + + # Record the m/z, intensity, spectrum index, and product ions index information for neutral loss ions. 
+ (peak_data["nl_mass"]).tofile(self.path_data / "all_nl_mass.npy") + (peak_data["intensity"]).tofile(self.path_data / "all_nl_intensity.npy") + (peak_data["spec_idx"]).tofile(self.path_data / "all_nl_spec_idx.npy") + (peak_data["peak_idx"]).tofile(self.path_data / "all_ions_idx_for_nl.npy") + + # all_nl_mass = self._convert_view_to_array(peak_data.view(np.float32).reshape(total_peaks_num, -1)[:, 1], np.float32, "all_nl_mass") + # all_nl_intensity = self._convert_view_to_array(peak_data.view(np.float32).reshape(total_peaks_num, -1)[:, 2], np.float32, "all_nl_intensity") + # all_nl_spec_idx = self._convert_view_to_array(peak_data.view(np.uint32).reshape(total_peaks_num, -1)[:, 3], np.uint32, "all_nl_spec_idx") + # all_ions_idx_for_nl = self._convert_view_to_array(peak_data.view(np.uint64).reshape(total_peaks_num, -1)[:, 2], np.uint64, "all_ions_idx_for_nl") + + # Build the index for fast access to the neutral loss mass. + all_nl_mass = np.memmap(self.path_data / "all_nl_mass.npy", dtype=np.float32, mode="r", shape=(total_peaks_num,)) + max_mz = min(np.max(all_nl_mass), max_indexed_mz) + search_array = np.arange(0.0, max_mz, self.mz_index_step) + all_nl_mass_idx_start = np.searchsorted(all_nl_mass, search_array, side="left").astype(np.int64) + all_nl_mass_idx_start.tofile(self.path_data / "all_nl_mass_idx_start.npy") + + ############## Step 4: Save the index. ############## + self.write() + self.read() + return self.index + + def read(self, path_data=None): + """ + Read the index from the file. 
+ """ + if path_data is not None: + self.path_data = Path(path_data) + + try: + self.index = [] + for name in self.index_names: + self.index.append(np.memmap(self.path_data / f"{name}.npy", dtype=self.index_dtypes[name], mode="r")) + + with open(self.path_data / "information.json", "r") as f: + information = json.load(f) + self.mz_index_step = information["mz_index_step"] + self.total_spectra_num = information["total_spectra_num"] + self.total_peaks_num = information["total_peaks_num"] + self.max_ms2_tolerance_in_da = information["max_ms2_tolerance_in_da"] + return True + except: + return False + + def write(self, path_data=None): + """ + Write the index to the file. + """ + if path_data is not None: + assert Path(path_data) == self.path_data, "The path_data is not the same as the path_data in the class." + + information = { + "mz_index_step": float(self.mz_index_step), + "total_spectra_num": int(self.total_spectra_num), + "total_peaks_num": int(self.total_peaks_num), + "max_ms2_tolerance_in_da": float(self.max_ms2_tolerance_in_da), + } + json.dump(information, open(self.path_data / "information.json", "w")) diff --git a/ms_entropy/version.py b/ms_entropy/version.py index 7bb021e..58d478a 100644 --- a/ms_entropy/version.py +++ b/ms_entropy/version.py @@ -1 +1 @@ -__version__ = '1.1.3' +__version__ = '1.2.0' diff --git a/tests/test_entropy_search.py b/tests/test_entropy_search.py index 07868fe..16f2bea 100644 --- a/tests/test_entropy_search.py +++ b/tests/test_entropy_search.py @@ -57,7 +57,7 @@ def test_open_search(self): np.testing.assert_almost_equal(similarity, [1.0, 0.22299, 0.22299, 0.44598], decimal=5) similarity, matched_peaks = self.flash_entropy.open_search( peaks=self.query_spectrum["peaks"], ms2_tolerance_in_da=0.02, output_matched_peak_number=True - ) + ) np.testing.assert_almost_equal(similarity, [1.0, 0.22299, 0.22299, 0.44598], decimal=5) np.testing.assert_almost_equal(matched_peaks, [4, 1, 1, 2], decimal=5) @@ -77,6 +77,68 @@ def 
test_identity_search(self): np.testing.assert_almost_equal(matched_peaks, [4, 0, 0, 0], decimal=5) +class TestFlashEntropySearchWithCpuLowMemory(TestFlashEntropySearchWithCpu): + def setUp(self): + spectral_library = [ + {"id": "Demo spectrum 1", "precursor_mz": 150.0, "peaks": np.array([[100.0, 1.0], [101.0, 1.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32)}, + { + "id": "Demo spectrum 2", + "precursor_mz": 220.0, + "peaks": np.array([[200.0, 1.0], [101.0, 1.0], [202.0, 1.0], [204.0, 1.0], [205.0, 1.0]], dtype=np.float32), + }, + { + "id": "Demo spectrum 3", + "precursor_mz": 250.0, + "peaks": np.array([[100.0, 1.0], [201.0, 1.0], [202.0, 1.0], [104.0, 1.0], [105.0, 1.0]], dtype=np.float32), + }, + {"id": "Demo spectrum 4", "precursor_mz": 350.0, "peaks": [[100.0, 1.0], [101.0, 1.0], [302.0, 1.0], [104.0, 1.0], [105.0, 1.0]]}, + ] + query_spectrum = {"precursor_mz": 150.0, "peaks": np.array([[100.0, 1.0], [101.0, 1.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32)} + + self.flash_entropy = FlashEntropySearch() + self.flash_entropy.build_index(spectral_library) + path_test = tempfile.mkdtemp() + self.flash_entropy.write(path_test) + self.flash_entropy = FlashEntropySearch(low_memory=1) + self.flash_entropy.read(path_test) + query_spectrum["peaks"] = self.flash_entropy.clean_spectrum_for_search(precursor_mz=query_spectrum["precursor_mz"], peaks=query_spectrum["peaks"]) + self.query_spectrum = query_spectrum + + def test_read_and_write(self): + pass + + +class TestFlashEntropySearchWithCpuMediumMemory(TestFlashEntropySearchWithCpu): + def setUp(self): + spectral_library = [ + {"id": "Demo spectrum 1", "precursor_mz": 150.0, "peaks": np.array([[100.0, 1.0], [101.0, 1.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32)}, + { + "id": "Demo spectrum 2", + "precursor_mz": 220.0, + "peaks": np.array([[200.0, 1.0], [101.0, 1.0], [202.0, 1.0], [204.0, 1.0], [205.0, 1.0]], dtype=np.float32), + }, + { + "id": "Demo spectrum 3", + "precursor_mz": 250.0, + "peaks": 
np.array([[100.0, 1.0], [201.0, 1.0], [202.0, 1.0], [104.0, 1.0], [105.0, 1.0]], dtype=np.float32), + }, + {"id": "Demo spectrum 4", "precursor_mz": 350.0, "peaks": [[100.0, 1.0], [101.0, 1.0], [302.0, 1.0], [104.0, 1.0], [105.0, 1.0]]}, + ] + query_spectrum = {"precursor_mz": 150.0, "peaks": np.array([[100.0, 1.0], [101.0, 1.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32)} + + self.flash_entropy = FlashEntropySearch() + self.flash_entropy.build_index(spectral_library) + path_test = tempfile.mkdtemp() + self.flash_entropy.write(path_test) + self.flash_entropy = FlashEntropySearch(low_memory=2) + self.flash_entropy.read(path_test) + query_spectrum["peaks"] = self.flash_entropy.clean_spectrum_for_search(precursor_mz=query_spectrum["precursor_mz"], peaks=query_spectrum["peaks"]) + self.query_spectrum = query_spectrum + + def test_read_and_write(self): + pass + + # class TestFlashEntropySearchWithGpu(TestFlashEntropySearchWithCpu): # def test_hybrid_search(self): # similarity = self.flash_entropy.hybrid_search(precursor_mz=self.query_spectrum['precursor_mz'],