Skip to content

Commit

Permalink
Add the function for medium memory.
Browse files Browse the repository at this point in the history
Close opened file in low memory mode.
  • Loading branch information
YuanyueLi committed May 22, 2024
1 parent 7be7594 commit 0390ac0
Show file tree
Hide file tree
Showing 8 changed files with 187 additions and 11 deletions.
11 changes: 8 additions & 3 deletions docs/source/entropy_search_advanced_usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@ Run Flash entropy search with limited memory

This method is useful when you are dealing with a very large spectral library and your computer's memory is limited.

To achieve this, while constructing the ``FlashEntropySearch`` object, you need to set the ``path_data`` parameter to the path of the index file, and set the ``low_memory`` parameter to ``True``. Then read the pre-built index file by calling the ``read`` method. After that, the rest of the code is the same as usual.
To achieve this, while constructing the ``FlashEntropySearch`` object, you need to set the ``path_data`` parameter to the path of the index file, and set the ``low_memory`` parameter to ``1`` or ``2``. Then read the pre-built index file by calling the ``read`` method. After that, the rest of the code is the same as usual.

The ``low_memory`` parameter has three values:
- False or 0: Normal mode. All the index will be loaded into memory, this is the default mode and the fastest mode.
- True or 1: Low memory mode. Only load the necessary data into memory; this mode needs the least memory, but the search speed will be the slowest, as it reads all the data from the disk every time.
- 2: Medium memory mode using memmap. This mode is similar to mode 1, but it uses ``numpy.memmap`` to map the index file into memory, which makes it faster than mode 1 when enough memory is available for the page cache.

.. code-block:: python
Expand All @@ -17,14 +22,14 @@ To achieve this, while constructing the ``FlashEntropySearch`` object, you need
# Instead of using this:
# entropy_search = FlashEntropySearch()
# Use this:
entropy_search = FlashEntropySearch(path_data='path/to/library/index', low_memory=True)
entropy_search = FlashEntropySearch(path_data='path/to/library/index', low_memory=1)
entropy_search.read()
# Then the rest of the code is the same as usual.
# entropy_search.search(...)
# ...... (the rest of the code is the same as usual)
The index built in normal mode and low memory mode is identical. If you use our ``write`` and ``read`` methods to save and load the index, you can use the index in normal mode and low memory mode interchangeably. For example, you can build the index in normal mode, save it to disk with the ``write`` method. After that, you can initialize the ``FlashEntropySearch`` object with ``path_data`` parameter which points to the index file, and set ``low_memory`` parameter to ``True``, then call the ``read`` method to load the index, and proceed with the search as usual.
The index built in normal mode and low memory mode is identical. If you use our ``write`` and ``read`` methods to save and load the index, you can use the index in normal mode and low memory mode interchangeably. For example, you can build the index in normal mode, save it to disk with the ``write`` method. After that, you can initialize the ``FlashEntropySearch`` object with ``path_data`` parameter which points to the index file, and set ``low_memory`` parameter to ``1``, then call the ``read`` method to load the index, and proceed with the search as usual.


Run Flash entropy search with multiple cores
Expand Down
6 changes: 2 additions & 4 deletions ms_entropy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
calculate_unweighted_entropy_similarity,
apply_weight_to_intensity,
)
from .file_io import (
read_one_spectrum, standardize_spectrum
)
from .entropy_search import FlashEntropySearch, FlashEntropySearchCore, FlashEntropySearchCoreLowMemory
from .file_io import read_one_spectrum, standardize_spectrum
from .entropy_search import FlashEntropySearch, FlashEntropySearchCore, FlashEntropySearchCoreLowMemory, FlashEntropySearchCoreMediumMemory
from .version import __version__
3 changes: 2 additions & 1 deletion ms_entropy/entropy_search/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .flash_entropy_search import FlashEntropySearch
from .flash_entropy_search_core import FlashEntropySearchCore
from .flash_entropy_search_core_low_memory import FlashEntropySearchCoreLowMemory
from .flash_entropy_search_core_low_memory import FlashEntropySearchCoreLowMemory
from .flash_entropy_search_core_medium_memory import FlashEntropySearchCoreMediumMemory
7 changes: 6 additions & 1 deletion ms_entropy/entropy_search/flash_entropy_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,22 @@
from pathlib import Path
from .flash_entropy_search_core import FlashEntropySearchCore
from .flash_entropy_search_core_low_memory import FlashEntropySearchCoreLowMemory
from .flash_entropy_search_core_medium_memory import FlashEntropySearchCoreMediumMemory
from ..spectra import clean_spectrum


class FlashEntropySearch:
def __init__(self, max_ms2_tolerance_in_da=0.024, mz_index_step=0.0001, low_memory=False, path_data=None):
    """Create a Flash entropy search engine.

    :param max_ms2_tolerance_in_da: maximum MS2 tolerance (in Da) supported by the index.
    :param mz_index_step: m/z step used for the fast-access index.
    :param low_memory: 0/False = fully in-memory core (default, fastest);
        1/True = low-memory core; 2 = medium-memory (memmap-backed) core.
    :param path_data: directory holding (or receiving) the on-disk index.
    """
    self.precursor_mz_array = np.zeros(0, dtype=np.float32)
    self.low_memory = low_memory
    # Select the search-core implementation matching the requested memory mode.
    if low_memory == 1:
        core_class = FlashEntropySearchCoreLowMemory
    elif low_memory == 2:
        core_class = FlashEntropySearchCoreMediumMemory
    else:
        core_class = FlashEntropySearchCore
    self.entropy_search = core_class(path_data=path_data, max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ def __init__(self, path_data, max_ms2_tolerance_in_da=0.024, mz_index_step=0.000
self.path_data.mkdir(parents=True, exist_ok=True)
self.index_file = []

def __del__(self):
    # Close any index files that are still open in low-memory mode.
    # Use getattr with a default because __del__ can run even when __init__
    # failed before ``index_file`` was assigned; an AttributeError raised here
    # would surface as an "exception ignored in __del__" warning.
    for file in getattr(self, "index_file", []):
        file.close()

def _generate_index_from_peak_data(self, peak_data, max_indexed_mz, append):
total_peaks_num = peak_data.shape[0]

Expand Down
101 changes: 101 additions & 0 deletions ms_entropy/entropy_search/flash_entropy_search_core_medium_memory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/usr/bin/env python3
import json
import numpy as np
from pathlib import Path
from .flash_entropy_search_core import FlashEntropySearchCore


class FlashEntropySearchCoreMediumMemory(FlashEntropySearchCore):
    """Entropy-search core that keeps the index on disk and accesses it with
    ``numpy.memmap`` instead of loading every array into RAM.

    The index layout on disk is identical to the other cores; only the access
    strategy differs, so indexes written by one core can be read by another.
    """

    def __init__(self, path_data, max_ms2_tolerance_in_da=0.024, mz_index_step=0.0001) -> None:
        """
        :param path_data: directory used to store the on-disk index files
            (created if it does not exist).
        :param max_ms2_tolerance_in_da: maximum MS2 tolerance (in Da) supported by the index.
        :param mz_index_step: m/z step of the fast-access index arrays.
        """
        super().__init__(max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step)
        self.path_data = Path(path_data)
        self.path_data.mkdir(parents=True, exist_ok=True)

    def _generate_index_from_peak_data(self, peak_data, max_indexed_mz, append):
        """Build the on-disk index from the structured ``peak_data`` array,
        then memory-map it back in via :meth:`read`.

        :param peak_data: structured array with "ion_mz", "nl_mass",
            "intensity", "spec_idx" and "peak_idx" fields; sorted in place.
        :param max_indexed_mz: upper m/z bound for the fast-access index.
        :param append: accepted for signature compatibility with the base
            class; not used by this implementation.
        :return: the freshly memory-mapped index (``self.index``).
        """
        total_peaks_num = peak_data.shape[0]

        ############## Step 1: Build the index by sorting with product-ion m/z. ##############
        peak_data.sort(order="ion_mz")

        # Record the m/z, intensity, and spectrum index information for product ions.
        (peak_data["ion_mz"]).tofile(self.path_data / "all_ions_mz.npy")
        (peak_data["intensity"]).tofile(self.path_data / "all_ions_intensity.npy")
        (peak_data["spec_idx"]).tofile(self.path_data / "all_ions_spec_idx.npy")

        # Assign the index of the product ions.
        # NOTE(review): this reads self.total_peaks_num, which is assumed to
        # equal peak_data.shape[0] (set by the caller) — confirm upstream.
        peak_data["peak_idx"] = np.arange(0, self.total_peaks_num, dtype=np.uint64)

        ############## Step 2: Build the fast-access index for the ion's m/z. ##############
        all_ions_mz = np.memmap(self.path_data / "all_ions_mz.npy", dtype=np.float32, mode="r", shape=(total_peaks_num,))
        max_mz = min(np.max(all_ions_mz), max_indexed_mz)
        search_array = np.arange(0.0, max_mz, self.mz_index_step)
        all_ions_mz_idx_start = np.searchsorted(all_ions_mz, search_array, side="left").astype(np.int64)
        all_ions_mz_idx_start.tofile(self.path_data / "all_ions_mz_idx_start.npy")

        ############## Step 3: Build the index by sorting with neutral loss mass. ##############
        peak_data.sort(order="nl_mass")

        # Record the m/z, intensity, spectrum index, and product ions index information for neutral loss ions.
        (peak_data["nl_mass"]).tofile(self.path_data / "all_nl_mass.npy")
        (peak_data["intensity"]).tofile(self.path_data / "all_nl_intensity.npy")
        (peak_data["spec_idx"]).tofile(self.path_data / "all_nl_spec_idx.npy")
        (peak_data["peak_idx"]).tofile(self.path_data / "all_ions_idx_for_nl.npy")

        # Build the fast-access index for the neutral loss mass.
        all_nl_mass = np.memmap(self.path_data / "all_nl_mass.npy", dtype=np.float32, mode="r", shape=(total_peaks_num,))
        max_mz = min(np.max(all_nl_mass), max_indexed_mz)
        search_array = np.arange(0.0, max_mz, self.mz_index_step)
        all_nl_mass_idx_start = np.searchsorted(all_nl_mass, search_array, side="left").astype(np.int64)
        all_nl_mass_idx_start.tofile(self.path_data / "all_nl_mass_idx_start.npy")

        ############## Step 4: Save the metadata and memory-map the index. ##############
        self.write()
        self.read()
        return self.index

    def read(self, path_data=None):
        """
        Read (memory-map) the index from disk.

        :param path_data: optional override for the index directory.
        :return: True on success, False if the index could not be loaded.
        """
        if path_data is not None:
            self.path_data = Path(path_data)

        try:
            self.index = []
            for name in self.index_names:
                self.index.append(np.memmap(self.path_data / f"{name}.npy", dtype=self.index_dtypes[name], mode="r"))

            with open(self.path_data / "information.json", "r") as f:
                information = json.load(f)
            self.mz_index_step = information["mz_index_step"]
            self.total_spectra_num = information["total_spectra_num"]
            self.total_peaks_num = information["total_peaks_num"]
            self.max_ms2_tolerance_in_da = information["max_ms2_tolerance_in_da"]
            return True
        except (OSError, ValueError, KeyError):
            # Missing or corrupt index files / metadata (OSError covers
            # FileNotFoundError; ValueError covers bad memmaps and JSON
            # decode errors): report failure instead of raising.
            return False

    def write(self, path_data=None):
        """
        Write the index metadata to disk.

        The index arrays themselves are written by
        :meth:`_generate_index_from_peak_data`; this method only saves
        ``information.json``.

        :param path_data: if given, must match ``self.path_data``.
        :raises ValueError: if ``path_data`` differs from ``self.path_data``.
        """
        if path_data is not None and Path(path_data) != self.path_data:
            # Raise a real exception instead of using ``assert`` so the check
            # is not stripped under ``python -O``.
            raise ValueError("The path_data is not the same as the path_data in the class.")

        information = {
            "mz_index_step": float(self.mz_index_step),
            "total_spectra_num": int(self.total_spectra_num),
            "total_peaks_num": int(self.total_peaks_num),
            "max_ms2_tolerance_in_da": float(self.max_ms2_tolerance_in_da),
        }
        # Use a context manager so the file handle is closed promptly.
        with open(self.path_data / "information.json", "w") as f:
            json.dump(information, f)
2 changes: 1 addition & 1 deletion ms_entropy/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.1.3'
__version__ = '1.2.0'
64 changes: 63 additions & 1 deletion tests/test_entropy_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def test_open_search(self):
np.testing.assert_almost_equal(similarity, [1.0, 0.22299, 0.22299, 0.44598], decimal=5)
similarity, matched_peaks = self.flash_entropy.open_search(
peaks=self.query_spectrum["peaks"], ms2_tolerance_in_da=0.02, output_matched_peak_number=True
)
)
np.testing.assert_almost_equal(similarity, [1.0, 0.22299, 0.22299, 0.44598], decimal=5)
np.testing.assert_almost_equal(matched_peaks, [4, 1, 1, 2], decimal=5)

Expand All @@ -77,6 +77,68 @@ def test_identity_search(self):
np.testing.assert_almost_equal(matched_peaks, [4, 0, 0, 0], decimal=5)


class TestFlashEntropySearchWithCpuLowMemory(TestFlashEntropySearchWithCpu):
    """Re-run the CPU test suite against an index reloaded in low-memory mode
    (``low_memory=1``)."""

    def setUp(self):
        """Build a normal-mode index, write it to a temporary directory, then
        reload it with a low-memory search engine."""
        import shutil  # local import: only needed for temp-dir cleanup

        spectral_library = [
            {"id": "Demo spectrum 1", "precursor_mz": 150.0, "peaks": np.array([[100.0, 1.0], [101.0, 1.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32)},
            {
                "id": "Demo spectrum 2",
                "precursor_mz": 220.0,
                "peaks": np.array([[200.0, 1.0], [101.0, 1.0], [202.0, 1.0], [204.0, 1.0], [205.0, 1.0]], dtype=np.float32),
            },
            {
                "id": "Demo spectrum 3",
                "precursor_mz": 250.0,
                "peaks": np.array([[100.0, 1.0], [201.0, 1.0], [202.0, 1.0], [104.0, 1.0], [105.0, 1.0]], dtype=np.float32),
            },
            # Intentionally a plain list (not ndarray) to cover that input form.
            {"id": "Demo spectrum 4", "precursor_mz": 350.0, "peaks": [[100.0, 1.0], [101.0, 1.0], [302.0, 1.0], [104.0, 1.0], [105.0, 1.0]]},
        ]
        query_spectrum = {"precursor_mz": 150.0, "peaks": np.array([[100.0, 1.0], [101.0, 1.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32)}

        self.flash_entropy = FlashEntropySearch()
        self.flash_entropy.build_index(spectral_library)
        path_test = tempfile.mkdtemp()
        # Remove the temporary index directory after the test instead of
        # leaking it; ignore_errors covers files still memory-mapped/open.
        self.addCleanup(shutil.rmtree, path_test, ignore_errors=True)
        self.flash_entropy.write(path_test)
        self.flash_entropy = FlashEntropySearch(low_memory=1)
        self.flash_entropy.read(path_test)
        query_spectrum["peaks"] = self.flash_entropy.clean_spectrum_for_search(precursor_mz=query_spectrum["precursor_mz"], peaks=query_spectrum["peaks"])
        self.query_spectrum = query_spectrum

    def test_read_and_write(self):
        # The base-class read/write round-trip test does not apply here: the
        # index is already written and re-read from disk in setUp.
        pass


class TestFlashEntropySearchWithCpuMediumMemory(TestFlashEntropySearchWithCpu):
    """Re-run the CPU test suite against an index reloaded in medium-memory
    (memmap) mode (``low_memory=2``)."""

    def setUp(self):
        """Build a normal-mode index, write it to a temporary directory, then
        reload it with a medium-memory (memmap-backed) search engine."""
        import shutil  # local import: only needed for temp-dir cleanup

        spectral_library = [
            {"id": "Demo spectrum 1", "precursor_mz": 150.0, "peaks": np.array([[100.0, 1.0], [101.0, 1.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32)},
            {
                "id": "Demo spectrum 2",
                "precursor_mz": 220.0,
                "peaks": np.array([[200.0, 1.0], [101.0, 1.0], [202.0, 1.0], [204.0, 1.0], [205.0, 1.0]], dtype=np.float32),
            },
            {
                "id": "Demo spectrum 3",
                "precursor_mz": 250.0,
                "peaks": np.array([[100.0, 1.0], [201.0, 1.0], [202.0, 1.0], [104.0, 1.0], [105.0, 1.0]], dtype=np.float32),
            },
            # Intentionally a plain list (not ndarray) to cover that input form.
            {"id": "Demo spectrum 4", "precursor_mz": 350.0, "peaks": [[100.0, 1.0], [101.0, 1.0], [302.0, 1.0], [104.0, 1.0], [105.0, 1.0]]},
        ]
        query_spectrum = {"precursor_mz": 150.0, "peaks": np.array([[100.0, 1.0], [101.0, 1.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32)}

        self.flash_entropy = FlashEntropySearch()
        self.flash_entropy.build_index(spectral_library)
        path_test = tempfile.mkdtemp()
        # Remove the temporary index directory after the test instead of
        # leaking it; ignore_errors covers files still memory-mapped/open.
        self.addCleanup(shutil.rmtree, path_test, ignore_errors=True)
        self.flash_entropy.write(path_test)
        self.flash_entropy = FlashEntropySearch(low_memory=2)
        self.flash_entropy.read(path_test)
        query_spectrum["peaks"] = self.flash_entropy.clean_spectrum_for_search(precursor_mz=query_spectrum["precursor_mz"], peaks=query_spectrum["peaks"])
        self.query_spectrum = query_spectrum

    def test_read_and_write(self):
        # The base-class read/write round-trip test does not apply here: the
        # index is already written and re-read from disk in setUp.
        pass


# class TestFlashEntropySearchWithGpu(TestFlashEntropySearchWithCpu):
# def test_hybrid_search(self):
# similarity = self.flash_entropy.hybrid_search(precursor_mz=self.query_spectrum['precursor_mz'],
Expand Down

0 comments on commit 0390ac0

Please sign in to comment.