Commit dbecc06
Add the function to calculate unweighted entropy similarity.
YuanyueLi committed Jun 25, 2024
1 parent 0ec97f4 commit dbecc06
Showing 6 changed files with 95 additions and 14 deletions.
31 changes: 25 additions & 6 deletions ms_entropy/entropy_search/flash_entropy_search.py
@@ -9,19 +9,38 @@


 class FlashEntropySearch:
-    def __init__(self, max_ms2_tolerance_in_da=0.024, mz_index_step=0.0001, low_memory=False, path_data=None):
+    def __init__(
+        self,
+        max_ms2_tolerance_in_da=0.024,
+        mz_index_step=0.0001,
+        low_memory=False,
+        path_data=None,
+        intensity_weight="entropy",
+    ):
         """
         Initialize the EntropySearch class.
         :param max_ms2_tolerance_in_da: The maximum MS2 tolerance in Da.
         :param mz_index_step: The step size for the m/z index.
         :param low_memory: The memory usage mode, can be 0, 1, or 2. 0 means normal mode, 1 means low memory mode, and 2 means medium memory mode.
         :param path_data: The path to save the index data.
+        :param intensity_weight: The weight for the intensity in the entropy calculation, can be "entropy" or None. Default is "entropy".
+            - None: The intensities are not weighted, and the unweighted entropy similarity is calculated.
+            - "entropy": The intensities are weighted by the spectral entropy, and the entropy similarity is calculated.
         """
         self.precursor_mz_array = np.zeros(0, dtype=np.float32)
         self.low_memory = low_memory
-        if low_memory==1:
+        if low_memory == 1:
             self.entropy_search = FlashEntropySearchCoreLowMemory(
-                path_data=path_data, max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step
+                path_data=path_data, max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step, intensity_weight=intensity_weight
             )
-        elif low_memory==2:
+        elif low_memory == 2:
             self.entropy_search = FlashEntropySearchCoreMediumMemory(
-                path_data=path_data, max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step
+                path_data=path_data, max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step, intensity_weight=intensity_weight
             )
         else:
-            self.entropy_search = FlashEntropySearchCore(path_data=path_data, max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step)
+            self.entropy_search = FlashEntropySearchCore(
+                path_data=path_data, max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step, intensity_weight=intensity_weight
+            )

     def identity_search(self, precursor_mz, peaks, ms1_tolerance_in_da, ms2_tolerance_in_da, target="cpu", output_matched_peak_number=False, **kwargs):
         """
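With this change, passing intensity_weight=None when constructing FlashEntropySearch switches the whole index to unweighted scoring. A minimal usage sketch, assuming FlashEntropySearch is importable from the ms_entropy package top level (spectra and tolerances are borrowed from the new tests added below):

import numpy as np
from ms_entropy import FlashEntropySearch

# Build the index with unweighted scoring instead of the default entropy weighting.
flash_entropy = FlashEntropySearch(intensity_weight=None)
flash_entropy.build_index([
    {"id": "Demo spectrum 1", "precursor_mz": 150.0,
     "peaks": np.array([[100.0, 1.0], [101.0, 10.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32)},
    {"id": "Demo spectrum 2", "precursor_mz": 220.0,
     "peaks": np.array([[200.0, 1.0], [101.0, 1.0], [202.0, 1.0], [204.0, 1.0], [205.0, 1.0]], dtype=np.float32)},
])

# Queries are cleaned exactly as in the weighted case; only the scoring changes.
query_peaks = flash_entropy.clean_spectrum_for_search(
    precursor_mz=150.0,
    peaks=np.array([[100.0, 1.0], [101.0, 10.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32),
)
similarity = flash_entropy.open_search(peaks=query_peaks, ms2_tolerance_in_da=0.02)
# similarity holds one unweighted entropy similarity per library spectrum.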
20 changes: 17 additions & 3 deletions ms_entropy/entropy_search/flash_entropy_search_core.py
@@ -9,17 +9,26 @@


 class FlashEntropySearchCore:
-    def __init__(self, path_data=None, max_ms2_tolerance_in_da=0.024, mz_index_step=0.0001) -> None:
+    def __init__(
+        self,
+        path_data=None,
+        max_ms2_tolerance_in_da=0.024,
+        mz_index_step=0.0001,
+        intensity_weight="entropy",  # "entropy" or None
+    ) -> None:
         """
         Initialize the EntropySearch class.
         :param path_array: The path array of the index files.
         :param max_ms2_tolerance_in_da: The maximum MS2 tolerance used when searching the MS/MS spectra, in Dalton. Default is 0.024.
         :param mz_index_step: The step size of the m/z index, in Dalton. Default is 0.0001.
             The smaller the step size, the faster the search, but the larger the index size and longer the index building time.
+        :param intensity_weight: The weight of the intensity, can be "entropy" or None. If set to "entropy", the intensity will be weighted by the entropy.
+            If set to None, the intensity will not be weighted, which is equivalent to the unweighted entropy similarity.
         """
         self.mz_index_step = mz_index_step
         self._init_for_multiprocessing = False
         self.max_ms2_tolerance_in_da = max_ms2_tolerance_in_da
+        self.intensity_weight = intensity_weight

         self.total_spectra_num = 0
         self.total_peaks_num = 0
@@ -511,8 +520,13 @@ def _preprocess_peaks(self, peaks):
"""
Preprocess the peaks.
"""
peaks_clean = np.asarray(apply_weight_to_intensity(peaks))
peaks_clean[:, 1] /= 2
if self.intensity_weight == "entropy":
peaks_clean = np.asarray(apply_weight_to_intensity(peaks))
peaks_clean[:, 1] /= 2
elif self.intensity_weight is None:
peaks_clean = peaks.copy()
peaks_clean[:, 1] /= 2

return peaks_clean

def _score_peaks_with_cpu(self, intensity_query, intensity_library):
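With the default "entropy" weight, _preprocess_peaks still reweights the intensities with apply_weight_to_intensity before halving them; with None it skips the reweighting and only applies the same halving, so the downstream peak scoring yields the unweighted entropy similarity. For reference, a back-of-the-envelope sketch of that unweighted score for two spectra already aligned on a common m/z grid (illustrative only, not the library's internal scoring path):

import numpy as np

def unweighted_entropy_similarity(intensity_a, intensity_b):
    # Intensities of both spectra on the same m/z grid (zeros where a spectrum has no peak).
    a = np.asarray(intensity_a, dtype=np.float64)
    b = np.asarray(intensity_b, dtype=np.float64)
    a, b = a / a.sum(), b / b.sum()  # normalize each spectrum to a sum of 1
    ab = (a + b) / 2                 # merged spectrum, each side contributing half

    def shannon_entropy(x):
        x = x[x > 0]
        return -np.sum(x * np.log(x))

    # 1 - (2*S_AB - S_A - S_B) / ln(4): 1 for identical spectra, 0 for fully disjoint ones.
    return 1 - (2 * shannon_entropy(ab) - shannon_entropy(a) - shannon_entropy(b)) / np.log(4)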
@@ -6,8 +6,8 @@


 class FlashEntropySearchCoreLowMemory(FlashEntropySearchCore):
-    def __init__(self, path_data, max_ms2_tolerance_in_da=0.024, mz_index_step=0.0001) -> None:
-        super().__init__(max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step)
+    def __init__(self, path_data, max_ms2_tolerance_in_da=0.024, mz_index_step=0.0001, intensity_weight="entropy") -> None:
+        super().__init__(max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step, intensity_weight=intensity_weight)
         self.path_data = Path(str(path_data))
         self.path_data.mkdir(parents=True, exist_ok=True)
         self.index_file = []
@@ -6,8 +6,8 @@


 class FlashEntropySearchCoreMediumMemory(FlashEntropySearchCore):
-    def __init__(self, path_data, max_ms2_tolerance_in_da=0.024, mz_index_step=0.0001) -> None:
-        super().__init__(max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step)
+    def __init__(self, path_data, max_ms2_tolerance_in_da=0.024, mz_index_step=0.0001, intensity_weight="entropy") -> None:
+        super().__init__(max_ms2_tolerance_in_da=max_ms2_tolerance_in_da, mz_index_step=mz_index_step, intensity_weight=intensity_weight)
         self.path_data = Path(str(path_data))
         self.path_data.mkdir(parents=True, exist_ok=True)

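The low-memory and medium-memory cores simply forward the new keyword to FlashEntropySearchCore, so unweighted search also works with an on-disk index; a minimal sketch (the path_data value here is an arbitrary, hypothetical directory):

from ms_entropy import FlashEntropySearch

flash_entropy = FlashEntropySearch(low_memory=1, path_data="./entropy_index", intensity_weight=None)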
2 changes: 1 addition & 1 deletion ms_entropy/version.py
@@ -1 +1 @@
-__version__ = '1.2.2'
+__version__ = '1.3.0'
48 changes: 48 additions & 0 deletions tests/test_entropy_search.py
@@ -77,6 +77,54 @@ def test_identity_search(self):
         np.testing.assert_almost_equal(matched_peaks, [4, 0, 0, 0], decimal=5)


+class TestUnweightedFlashEntropySearchWithCpu(unittest.TestCase):
+    def setUp(self):
+        spectral_library = [
+            {"id": "Demo spectrum 1", "precursor_mz": 150.0, "peaks": np.array([[100.0, 1.0], [101.0, 10.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32)},
+            {
+                "id": "Demo spectrum 2",
+                "precursor_mz": 220.0,
+                "peaks": np.array([[200.0, 1.0], [101.0, 1.0], [202.0, 1.0], [204.0, 1.0], [205.0, 1.0]], dtype=np.float32),
+            },
+            {
+                "id": "Demo spectrum 3",
+                "precursor_mz": 250.0,
+                "peaks": np.array([[100.0, 1.0], [201.0, 1.0], [202.0, 1.0], [104.0, 1.0], [105.0, 1.0]], dtype=np.float32),
+            },
+            {"id": "Demo spectrum 4", "precursor_mz": 350.0, "peaks": [[100.0, 1.0], [101.0, 1.0], [302.0, 1.0], [104.0, 1.0], [105.0, 1.0]]},
+        ]
+        query_spectrum = {"precursor_mz": 150.0, "peaks": np.array([[100.0, 1.0], [101.0, 10.0], [102.0, 1.0], [103.0, 1.0]], dtype=np.float32)}
+
+        self.flash_entropy = FlashEntropySearch(intensity_weight=None)
+        self.flash_entropy.build_index(spectral_library)
+        query_spectrum["peaks"] = self.flash_entropy.clean_spectrum_for_search(precursor_mz=query_spectrum["precursor_mz"], peaks=query_spectrum["peaks"])
+        self.query_spectrum = query_spectrum
+
+    def test_open_search(self):
+        similarity = self.flash_entropy.open_search(peaks=self.query_spectrum["peaks"], ms2_tolerance_in_da=0.02)
+        np.testing.assert_almost_equal(similarity, [1.0, 0.356, 0.118, 0.474], decimal=3)
+        similarity, matched_peaks = self.flash_entropy.open_search(
+            peaks=self.query_spectrum["peaks"], ms2_tolerance_in_da=0.02, output_matched_peak_number=True
+        )
+        np.testing.assert_almost_equal(similarity, [1.0, 0.356, 0.118, 0.474], decimal=3)
+        np.testing.assert_almost_equal(matched_peaks, [4, 1, 1, 2], decimal=5)
+
+    def test_identity_search(self):
+        similarity = self.flash_entropy.identity_search(
+            precursor_mz=self.query_spectrum["precursor_mz"], peaks=self.query_spectrum["peaks"], ms1_tolerance_in_da=0.01, ms2_tolerance_in_da=0.02
+        )
+        np.testing.assert_almost_equal(similarity, [1.0, 0.0, 0.0, 0.0], decimal=5)
+        similarity, matched_peaks = self.flash_entropy.identity_search(
+            precursor_mz=self.query_spectrum["precursor_mz"],
+            peaks=self.query_spectrum["peaks"],
+            ms1_tolerance_in_da=0.01,
+            ms2_tolerance_in_da=0.02,
+            output_matched_peak_number=True,
+        )
+        np.testing.assert_almost_equal(similarity, [1.0, 0.0, 0.0, 0.0], decimal=5)
+        np.testing.assert_almost_equal(matched_peaks, [4, 0, 0, 0], decimal=5)
+
+
 class TestFlashEntropySearchWithCpuLowMemory(TestFlashEntropySearchWithCpu):
     def setUp(self):
         spectral_library = [
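As a sanity check, the expected open-search score of 0.356 for "Demo spectrum 2" can be reproduced with the unweighted formula sketched above: the cleaned query has intensities 10/13 at m/z 101 and 1/13 at each of m/z 100, 102 and 103 (entropy about 0.794), the library spectrum has five equal peaks (entropy ln 5, about 1.609), their half-and-half merge has entropy about 1.648, and 1 - (2*1.648 - 0.794 - 1.609)/ln 4 gives about 0.356.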
