Skip to content

Commit

Permalink
feat: add methods for computing run lengths of repeats in a sequence (#…
Browse files Browse the repository at this point in the history
…162)

* feat: add methods for computing run lengths of repeats in a sequence

The `longest_dinucleotide_run_length` method is optimized for dinucleotides,
whereas the `longest_multinucleotide_run_length` method is general for
any repeat of length N.
  • Loading branch information
nh13 authored Aug 15, 2024
1 parent e022755 commit 1f20e56
Show file tree
Hide file tree
Showing 2 changed files with 166 additions and 19 deletions.
116 changes: 97 additions & 19 deletions fgpyo/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,29 +76,11 @@ def reverse_complement(bases: str) -> str:
return "".join([_COMPLEMENTS[b] for b in bases[::-1]])


def longest_hp_length(bases: str) -> int:
"""Calculates the length of the longest homopolymer in the input sequence."""
max_hp = 0
i = 0
# NB: if we have found a homopolymer of length `max_hp`, then we do not need
# to examine the last `max_hp` bases since we'll never find a longer one.
bases_len = len(bases)
while i < bases_len - max_hp:
base = bases[i]
j = i + 1
while j < bases_len and bases[j] == base:
j += 1
max_hp = max(max_hp, j - i)
# skip over all the bases in the current homopolymer
i = j
return max_hp


def gc_content(bases: str) -> float:
"""Calculates the fraction of G and C bases in a sequence."""
if len(bases) == 0:
return 0
gc_count = sum(1 for base in bases if base == "C" or base == "G" or base == "c" or base == "g")
gc_count = sum(1 for base in bases if base in "CGcg")
return gc_count / len(bases)


Expand Down Expand Up @@ -164,3 +146,99 @@ def levenshtein(string1: str, string2: str) -> int:
matrix[i + 1][j + 1], # Substitution
)
return matrix[0][0]


def longest_hp_length(bases: str) -> int:
"""Calculates the length of the longest homopolymer in the input sequence.
Args:
bases: the bases over which to compute
Return:
the length of the longest homopolymer
"""
return longest_homopolymer_length(bases=bases)


def longest_homopolymer_length(bases: str) -> int:
"""Calculates the length of the longest homopolymer in the input sequence.
Args:
bases: the bases over which to compute
Return:
the length of the longest homopolymer
"""
cur_length: int = 0
i = 0
# NB: if we have found a homopolymer of length `min_hp`, then we do not need
# to examine the last `min_hp` bases since we'll never find a longer one.
bases_len = len(bases)
while i < bases_len - cur_length:
base = bases[i].upper()
j = i + 1
while j < bases_len and bases[j] == base:
j += 1
cur_length = max(cur_length, j - i)
# skip over all the bases in the current homopolymer
i = j
return cur_length


def longest_dinucleotide_run_length(bases: str) -> int:
"""Number of bases in the longest dinucleotide run in a primer.
A dinucleotide run is when two nucleotides are repeated in tandem. For example,
TTGG (length = 4) or AACCAACCAA (length = 10). If there are no such runs, returns 0.
Args:
bases: the bases over which to compute
Return:
the number of bases in the longest dinuc repeat (NOT the number of repeat units)
"""
return longest_multinucleotide_run_length(bases=bases, repeat_unit_length=2)


def longest_multinucleotide_run_length(bases: str, repeat_unit_length: int) -> int:
"""Number of bases in the longest multi-nucleotide run.
A multi-nucleotide run is when N nucleotides are repeated in tandem. For example,
TTGG (length = 4, N=2) or TAGTAGTAG (length = 9, N = 3). If there are no such runs,
returns 0.
Args:
bases: the bases over which to compute
repeat_unit_length: the length of the multi-nucleotide repetitive unit (must be > 0)
Returns:
the number of bases in the longest multinucleotide repeat (NOT the number of repeat units)
"""
if repeat_unit_length <= 0:
raise ValueError(f"repeat_unit_length must be >= 0, found: {repeat_unit_length}")
elif len(bases) < repeat_unit_length:
return 0
elif len(bases) == repeat_unit_length:
return repeat_unit_length
elif repeat_unit_length == 1:
return longest_homopolymer_length(bases=bases)

best_length: int = 0
start = 0 # the start index of the current multi-nucleotide run
while start < len(bases) - 1:
# get the dinuc bases
dinuc = bases[start : start + repeat_unit_length].upper()
# keep going while there are more di-nucs
end = start + repeat_unit_length
while end < len(bases) - 1 and dinuc == bases[end : end + repeat_unit_length].upper():
end += repeat_unit_length
cur_length = end - start
# update the longest total run length
best_length = max(best_length, cur_length)
# move to the next start
if cur_length <= repeat_unit_length: # only one repeat unit found, move the start by 1bp
start += 1
else: # multiple repeats found, skip to the last base of the current run
start += cur_length - 1

return best_length
69 changes: 69 additions & 0 deletions tests/fgpyo/test_sequence.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
"""Tests for `~fgpyo.sequence`"""

from typing import List
from typing import Tuple

import pytest

from fgpyo.sequence import gc_content
from fgpyo.sequence import hamming
from fgpyo.sequence import levenshtein
from fgpyo.sequence import longest_dinucleotide_run_length
from fgpyo.sequence import longest_homopolymer_length
from fgpyo.sequence import longest_hp_length
from fgpyo.sequence import longest_multinucleotide_run_length
from fgpyo.sequence import reverse_complement


Expand Down Expand Up @@ -40,6 +46,7 @@ def test_reverse_complement(bases: str, expected_rev_comp: str) -> None:
)
def test_homopolymer(bases: str, expected_hp_len: int) -> None:
assert longest_hp_length(bases) == expected_hp_len
assert longest_homopolymer_length(bases) == expected_hp_len


@pytest.mark.parametrize(
Expand Down Expand Up @@ -111,3 +118,65 @@ def test_hamming_with_invalid_strings(string1: str, string2: str) -> None:
)
def test_levenshtein_dynamic(string1: str, string2: str, levenshtein_distance: int) -> None:
assert levenshtein(string1, string2) == levenshtein_distance


MULTINUCLEOTIDE_TEST_CASES: List[Tuple[str, int, int]] = [
("", 2, 0),
("", 1, 0),
("A", 1, 1),
("A", 2, 0),
("ACGT", 2, 2),
("AACCGGTT", 2, 2),
("TTTTCCCGGA", 2, 4),
("ACCGGGTTTT", 2, 4),
("GTGTGTAAAA", 2, 6),
("GTGTGTGAAAA", 2, 6),
("GTGTGTACACACAC", 2, 8),
("GTGTGTGACACACAC", 2, 8),
("AAACTCTCTCGG", 2, 6),
("AACTCTCTCGGG", 2, 6),
("TGTATATATA", 2, 8),
("TGTATATATAC", 2, 8),
("TGATATATATC", 2, 8),
("TGATATATAC", 2, 6),
("TAGTAGTAG", 3, 9),
("TATAGTAGTAGTA", 3, 9),
("TACGTACGTACTG", 4, 8),
("TAGTAGTAGTAG", 3, 12),
("TAGTAGTAGTAG", 6, 12),
("TAGTAGTAGTAG", 12, 12),
("AACCGGTT", 1, 2),
("AACCGGTT", 3, 3),
("TAGTAGTAGTAG", 5, 5),
("TAGTAGTAGTAG", 7, 7),
("TTTAAAAAAAAAATTT", 5, 10),
("ACGACCATATatatatatatGC", 2, 14),
("ACGACCATATatatatatatATGC", 2, 16),
("ttgaTtaCaGATTACAgattacacc", 7, 21),
]

DINUCLEOTIDE_TEST_CASES: List[Tuple[str, int]] = [
(bases, expected_length)
for bases, repeat_unit_length, expected_length in MULTINUCLEOTIDE_TEST_CASES
if repeat_unit_length == 2
]


@pytest.mark.parametrize("bases, expected_length", DINUCLEOTIDE_TEST_CASES)
def test_longest_dinucleotide_run_length(bases: str, expected_length: int) -> None:
assert expected_length == longest_dinucleotide_run_length(bases=bases)


@pytest.mark.parametrize("bases, repeat_unit_length, expected_length", MULTINUCLEOTIDE_TEST_CASES)
def test_longest_multinucleotide_run_length(
bases: str, repeat_unit_length: int, expected_length: int
) -> None:
assert expected_length == 0 or expected_length % repeat_unit_length == 0
assert expected_length == longest_multinucleotide_run_length(
bases, repeat_unit_length=repeat_unit_length
)


def test_longest_multinucleotide_run_length_raises() -> None:
with pytest.raises(ValueError, match="repeat_unit_length must be >= 0"):
longest_multinucleotide_run_length(bases="GATTACA", repeat_unit_length=0)

0 comments on commit 1f20e56

Please sign in to comment.