Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add id and comment #110

Merged
merged 8 commits into from
Aug 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
Changelog
=========

v1.0.0-dev
--------------------
+ Add ``id`` and ``comment`` properties to ``SequenceRecord``.

v0.10.0 (2022-12-05)
--------------------

Expand Down
4 changes: 4 additions & 0 deletions src/dnaio/_core.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ class SequenceRecord:
def fastq_bytes(self, two_headers: bool = ...) -> bytes: ...
def is_mate(self, other: SequenceRecord) -> bool: ...
def reverse_complement(self) -> SequenceRecord: ...
@property
def id(self) -> str: ...
@property
def comment(self) -> Optional[str]: ...

# Bytestring = Union[bytes, bytearray, memoryview]. Technically just 'bytes' is
# acceptable as an alias, but even more technically this function supports all
Expand Down
59 changes: 58 additions & 1 deletion src/dnaio/_core.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ from cpython.unicode cimport PyUnicode_CheckExact, PyUnicode_GET_LENGTH, PyUnico
from cpython.object cimport Py_TYPE, PyTypeObject
from cpython.ref cimport PyObject
from cpython.tuple cimport PyTuple_GET_ITEM
from libc.string cimport memcmp, memcpy, memchr, strcspn, memmove
from libc.string cimport memcmp, memcpy, memchr, strcspn, strspn, memmove
cimport cython

cdef extern from "Python.h":
Expand Down Expand Up @@ -84,6 +84,8 @@ cdef class SequenceRecord:
object _name
object _sequence
object _qualities
object _id
object _comment

def __init__(self, object name, object sequence, object qualities = None):
if not PyUnicode_CheckExact(name):
Expand Down Expand Up @@ -119,6 +121,8 @@ cdef class SequenceRecord:
if not PyUnicode_IS_COMPACT_ASCII(name):
raise ValueError(is_not_ascii_message("name", name))
self._name = name
self._id = None
self._comment = None

@property
def sequence(self):
Expand Down Expand Up @@ -150,6 +154,59 @@ cdef class SequenceRecord:
)
self._qualities = qualities

@property
def id(self):
"""
The header part before any whitespace. This is the unique identifier
for the sequence.
"""
cdef char *name
cdef size_t name_length
cdef size_t id_length
# Not yet cached is None
if self._id is None:
name = <char *>PyUnicode_DATA(self._name)
name_length = <size_t>PyUnicode_GET_LENGTH(self._name)
id_length = strcspn(name, "\t ")
if id_length == name_length:
self._id = self._name
else:
self._id = PyUnicode_New(id_length, 127)
memcpy(PyUnicode_DATA(self._id), name, id_length)
return self._id

@property
def comment(self):
"""
The header part after the first whitespace. This is usually used
to store metadata. It may be empty in which case the attribute is None.
"""
cdef char *name
cdef size_t name_length
cdef size_t id_length
cdef char *comment_start
cdef size_t comment_length
# Not yet cached is None
if self._comment is None:
name = <char *>PyUnicode_DATA(self._name)
name_length = <size_t>PyUnicode_GET_LENGTH(self._name)
id_length = strcspn(name, "\t ")
if id_length == name_length:
self._comment = ""
else:
comment_start = name + id_length + 1
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, this assumes that there’s only one whitespace between the id and the comment. I’d assume this is true most of the time, but I think we need to be a bit more defensive.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Finally, a use for strspn!

# Skip empty whitespace before comment
comment_start = comment_start + strspn(comment_start, '\t ')
comment_length = name_length - (comment_start - name)
self._comment = PyUnicode_New(comment_length , 127)
memcpy(PyUnicode_DATA(self._comment), comment_start, comment_length)
# Empty comment is returned as None. This is not stored internally as
# None, otherwise the above code would run every time the attribute
# was accessed.
if PyUnicode_GET_LENGTH(self._comment) == 0:
return None
return self._comment

def __getitem__(self, key):
"""
Slice this SequenceRecord. If the qualities attribute is not None, it is
Expand Down
16 changes: 8 additions & 8 deletions tests/test_internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
)
from dnaio.writers import FileWriter
from dnaio.readers import BinaryFileReader
from dnaio._core import bytes_ascii_check


TEST_DATA = Path(__file__).parent / "data"
SIMPLE_FASTQ = str(TEST_DATA / "simple.fastq")
Expand Down Expand Up @@ -635,8 +637,6 @@ def test_fastq_writer_repr(tmp_path):


class TestAsciiCheck:
from dnaio._core import bytes_ascii_check

ASCII_STRING = (
"In het Nederlands komen bijzondere leestekens niet vaak voor.".encode("ascii")
)
Expand All @@ -646,22 +646,22 @@ class TestAsciiCheck:
)

def test_ascii(self):
assert self.bytes_ascii_check(self.ASCII_STRING)
assert bytes_ascii_check(self.ASCII_STRING)

def test_ascii_all_chars(self):
assert self.bytes_ascii_check(bytes(range(128)))
assert not self.bytes_ascii_check(bytes(range(129)))
assert bytes_ascii_check(bytes(range(128)))
assert not bytes_ascii_check(bytes(range(129)))

def test_non_ascii(self):
assert not self.bytes_ascii_check(self.NON_ASCII_STRING)
assert not bytes_ascii_check(self.NON_ASCII_STRING)

def test_non_ascii_lengths(self):
# Make sure that the function finds the non-ascii byte correctly for
# all lengths.
non_ascii_char = "é".encode("latin-1")
for i in range(len(self.ASCII_STRING)):
test_string = self.ASCII_STRING[:i] + non_ascii_char
assert not self.bytes_ascii_check(test_string)
assert not bytes_ascii_check(test_string)

def test_ascii_lengths(self):
# Make sure the ascii check is correct even though there are non-ASCII
Expand All @@ -671,7 +671,7 @@ def test_ascii_lengths(self):
non_ascii_char = "é".encode("latin-1")
for i in range(1, len(self.ASCII_STRING) + 1):
test_string = self.ASCII_STRING[:i] + (non_ascii_char * 8)
assert self.bytes_ascii_check(test_string, i - 1)
assert bytes_ascii_check(test_string, i - 1)


class TestRecordsAreMates:
Expand Down
48 changes: 48 additions & 0 deletions tests/test_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,54 @@ def test_set_qualities_none(self):
seq.qualities = None
assert seq.qualities is None

def test_set_id(self):
seq = SequenceRecord("name", "A", "=")
with pytest.raises(AttributeError):
seq.id = "Obi-Wan"

def test_set_comment(self):
seq = SequenceRecord("name", "A", "=")
with pytest.raises(AttributeError):
seq.comment = "Hello there!"

@pytest.mark.parametrize(
["record", "expected"],
[
(SequenceRecord("name", "A", "="), None),
(SequenceRecord("name ", "A", "="), None),
(SequenceRecord("name ", "A", "="), None),
(SequenceRecord("name", "A", "="), None),
(SequenceRecord("AotC I hate sand!", "A", "="), "I hate sand!"),
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😄

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Owner

@marcelm marcelm Aug 31, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I got the Star Wars reference, but didn’t know the song (and now it’ll be stuck in my head for the rest of the day ...)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well you could always listen to "Last jedi knight" by the same musician to get it out of your head again ;-).

(
SequenceRecord("Givemesome space", "A", "="),
"space",
),
],
)
def test_get_comment(self, record, expected):
assert record.comment == expected

@pytest.mark.parametrize(
["record", "expected"],
[
(SequenceRecord("name", "A", "="), "name"),
(SequenceRecord("name ", "A", "="), "name"),
(SequenceRecord("name ", "A", "="), "name"),
(SequenceRecord("name", "A", "="), "name"),
(SequenceRecord("AotC I hate sand!", "A", "="), "AotC"),
],
)
def test_get_id(self, record, expected):
assert record.id == expected

def test_reset_id_and_comment_on_name_update(self):
record = SequenceRecord("Obi-Wan: don't try it!", "", "")
assert record.id == "Obi-Wan:"
assert record.comment == "don't try it!"
record.name = "Anakin: you underestimate my power!"
assert record.id == "Anakin:"
assert record.comment == "you underestimate my power!"


def test_legacy_sequence():
from dnaio import Sequence
Expand Down
Loading