marcelm · marcelm · Aug 31, 2023 · Aug 30, 2023 · Aug 30, 2023 · Aug 30, 2023
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -2,6 +2,10 @@
 Changelog
 =========
 
+v1.0.0-dev
+--------------------
++ Add ``id`` and ``comment`` properties to ``SequenceRecord``.
+
 v0.10.0 (2022-12-05)
 --------------------
 

diff --git a/src/dnaio/_core.pyi b/src/dnaio/_core.pyi
@@ -24,6 +24,10 @@ class SequenceRecord:
     def fastq_bytes(self, two_headers: bool = ...) -> bytes: ...
     def is_mate(self, other: SequenceRecord) -> bool: ...
     def reverse_complement(self) -> SequenceRecord: ...
+    @property
+    def id(self) -> str: ...
+    @property
+    def comment(self) -> Optional[str]: ...
 
 # Bytestring = Union[bytes, bytearray, memoryview]. Technically just 'bytes' is
 # acceptable as an alias, but even more technically this function supports all

diff --git a/src/dnaio/_core.pyx b/src/dnaio/_core.pyx
@@ -7,7 +7,7 @@ from cpython.unicode cimport PyUnicode_CheckExact, PyUnicode_GET_LENGTH, PyUnico
 from cpython.object cimport Py_TYPE, PyTypeObject
 from cpython.ref cimport PyObject
 from cpython.tuple cimport PyTuple_GET_ITEM
-from libc.string cimport memcmp, memcpy, memchr, strcspn, memmove
+from libc.string cimport memcmp, memcpy, memchr, strcspn, strspn, memmove
 cimport cython
 
 cdef extern from "Python.h":
@@ -84,6 +84,8 @@ cdef class SequenceRecord:
         object _name
         object _sequence
         object _qualities
+        object _id
+        object _comment
 
     def __init__(self, object name, object sequence, object qualities = None):
         if not PyUnicode_CheckExact(name):
@@ -119,6 +121,8 @@ cdef class SequenceRecord:
         if not PyUnicode_IS_COMPACT_ASCII(name):
             raise ValueError(is_not_ascii_message("name", name))
         self._name = name
+        self._id = None
+        self._comment = None
 
     @property
     def sequence(self):
@@ -150,6 +154,59 @@ cdef class SequenceRecord:
             )
         self._qualities = qualities
 
+    @property
+    def id(self):
+        """
+        The header part before the first space or tab, or the whole header if
+        it contains neither. This is the unique identifier for the sequence.
+        """
+        cdef char *name
+        cdef size_t name_length
+        cdef size_t id_length
+        # None means "not computed yet"; the result is cached in self._id.
+        if self._id is None:
+            name = <char *>PyUnicode_DATA(self._name)
+            name_length = <size_t>PyUnicode_GET_LENGTH(self._name)
+            id_length = strcspn(name, "\t ")  # length of the run without tab/space
+            if id_length == name_length:
+                self._id = self._name  # no separator found: reuse the name object
+            else:
+                self._id = PyUnicode_New(id_length, 127)  # maxchar 127: ASCII only
+                memcpy(PyUnicode_DATA(self._id), name, id_length)
+        return self._id
+
+    @property
+    def comment(self):
+        """
+        The header part after the first run of spaces/tabs. This is usually
+        used to store metadata. If it is empty, the attribute is None.
+        """
+        cdef char *name
+        cdef size_t name_length
+        cdef size_t id_length
+        cdef char *comment_start
+        cdef size_t comment_length
+        # None means "not computed yet"; the result is cached in self._comment.
+        if self._comment is None:
+            name = <char *>PyUnicode_DATA(self._name)
+            name_length = <size_t>PyUnicode_GET_LENGTH(self._name)
+            id_length = strcspn(name, "\t ")
+            if id_length == name_length:
+                self._comment = ""
+            else:
+                comment_start = name + id_length + 1  # step past the first separator
+                # Skip any further spaces/tabs between the id and the comment
+                comment_start = comment_start + strspn(comment_start, '\t ')
+                comment_length = name_length - (comment_start - name)
+                self._comment = PyUnicode_New(comment_length , 127)
+                memcpy(PyUnicode_DATA(self._comment), comment_start, comment_length)
+        # An empty comment is returned as None. Internally it is cached as "",
+        # because None marks "not computed yet" and would make the code above
+        # run again on every access.
+        if PyUnicode_GET_LENGTH(self._comment) == 0:
+            return None
+        return self._comment
+
     def __getitem__(self, key):
         """
         Slice this SequenceRecord. If the qualities attribute is not None, it is

diff --git a/tests/test_internal.py b/tests/test_internal.py
@@ -28,6 +28,8 @@
 )
 from dnaio.writers import FileWriter
 from dnaio.readers import BinaryFileReader
+from dnaio._core import bytes_ascii_check
+
 
 TEST_DATA = Path(__file__).parent / "data"
 SIMPLE_FASTQ = str(TEST_DATA / "simple.fastq")
@@ -635,8 +637,6 @@ def test_fastq_writer_repr(tmp_path):
 
 
 class TestAsciiCheck:
-    from dnaio._core import bytes_ascii_check
-
     ASCII_STRING = (
         "In het Nederlands komen bijzondere leestekens niet vaak voor.".encode("ascii")
     )
@@ -646,22 +646,22 @@ class TestAsciiCheck:
     )
 
     def test_ascii(self):
-        assert self.bytes_ascii_check(self.ASCII_STRING)
+        assert bytes_ascii_check(self.ASCII_STRING)
 
     def test_ascii_all_chars(self):
-        assert self.bytes_ascii_check(bytes(range(128)))
-        assert not self.bytes_ascii_check(bytes(range(129)))
+        assert bytes_ascii_check(bytes(range(128)))
+        assert not bytes_ascii_check(bytes(range(129)))
 
     def test_non_ascii(self):
-        assert not self.bytes_ascii_check(self.NON_ASCII_STRING)
+        assert not bytes_ascii_check(self.NON_ASCII_STRING)
 
     def test_non_ascii_lengths(self):
         # Make sure that the function finds the non-ascii byte correctly for
         # all lengths.
         non_ascii_char = "é".encode("latin-1")
         for i in range(len(self.ASCII_STRING)):
             test_string = self.ASCII_STRING[:i] + non_ascii_char
-            assert not self.bytes_ascii_check(test_string)
+            assert not bytes_ascii_check(test_string)
 
     def test_ascii_lengths(self):
         # Make sure the ascii check is correct even though there are non-ASCII
@@ -671,7 +671,7 @@ def test_ascii_lengths(self):
         non_ascii_char = "é".encode("latin-1")
         for i in range(1, len(self.ASCII_STRING) + 1):
             test_string = self.ASCII_STRING[:i] + (non_ascii_char * 8)
-            assert self.bytes_ascii_check(test_string, i - 1)
+            assert bytes_ascii_check(test_string, i - 1)
 
 
 class TestRecordsAreMates:

diff --git a/tests/test_records.py b/tests/test_records.py
@@ -105,6 +105,54 @@ def test_set_qualities_none(self):
         seq.qualities = None
         assert seq.qualities is None
 
+    def test_set_id(self):
+        seq = SequenceRecord("name", "A", "=")
+        with pytest.raises(AttributeError):  # id is a getter-only property
+            seq.id = "Obi-Wan"
+
+    def test_set_comment(self):
+        seq = SequenceRecord("name", "A", "=")
+        with pytest.raises(AttributeError):  # comment is a getter-only property
+            seq.comment = "Hello there!"
+
+    @pytest.mark.parametrize(
+        ["record", "expected"],
+        [
+            (SequenceRecord("name", "A", "="), None),  # no separator at all
+            (SequenceRecord("name ", "A", "="), None),  # separator, nothing after it
+            (SequenceRecord("name  ", "A", "="), None),
+            (SequenceRecord("name\tmetadata", "A", "="), "metadata"),  # tab separator
+            (SequenceRecord("AotC I hate sand!", "A", "="), "I hate sand!"),
+            (
+                SequenceRecord("Givemesome                       space", "A", "="),
+                "space",
+            ),
+        ],
+    )
+    def test_get_comment(self, record, expected):
+        assert record.comment == expected
+
+    @pytest.mark.parametrize(
+        ["record", "expected"],
+        [
+            (SequenceRecord("name", "A", "="), "name"),  # no separator at all
+            (SequenceRecord("name ", "A", "="), "name"),  # trailing space is dropped
+            (SequenceRecord("name  ", "A", "="), "name"),
+            (SequenceRecord("name\tmetadata", "A", "="), "name"),  # tab separator
+            (SequenceRecord("AotC I hate sand!", "A", "="), "AotC"),
+        ],
+    )
+    def test_get_id(self, record, expected):
+        assert record.id == expected
+
+    def test_reset_id_and_comment_on_name_update(self):
+        record = SequenceRecord("Obi-Wan: don't try it!", "", "")
+        assert record.id == "Obi-Wan:"  # first access fills the cached id
+        assert record.comment == "don't try it!"  # ... and the cached comment
+        record.name = "Anakin: you underestimate my power!"
+        assert record.id == "Anakin:"  # a stale cache would still say "Obi-Wan:"
+        assert record.comment == "you underestimate my power!"
+
 
 def test_legacy_sequence():
     from dnaio import Sequence