Skip to content

Commit

Permalink
feat: separate classes for sparse and dense formats
Browse files Browse the repository at this point in the history
  • Loading branch information
jkanche committed Jun 22, 2023
1 parent e356234 commit 5429811
Show file tree
Hide file tree
Showing 11 changed files with 188 additions and 19 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Changelog

## Version 0.1 (development)
## Version 0.0.1

- initial classes for H5 backed matrices

## Version 0.0.3

- separate dense and sparse matrix classes
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ pip install filebackedarray
## Usage

```python
from filebackedarray import H5BackedAssay
from filebackedarray import H5BackedSparseData

matrix = H5BackedAssay("tests/data/tenx.sub.h5", group="matrix")
matrix = H5BackedSparseData("tests/data/tenx.sub.h5", group="matrix")

# get the dimensions of the matrix
print("matrix shape: ", matrix.shape)
Expand All @@ -38,7 +38,7 @@ print("matrix shape: ", matrix.shape)
matrix_slice = matrix[0:100, 1:101]
```

For more use cases including subset, checkout the [documentation](https://biocpy.github.io/FileBackedArray/)
Check out the [documentation](https://biocpy.github.io/FileBackedArray/) for more information.



Expand Down
24 changes: 22 additions & 2 deletions docs/tutorial.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,34 @@

This package provides classes to support file backed arrays or matrices stored in H5 files. We'll soon provide similar implementations for tiledb, zarr and other matrix storage formats.

## Sparse matrices

`H5BackedSparseData` tries to infer the sparse matrix format (either `csr_matrix` or `csc_matrix`) from the contents of the H5 file.

```python
from filebackedarray import H5BackedAssay
from filebackedarray import H5BackedSparseData

matrix = H5BackedAssay("tests/data/tenx.sub.h5", group="matrix")
matrix = H5BackedSparseData("tests/data/tenx.sub.h5", group="matrix")

# get the dimensions of the matrix
print("matrix shape: ", matrix.shape)

# slice the matrix
matrix_slice = matrix[0:100, 1:101]
```

## Dense matrices

By default, the matrix is assumed to be stored in row-major (C-style) order. If the H5 file stores the matrix in column-major (Fortran-style) order, pass the `order="F"` parameter.

```python
from filebackedarray import H5BackedDenseData

matrix = H5BackedDenseData("tests/data/dense.h5", group="dense_C")

# get the dimensions of the matrix
print("matrix shape: ", matrix.shape)

# slice the matrix
matrix_slice = matrix[0:10, 1:10]
```
97 changes: 97 additions & 0 deletions src/filebackedarray/H5Dense.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
from typing import Optional, Sequence, Tuple, Union

import h5py

from .utils import _check_indices, infer_h5_dataset

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"


class H5BackedDenseData:
    """H5 backed dense matrix or array store.

    The H5 file is opened read-only when the object is created and is closed
    when the object is garbage collected.

    Args:
        path (str): Path to the H5 file.
        group (str): Group inside the file that contains the matrix or array.
        order (str): Dense matrix representation, "C" for row-major (C-style)
            or "F" for column-major (Fortran-style) order. Any value other
            than "C" is treated as column-major. Defaults to "C".

    Raises:
        ValueError: If the dataset found at ``group`` is not a dense matrix.
    """

    def __init__(self, path: str, group: str, order: str = "C") -> None:
        """Initialize a H5 backed dense array.

        Args:
            path (str): Path to the H5 file.
            group (str): Group inside the file that contains the matrix or array.
            order (str): Dense matrix representation, "C" for row-major
                (C-style) or "F" for column-major (Fortran-style) order.
                Defaults to "C".

        Raises:
            ValueError: If the dataset found at ``group`` is not a dense matrix.
        """
        # Define the attribute before opening the file so that ``__del__``
        # stays safe even if ``h5py.File`` itself raises.
        self._h5file = None
        self._order = order

        self._h5file = h5py.File(path, mode="r")
        try:
            self._dataset = self._h5file[group]
            self._dataset_info = infer_h5_dataset(self._dataset)

            if self._dataset_info.format != "dense":
                raise ValueError("File does not contain a dense matrix")
        except Exception:
            # Do not keep the handle open on a half-constructed object.
            self._h5file.close()
            self._h5file = None
            raise

    @property
    def shape(self) -> Tuple[int, int]:
        """Get shape of the dataset.

        For column-major storage the on-disk shape is reversed so that the
        result is always reported as rows by columns.

        Returns:
            Tuple[int, int]: number of rows by columns.
        """
        if self._order == "C":
            return self._dataset_info.shape

        return self._dataset_info.shape[::-1]

    @property
    def dtype(self) -> str:
        """Get type of values stored in the dataset.

        Returns:
            str: type of dataset, e.g. int8, float etc.
        """
        return self._dataset_info.dtype

    @property
    def mat_format(self) -> str:
        """Get dense matrix format.

        Either row-major (C-style) or column-major (Fortran-style) order,
        exactly as passed to the constructor.

        Returns:
            str: matrix format.
        """
        return self._order

    def __getitem__(
        self,
        args: Tuple[Union[slice, Sequence[int]], Optional[Union[slice, Sequence[int]]]],
    ):
        """Slice the dataset by rows and, optionally, columns.

        Args:
            args: A ``(rows,)`` or ``(rows, columns)`` tuple where each entry
                is a slice or a sequence of integer positions. A bare index
                (e.g. ``matrix[0:10]``) is treated as ``(rows,)``. When the
                column entry is missing or ``None``, all columns are selected.

        Raises:
            ValueError: If no index or more than two indices are provided.

        Returns:
            The selected values as returned by the underlying H5 dataset.
        """
        # ``matrix[0:10]`` passes the slice itself rather than a 1-tuple.
        if not isinstance(args, tuple):
            args = (args,)

        if len(args) == 0:
            raise ValueError("Arguments must contain one slice")

        # Checked up front; nesting this under the column branch made it
        # unreachable.
        if len(args) > 2:
            raise ValueError("contains too many slices")

        row_indices = _check_indices(args[0])

        if len(args) > 1 and args[1] is not None:
            col_indices = _check_indices(args[1])
        else:
            # ``slice(None)`` selects every column; ``slice(0)`` would be
            # ``[:0]`` and select none.
            col_indices = slice(None)

        if self.mat_format == "C":
            return self._dataset[row_indices, col_indices]

        # Column-major data is stored as (columns, rows) on disk, so swap the
        # request axes. NOTE(review): the result keeps the on-disk orientation
        # (it is not transposed back to rows x columns) - confirm intended.
        return self._dataset[col_indices, row_indices]

    # TODO: switch to weak refs at some point
    def __del__(self):
        """Close the underlying H5 file handle, if one was opened."""
        h5file = getattr(self, "_h5file", None)
        if h5file is not None:
            h5file.close()
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,13 @@

from .utils import _check_indices, _slice_h5_sparse, infer_h5_dataset

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"

class H5BackedData:
"""H5 backed matrix or array store.

class H5BackedSparseData:
"""H5 backed sparse matrix or array store.
Args:
path (str): Path to the H5 file.
Expand All @@ -23,8 +27,14 @@ def __init__(self, path: str, group: str) -> None:
self._h5file = h5py.File(path, mode="r")
self._dataset = self._h5file[group]

# TODO: If this gets too complicated, might have to add a
# parameter that specifies the matrix format instead of inferring it
# from the file.
self._dataset_info = infer_h5_dataset(self._dataset)

if self._dataset_info.format not in ["csr_matrix", "csc_matrix"]:
raise ValueError("File does not contain a sparse matrix")

@property
def shape(self) -> Tuple[int, int]:
"""Get shape of the dataset.
Expand Down Expand Up @@ -83,10 +93,6 @@ def __getitem__(
# now slice columns
mat = mat[rowIndices, :]
return mat
elif self.mat_format == "dense":
if colIndices is None:
colIndices = slice(0)
return self._dataset[rowIndices, colIndices]
else:
raise Exception("unknown matrix type in H5.")

Expand Down
3 changes: 2 additions & 1 deletion src/filebackedarray/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@
finally:
del version, PackageNotFoundError

from .backedH5 import H5BackedData
from .H5Sparse import H5BackedSparseData
from .H5Dense import H5BackedDenseData
10 changes: 9 additions & 1 deletion src/filebackedarray/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
import h5py
from scipy import sparse as sp


__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"


H5DatasetInfo = namedtuple("H5DatasetInfo", ["shape", "dtype", "format"])


Expand Down Expand Up @@ -46,12 +52,14 @@ def infer_h5_dataset(dataset: h5py.Group, verbose: bool = False) -> H5DatasetInf
print("length of indptr", dataset["indptr"], len(dataset["indptr"]))
if shape[0] == len(dataset["indptr"]) - 1:
format = "csr_matrix"

dtype = dataset["data"].dtype.type
else:
# dense
shape = dataset.shape
format = "dense"
dtype = dataset.dtype.type

dtype = dataset["data"].dtype.type
return H5DatasetInfo(shape, dtype, format)


Expand Down
Binary file added tests/data/dense.h5
Binary file not shown.
Binary file added tests/data/dense_F.h5
Binary file not shown.
33 changes: 33 additions & 0 deletions tests/test_h5_dense.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import numpy as np
from filebackedarray import H5BackedDenseData

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"


def test_h5_dense_backed_C():
    """Row-major dense dataset: check metadata, then slice into an ndarray."""
    dense = H5BackedDenseData("tests/data/dense.h5", "dense_C")

    # construction and reported metadata
    assert dense is not None
    assert isinstance(dense, H5BackedDenseData)
    assert dense.shape == (100, 100)
    assert dense.mat_format == "C"
    assert dense.dtype is not None

    # two rows by three columns
    window = dense[0:2, 1:4]
    assert isinstance(window, np.ndarray)
    assert window.shape == (2, 3)


def test_h5_dense_backed_F():
    """Column-major dense dataset: check metadata, then slice into an ndarray."""
    dense = H5BackedDenseData("tests/data/dense_F.h5", "dense_F", order="F")

    # construction and reported metadata
    assert dense is not None
    assert isinstance(dense, H5BackedDenseData)
    assert dense.shape == (3, 2)
    assert dense.mat_format == "F"

    # the slice comes back in the stored orientation
    window = dense[0:1, 0:2]
    assert isinstance(window, np.ndarray)
    assert window.shape == (2, 1)
10 changes: 5 additions & 5 deletions tests/test_h5_backed.py → tests/test_h5_sparse.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
import scipy.sparse as sp
from filebackedarray import H5BackedData
from filebackedarray import H5BackedSparseData

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"


def test_h5_backed():
assay = H5BackedData("tests/data/tenx.sub.h5", "matrix")
assay = H5BackedSparseData("tests/data/tenx.sub.h5", "matrix")

assert assay is not None
assert isinstance(assay, H5BackedData)
assert isinstance(assay, H5BackedSparseData)
assert assay.shape == (1000, 3005)
assert assay.mat_format == "csc_matrix"
assert assay.dtype is not None


def test_h5_backed_slice():
assay = H5BackedData("tests/data/tenx.sub.h5", "matrix")
assay = H5BackedSparseData("tests/data/tenx.sub.h5", "matrix")

assert assay is not None
assert isinstance(assay, H5BackedData)
assert isinstance(assay, H5BackedSparseData)
assert assay.shape == (1000, 3005)

asy_slice = assay[0:100, 1:101]
Expand Down

0 comments on commit 5429811

Please sign in to comment.