Skip to content

Commit

Permalink
Do not expose encodings in Python (#722)
Browse files Browse the repository at this point in the history
I doubt we will need to write Python code that interacts with specific
encodings. The NumPy and Pandas protocols should all be implemented in
C/Rust anyway.

Anywhere we were using `PyArray::wrap`, I updated to the new PyO3
`Bound` API.
  • Loading branch information
danking committed Sep 3, 2024
1 parent 674a081 commit 06dd5a1
Show file tree
Hide file tree
Showing 6 changed files with 21 additions and 197 deletions.
165 changes: 5 additions & 160 deletions pyvortex/src/array.rs
Original file line number Diff line number Diff line change
@@ -1,26 +1,6 @@
use paste::paste;
use pyo3::prelude::*;
use vortex::array::{
Bool, BoolArray, BoolEncoding, Chunked, ChunkedArray, ChunkedEncoding, Constant, ConstantArray,
ConstantEncoding, Primitive, PrimitiveArray, PrimitiveEncoding, Sparse, SparseArray,
SparseEncoding, Struct, StructArray, StructEncoding, VarBin, VarBinArray, VarBinEncoding,
VarBinView, VarBinViewArray, VarBinViewEncoding,
};
use vortex::compute::take;
use vortex::encoding::EncodingRef;
use vortex::{Array, ArrayDType, ArrayData, ArrayDef, ToArray};
use vortex_alp::{ALPArray, ALPEncoding, ALP};
use vortex_dict::{Dict, DictArray, DictEncoding};
use vortex_fastlanes::{
BitPacked, BitPackedArray, BitPackedEncoding, Delta, DeltaArray, DeltaEncoding, FoR, FoRArray,
FoREncoding,
};
use vortex_roaring::{
RoaringBool, RoaringBoolArray, RoaringBoolEncoding, RoaringInt, RoaringIntArray,
RoaringIntEncoding,
};
use vortex_runend::{RunEnd, RunEndArray, RunEndEncoding};
use vortex_zigzag::{ZigZag, ZigZagArray, ZigZagEncoding};
use vortex::{Array, ArrayDType};

use crate::dtype::PyDType;
use crate::error::PyVortexError;
Expand All @@ -31,144 +11,9 @@ pub struct PyArray {
inner: Array,
}

// Generates the Python-facing wrapper type for one concrete array type.
//
// Arguments:
// * `$E`     - the encoding value; a reference to it is stored in the wrapper's `encoding` field.
// * `$T`     - the concrete array type being wrapped; the generated struct is named `Py<$T>`.
// * `$TName` - the class name handed to `#[pyclass(name = ...)]`.
macro_rules! pyarray {
($E:ident, $T:ident, $TName:tt) => {
paste! {
// `extends = PyArray` declares the generated class as a subclass of `PyArray`.
#[pyclass(name = $TName, module = "vortex", extends = PyArray, sequence, subclass)]
pub struct [<Py $T>] {
inner: $T,
#[allow(dead_code)]
encoding: EncodingRef,
}

impl [<Py $T>] {
// Allocates the Python object: the `PyArray` base part holds `inner.to_array().clone()`,
// the subclass part holds `inner` itself together with `&$E`.
pub fn wrap(py: Python<'_>, inner: $T) -> PyResult<Py<Self>> {
let init = PyClassInitializer::from(PyArray { inner: inner.to_array().clone() })
.add_subclass([<Py $T>] { inner, encoding: &$E });
Py::new(py, init)
}

// Borrows the wrapped concrete array.
pub fn unwrap(&self) -> &$T {
&self.inner
}
}
}
};
}

// Wrappers for the array types imported from `vortex::array`.
pyarray!(BoolEncoding, BoolArray, "BoolArray");
pyarray!(ChunkedEncoding, ChunkedArray, "ChunkedArray");
pyarray!(ConstantEncoding, ConstantArray, "ConstantArray");
pyarray!(PrimitiveEncoding, PrimitiveArray, "PrimitiveArray");
pyarray!(SparseEncoding, SparseArray, "SparseArray");
pyarray!(StructEncoding, StructArray, "StructArray");
pyarray!(VarBinEncoding, VarBinArray, "VarBinArray");
pyarray!(VarBinViewEncoding, VarBinViewArray, "VarBinViewArray");

// Wrappers for the array types imported from the `vortex_alp`, `vortex_fastlanes`,
// `vortex_dict`, `vortex_runend`, `vortex_roaring` and `vortex_zigzag` crates.
pyarray!(ALPEncoding, ALPArray, "ALPArray");
pyarray!(BitPackedEncoding, BitPackedArray, "BitPackedArray");
pyarray!(FoREncoding, FoRArray, "FoRArray");
pyarray!(DeltaEncoding, DeltaArray, "DeltaArray");
pyarray!(DictEncoding, DictArray, "DictArray");
pyarray!(RunEndEncoding, RunEndArray, "RunEndArray");
pyarray!(RoaringBoolEncoding, RoaringBoolArray, "RoaringBoolArray");
pyarray!(RoaringIntEncoding, RoaringIntArray, "RoaringIntArray");
pyarray!(ZigZagEncoding, ZigZagArray, "ZigZagArray");

impl PyArray {
pub fn wrap(py: Python<'_>, inner: ArrayData) -> PyResult<Py<Self>> {
let encoding_id = inner.encoding().id();
let array = Array::from(inner);
// This is the one place where we'd want to have owned kind enum but there's no other place this is used
match encoding_id {
Bool::ID => PyBoolArray::wrap(
py,
BoolArray::try_from(array).map_err(PyVortexError::map_err)?,
)?
.extract(py),
Chunked::ID => PyChunkedArray::wrap(
py,
ChunkedArray::try_from(array).map_err(PyVortexError::map_err)?,
)?
.extract(py),
Constant::ID => PyConstantArray::wrap(
py,
ConstantArray::try_from(array).map_err(PyVortexError::map_err)?,
)?
.extract(py),
Primitive::ID => PyPrimitiveArray::wrap(
py,
PrimitiveArray::try_from(array).map_err(PyVortexError::map_err)?,
)?
.extract(py),
Sparse::ID => PySparseArray::wrap(
py,
SparseArray::try_from(array).map_err(PyVortexError::map_err)?,
)?
.extract(py),
Struct::ID => PyStructArray::wrap(
py,
StructArray::try_from(array).map_err(PyVortexError::map_err)?,
)?
.extract(py),
VarBin::ID => PyVarBinArray::wrap(
py,
VarBinArray::try_from(array).map_err(PyVortexError::map_err)?,
)?
.extract(py),
VarBinView::ID => PyVarBinViewArray::wrap(
py,
VarBinViewArray::try_from(array).map_err(PyVortexError::map_err)?,
)?
.extract(py),
Dict::ID => PyDictArray::wrap(
py,
DictArray::try_from(array).map_err(PyVortexError::map_err)?,
)?
.extract(py),
RunEnd::ID => PyRunEndArray::wrap(
py,
RunEndArray::try_from(array).map_err(PyVortexError::map_err)?,
)?
.extract(py),
Delta::ID => PyDeltaArray::wrap(
py,
DeltaArray::try_from(array).map_err(PyVortexError::map_err)?,
)?
.extract(py),
FoR::ID => PyFoRArray::wrap(
py,
FoRArray::try_from(array).map_err(PyVortexError::map_err)?,
)?
.extract(py),
BitPacked::ID => PyBitPackedArray::wrap(
py,
BitPackedArray::try_from(array).map_err(PyVortexError::map_err)?,
)?
.extract(py),

ALP::ID => PyALPArray::wrap(
py,
ALPArray::try_from(array).map_err(PyVortexError::map_err)?,
)?
.extract(py),
RoaringBool::ID => PyBitPackedArray::wrap(
py,
BitPackedArray::try_from(array).map_err(PyVortexError::map_err)?,
)?
.extract(py),
RoaringInt::ID => PyBitPackedArray::wrap(
py,
BitPackedArray::try_from(array).map_err(PyVortexError::map_err)?,
)?
.extract(py),
ZigZag::ID => PyZigZagArray::wrap(
py,
ZigZagArray::try_from(array).map_err(PyVortexError::map_err)?,
)?
.extract(py),
_ => Py::new(py, Self { inner: array }),
}
pub fn new(inner: Array) -> PyArray {
PyArray { inner }
}

pub fn unwrap(&self) -> &Array {
Expand Down Expand Up @@ -205,9 +50,9 @@ impl PyArray {
PyDType::wrap(self_.py(), self_.inner.dtype().clone())
}

fn take(&self, indices: PyRef<'_, Self>) -> PyResult<Py<Self>> {
fn take<'py>(&self, indices: PyRef<'py, Self>) -> PyResult<Bound<'py, PyArray>> {
take(&self.inner, indices.unwrap())
.map_err(PyVortexError::map_err)
.and_then(|arr| Self::wrap(indices.py(), arr.into()))
.and_then(|arr| Bound::new(indices.py(), PyArray { inner: arr }))
}
}
2 changes: 1 addition & 1 deletion pyvortex/src/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,5 @@ pub fn compress(
let compressed = py
.allow_threads(|| ctx.compress(arr.unwrap(), None))
.map_err(PyVortexError::map_err)?;
PyArray::wrap(py, compressed)
Bound::new(array.py(), PyArray::new(inner))
}
26 changes: 15 additions & 11 deletions pyvortex/src/encode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use vortex::array::ChunkedArray;
use vortex::arrow::{FromArrowArray, FromArrowType};
use vortex::{Array, ToArrayData};
use vortex::{Array, IntoArray};
use vortex_dtype::DType;

use crate::array::PyArray;
Expand All @@ -17,7 +17,7 @@ use crate::vortex_arrow::map_arrow_err;
/// The main entry point for creating enc arrays from other Python objects.
///
#[pyfunction]
pub fn encode(obj: &Bound<PyAny>) -> PyResult<Py<PyArray>> {
pub fn encode<'py>(obj: &Bound<'py, PyAny>) -> PyResult<Bound<'py, PyArray>> {
let pa = obj.py().import_bound("pyarrow")?;
let pa_array = pa.getattr("Array")?;
let chunked_array = pa.getattr("ChunkedArray")?;
Expand All @@ -26,7 +26,7 @@ pub fn encode(obj: &Bound<PyAny>) -> PyResult<Py<PyArray>> {
if obj.is_instance(&pa_array)? {
let arrow_array = ArrowArrayData::from_pyarrow_bound(obj).map(make_array)?;
let enc_array = Array::from_arrow(arrow_array, false);
PyArray::wrap(obj.py(), enc_array.into())
Bound::new(obj.py(), PyArray::new(enc_array))
} else if obj.is_instance(&chunked_array)? {
let chunks: Vec<Bound<PyAny>> = obj.getattr("chunks")?.extract()?;
let encoded_chunks = chunks
Expand All @@ -41,11 +41,13 @@ pub fn encode(obj: &Bound<PyAny>) -> PyResult<Py<PyArray>> {
.getattr("type")
.and_then(|v| DataType::from_pyarrow_bound(&v))
.map(|dt| DType::from_arrow(&Field::new("_", dt, false)))?;
PyArray::wrap(
Bound::new(
obj.py(),
ChunkedArray::try_new(encoded_chunks, dtype)
.map_err(PyVortexError::map_err)?
.to_array_data(),
PyArray::new(
ChunkedArray::try_new(encoded_chunks, dtype)
.map_err(PyVortexError::map_err)?
.into_array(),
),
)
} else if obj.is_instance(&table)? {
let array_stream = ArrowArrayStreamReader::from_pyarrow_bound(obj)?;
Expand All @@ -54,11 +56,13 @@ pub fn encode(obj: &Bound<PyAny>) -> PyResult<Py<PyArray>> {
.into_iter()
.map(|b| b.map(Array::from).map_err(map_arrow_err))
.collect::<PyResult<Vec<_>>>()?;
PyArray::wrap(
Bound::new(
obj.py(),
ChunkedArray::try_new(chunks, dtype)
.map_err(PyVortexError::map_err)?
.to_array_data(),
PyArray::new(
ChunkedArray::try_new(chunks, dtype)
.map_err(PyVortexError::map_err)?
.into_array(),
),
)
} else {
Err(PyValueError::new_err("Cannot convert object to enc array"))
Expand Down
21 changes: 0 additions & 21 deletions pyvortex/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use vortex_dtype::{DType, PType};

use crate::array::*;

mod array;
mod dtype;
mod encode;
Expand All @@ -21,25 +19,6 @@ fn _lib(_py: Python, m: &Bound<PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(encode::encode, m)?)?;
// m.add_function(wrap_pyfunction!(compress::compress, m)?)?;

m.add_class::<PyArray>()?;
m.add_class::<PyBoolArray>()?;
m.add_class::<PyBitPackedArray>()?;
m.add_class::<PyChunkedArray>()?;
m.add_class::<PyConstantArray>()?;
m.add_class::<PyDeltaArray>()?;
m.add_class::<PyDictArray>()?;
m.add_class::<PyFoRArray>()?;
m.add_class::<PyPrimitiveArray>()?;
m.add_class::<PyRunEndArray>()?;
m.add_class::<PyRoaringBoolArray>()?;
m.add_class::<PyRoaringIntArray>()?;
m.add_class::<PySparseArray>()?;
m.add_class::<PyStructArray>()?;
m.add_class::<PyVarBinArray>()?;
m.add_class::<PyVarBinViewArray>()?;
m.add_class::<PyZigZagArray>()?;
m.add_class::<PyALPArray>()?;

m.add_class::<PyDType>()?;

m.add_function(wrap_pyfunction!(dtype_int, m)?)?;
Expand Down
2 changes: 0 additions & 2 deletions pyvortex/test/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,12 @@
# Encodes a pyarrow integer array with vortex.encode, checks the result is a
# vortex.PrimitiveArray, and checks that to_arrow().combine_chunks() equals the input.
def test_primitive_array_round_trip():
a = pa.array([0, 1, 2, 3])
arr = vortex.encode(a)
assert isinstance(arr, vortex.PrimitiveArray)
assert arr.to_arrow().combine_chunks() == a


# Encodes a pyarrow string array with vortex.encode, checks the result is a
# vortex.VarBinArray, and checks that to_arrow().combine_chunks() equals the input.
def test_varbin_array_round_trip():
a = pa.array(["a", "b", "c"])
arr = vortex.encode(a)
assert isinstance(arr, vortex.VarBinArray)
assert arr.to_arrow().combine_chunks() == a


Expand Down
2 changes: 0 additions & 2 deletions pyvortex/test/test_compress.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ def test_zigzag_encode():
# Encodes a two-chunk pyarrow chunked array with vortex.encode, checks the result is a
# vortex.ChunkedArray, and checks that converting back and combining the chunks yields
# the six values in their original order.
def test_chunked_encode():
chunked = pa.chunked_array([pa.array([0, 1, 2]), pa.array([3, 4, 5])])
encoded = vortex.encode(chunked)
assert isinstance(encoded, vortex.ChunkedArray)
assert encoded.to_arrow().combine_chunks() == pa.array([0, 1, 2, 3, 4, 5])


Expand All @@ -71,7 +70,6 @@ def test_table_encode():
}
)
encoded = vortex.encode(table)
assert isinstance(encoded, vortex.ChunkedArray)
assert encoded.to_arrow().combine_chunks() == pa.StructArray.from_arrays(
[pa.array([0, 1, 2, 3, 4, 5]), pa.array(["a", "b", "c", "d", "e", "f"])], names=["number", "string"]
)
Expand Down

0 comments on commit 06dd5a1

Please sign in to comment.