Skip to content

Commit

Permalink
Merge pull request #2 from ctdunc/main
Browse files Browse the repository at this point in the history
Add Preliminary ISIN Support
  • Loading branch information
abstractqqq authored Mar 3, 2024
2 parents abfd5ae + 41990f6 commit 35af446
Show file tree
Hide file tree
Showing 9 changed files with 272 additions and 5 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,5 @@ docs/_build/

# Polars Extension
.so
.dll
.dll

7 changes: 7 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ pyo3 = {version = "*", features = ["extension-module", "abi3-py38"]}
pyo3-polars = {version = "0.12", features = ["derive"]}
polars = {version = "0.38", features = ["performant", "lazy", "nightly", "parquet"]}
iban_validate = "4.0.1"
isin = "0.1.18"

[target.'cfg(any(not(target_os = "linux"), use_mimalloc))'.dependencies]
mimalloc = { version = "0.1", default-features = false }
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ Processing IBAN, URLs, etc., and other standard format data in Polars.

# Acknowledgements

1. Iban is powered by [iban_validate](https://crates.io/crates/iban_validate)
1. Iban is powered by [iban_validate](https://crates.io/crates/iban_validate)
2. Isin is powered by [isin_rs](https://docs.rs/isin/latest/isin/)
4 changes: 2 additions & 2 deletions python/polars_istr/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .iban import IbanExt # noqa: E402

from .isin import IsinExt # noqa: E402
__version__ = "0.1.0"
__all__ = ["IbanExt"]
__all__ = ["IbanExt", "IsinExt"]
74 changes: 74 additions & 0 deletions python/polars_istr/isin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from __future__ import annotations
import polars as pl
from polars.utils.udfs import _get_shared_lib_location

# Shared-library location derived from this module's path; it is passed as the
# `lib` argument to the `register_plugin` calls in this module.
# NOTE(review): presumably this resolves to the compiled Rust extension that
# ships next to this package - confirm against polars' `_get_shared_lib_location`.
_lib = _get_shared_lib_location(__file__)

@pl.api.register_expr_namespace("isin")
class IsinExt:
    """
    This class contains tools for parsing ISIN format data.

    Polars Namespace: isin

    Example: pl.col("isin_str").isin.country_code()
    """

    def __init__(self, expr: pl.Expr):
        self._expr: pl.Expr = expr

    def _call_plugin(self, symbol: str) -> pl.Expr:
        """
        Registers the plugin function named `symbol` from the shared library on the
        wrapped expression. Every ISIN function is elementwise, so the flag is fixed here.
        """
        return self._expr.register_plugin(
            lib=_lib,
            symbol=symbol,
            is_elementwise=True,
        )

    def country_code(self) -> pl.Expr:
        """
        Returns country code from the ISIN, or null if it cannot be parsed.
        """
        return self._call_plugin("pl_isin_country_code")

    def check_digit(self) -> pl.Expr:
        """
        Returns the check digit from the ISIN as a string, or null if it cannot be parsed.
        """
        return self._call_plugin("pl_isin_check_digit")

    def security_id(self) -> pl.Expr:
        """
        Returns the 9-character security identifier of the ISIN (it may contain letters
        as well as digits), or null if it cannot be parsed.
        """
        return self._call_plugin("pl_isin_security_id")

    def is_valid(self) -> pl.Expr:
        """
        Returns a boolean indicating whether the string is a valid ISIN string.
        Null inputs are reported as False rather than null.
        """
        return self._call_plugin("pl_isin_valid")

    def extract_all(self) -> pl.Expr:
        """
        Returns all information from ISIN as a struct with the fields country_code,
        security_id and check_digit. All three fields are null when the input is null or
        cannot be parsed. Running this can be faster than running the corresponding single
        queries together.
        """
        return self._call_plugin("pl_isin_full")



129 changes: 129 additions & 0 deletions src/isin_parsing/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
use isin;
use polars::prelude::*;
use pyo3_polars::derive::polars_expr;

// Output-type callback for `pl_isin_full`: ignores the input fields and reports a
// struct made of three String fields (country_code, security_id, check_digit).
// The struct field itself is declared with an empty name.
fn isin_full_output(_: &[Field]) -> PolarsResult<Field> {
    let parts: Vec<Field> = vec![
        Field::new("country_code", DataType::String),
        Field::new("security_id", DataType::String),
        Field::new("check_digit", DataType::String),
    ];
    let schema = Field::new("", DataType::Struct(parts));
    Ok(schema)
}

// Parses every value of the first input as an ISIN and returns a struct series named
// "isin" with the string fields country_code, security_id and check_digit.
// A null input or a value that `isin::parse` rejects yields null in all three fields.
#[polars_expr(output_type_func=isin_full_output)]
fn pl_isin_full(inputs: &[Series]) -> PolarsResult<Series> {
    let ca = inputs[0].str()?;
    let n = ca.len();
    let mut country = StringChunkedBuilder::new("country_code", n);
    let mut security = StringChunkedBuilder::new("security_id", n);
    let mut check = StringChunkedBuilder::new("check_digit", n);

    for value in ca {
        // Null input and parse failure are handled identically, so fold them together.
        match value.and_then(|s| isin::parse(s).ok()) {
            Some(parsed) => {
                country.append_value(parsed.prefix());
                security.append_value(parsed.basic_code());
                check.append_value(parsed.check_digit().to_string());
            }
            None => {
                country.append_null();
                security.append_null();
                check.append_null();
            }
        }
    }

    let columns = [
        country.finish().into_series(),
        security.finish().into_series(),
        check.finish().into_series(),
    ];
    let combined = StructChunked::new("isin", &columns)?;
    Ok(combined.into_series())
}

// Returns the ISIN prefix (country code) of each value in the first input as a string
// series built under the name "country_code". Null inputs and values that `isin::parse`
// rejects become null.
#[polars_expr(output_type=String)]
fn pl_isin_country_code(inputs: &[Series]) -> PolarsResult<Series> {
    let ca = inputs[0].str()?;
    let mut codes = StringChunkedBuilder::new("country_code", ca.len());

    for value in ca {
        match value.and_then(|s| isin::parse(s).ok()) {
            Some(parsed) => {
                codes.append_value(parsed.prefix());
            }
            None => {
                codes.append_null();
            }
        }
    }

    Ok(codes.finish().into_series())
}

// Returns the ISIN basic code (security identifier) of each value in the first input as
// a string series built under the name "security_id". Null inputs and values that
// `isin::parse` rejects become null.
#[polars_expr(output_type=String)]
fn pl_isin_security_id(inputs: &[Series]) -> PolarsResult<Series> {
    let ca = inputs[0].str()?;
    let mut ids = StringChunkedBuilder::new("security_id", ca.len());

    for value in ca {
        match value.and_then(|s| isin::parse(s).ok()) {
            Some(parsed) => {
                ids.append_value(parsed.basic_code());
            }
            None => {
                ids.append_null();
            }
        }
    }

    Ok(ids.finish().into_series())
}

// Returns the ISIN check digit of each value in the first input, rendered as a string,
// in a series built under the name "check_digit". Null inputs and values that
// `isin::parse` rejects become null.
#[polars_expr(output_type=String)]
fn pl_isin_check_digit(inputs: &[Series]) -> PolarsResult<Series> {
    let ca = inputs[0].str()?;
    let mut digits = StringChunkedBuilder::new("check_digit", ca.len());

    for value in ca {
        match value.and_then(|s| isin::parse(s).ok()) {
            Some(parsed) => {
                digits.append_value(parsed.check_digit().to_string());
            }
            None => {
                digits.append_null();
            }
        }
    }

    Ok(digits.finish().into_series())
}

// Reports, for each value in the first input, whether `isin::validate` accepts it.
// The result is a boolean series built under the name "isin_valid"; null inputs are
// recorded as `false`, so the output never contains nulls.
#[polars_expr(output_type=Boolean)]
fn pl_isin_valid(inputs: &[Series]) -> PolarsResult<Series> {
    let ca = inputs[0].str()?;
    let mut flags = BooleanChunkedBuilder::new("isin_valid", ca.len());

    for value in ca {
        let ok = match value {
            Some(s) => isin::validate(s),
            None => false,
        };
        flags.append_value(ok);
    }

    Ok(flags.finish().into_series())
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
mod iban_parsing;
mod isin_parsing;
mod utils;
use pyo3::{pymodule, types::PyModule, PyResult, Python};

Expand Down
55 changes: 54 additions & 1 deletion tests/test_correctness.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from polars.testing import assert_frame_equal
from typing import List


@pytest.mark.parametrize(
"df, cc, cd, reason, is_valid, bban, bank_id, branch_id",
[
Expand Down Expand Up @@ -81,3 +80,57 @@ def test_iban1(

assert_frame_equal(test1, ans)
assert_frame_equal(test2, ans)

@pytest.mark.parametrize(
    "df, cc, cd, sec_id, is_valid",
    [
        (
            pl.DataFrame(
                {
                    "isin": [
                        "US0378331005",  # AAPL
                        "US0378331008",  # AAPL w/ bad check digit
                        "US037833100",  # AAPL w/o check digit
                        "CA00206RGB20",  # Canadian
                        "XS1550212416",  # Other
                        None,
                    ]
                }
            ),
            ["US", None, None, "CA", "XS", None],
            ["5", None, None, "0", "6", None],
            ["037833100", None, None, "00206RGB2", "155021241", None],
            [True, False, False, True, True, False],
        )
    ],
)
def test_isin1(
    df: pl.DataFrame,
    cc: List[str],
    cd: List[str],
    sec_id: List[str],
    is_valid: List[bool],
):
    """
    Checks the single-field ISIN expressions against hand-written expectations, once
    through the eager engine and once through the lazy engine.
    """
    # One shared expression list so the eager and lazy runs cannot drift apart.
    exprs = [
        pl.col("isin").isin.country_code().alias("country_code"),
        pl.col("isin").isin.check_digit().alias("check_digit"),
        pl.col("isin").isin.security_id().alias("security_id"),
        pl.col("isin").isin.is_valid().alias("is_valid"),
    ]
    test1 = df.select(exprs)
    test2 = df.lazy().select(exprs).collect()

    ans = pl.DataFrame(
        {
            "country_code": cc,
            "check_digit": cd,
            "security_id": sec_id,
            "is_valid": is_valid,
        }
    )

    assert_frame_equal(test1, ans)
    assert_frame_equal(test2, ans)

0 comments on commit 35af446

Please sign in to comment.