From acc93c481e87489131635fdabfc1acab9a8aa2e9 Mon Sep 17 00:00:00 2001 From: abstractqqq Date: Sun, 3 Mar 2024 22:00:43 -0500 Subject: [PATCH 1/2] added basic url --- Cargo.lock | 133 +++++++++++++++++++++--------- Cargo.toml | 3 +- README.md | 3 +- examples/basics.ipynb | 84 ++++++++++++++++++- python/polars_istr/__init__.py | 4 +- python/polars_istr/iban.py | 2 +- python/polars_istr/isin.py | 10 +-- python/polars_istr/url.py | 100 +++++++++++++++++++++++ src/iban_parsing/mod.rs | 6 +- src/isin_parsing/mod.rs | 2 +- src/lib.rs | 1 + src/url_parsing/mod.rs | 144 +++++++++++++++++++++++++++++++++ src/utils/mod.rs | 34 -------- tests/test_correctness.py | 118 ++++++++++++++++++++++++--- 14 files changed, 547 insertions(+), 97 deletions(-) create mode 100644 python/polars_istr/url.py create mode 100644 src/url_parsing/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 041680f..04924ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -458,6 +458,15 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee1b05cbd864bcaecbd3455d6d967862d446e4ebfc3c2e5e5b9841e53cba6673" +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + [[package]] name = "futures" version = "0.3.30" @@ -642,6 +651,16 @@ dependencies = [ "arrayvec", ] +[[package]] +name = "idna" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + [[package]] name = "indexmap" version = "2.1.0" @@ -971,9 +990,9 @@ dependencies = [ [[package]] name = "percent-encoding" -version = "2.3.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "phf" @@ -1042,9 +1061,9 @@ dependencies = [ [[package]] name = "polars" -version = "0.38.0" +version = "0.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7588a5b3b6044f3bf512163390a10641c0a6836dd3a72a5fffd7ff28fd63d61" +checksum = "3a26ef94cfedd5915da990a0b4740cca17b5854bd44a8e8c741fe732c02aac37" dependencies = [ "getrandom", "polars-arrow", @@ -1062,9 +1081,9 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.38.0" +version = "0.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c111e21876bbf22958717b42b0422afb2b029b4360def55d5a955090b4bc2f9f" +checksum = "48e71d30a9fa503bc3baaff3b4c48f08d402c442a50ea7fb9d475ce7b575425a" dependencies = [ "ahash", "atoi", @@ -1108,9 +1127,9 @@ dependencies = [ [[package]] name = "polars-compute" -version = "0.38.0" +version = "0.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "491e9ba0cb71df57a5b9be2886274cae7d627f4bd4bd50721293405e8272d232" +checksum = "26342dea46502e8a3322f484062869c2fa49185d512bce4fb44f350b559b4eae" dependencies = [ "bytemuck", "either", @@ -1124,9 +1143,9 @@ dependencies = [ [[package]] name = "polars-core" -version = "0.38.0" +version = "0.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebd0697dd64322f6b05904f2c062bbd132dbb80ab43a3b6ae651d9fefdc8fa9f" +checksum = "99e0885a8f1bd1f4d928f5eaa852825bf647b6b5e21e171b6af838f77b6565f3" dependencies = [ "ahash", "bitflags 2.4.1", @@ -1156,9 +1175,9 @@ dependencies = [ [[package]] name = "polars-error" -version = "0.38.0" +version = "0.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d1b1d0dfdde15c96219768ff6bbea4e3e208d99fb7ac469ed8298b9d2420dd3" +checksum = "d259d905c17d8e8b2de1eadc94dc4186bf1d325f1be81b4087afea22a6f753d6" dependencies = [ "polars-arrow-format", "regex", @@ -1178,9 +1197,9 @@ dependencies = [ [[package]] name = "polars-io" -version = "0.38.0" +version = "0.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e182a57bf534af715670d8113426f9899891f9bf04c5d5f05afcef9ec431d72e" +checksum = "45f694b918ba2ee7e6f13e8415598f94009c390a9e61c95e6b9c26c8fe1a1a54" dependencies = [ "ahash", "async-trait", @@ -1213,9 +1232,9 @@ dependencies = [ [[package]] name = "polars-lazy" -version = "0.38.0" +version = "0.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26e20ec904cae9edc88a9d472918125bd2e6018c9bd4ba532144067b5985b7c8" +checksum = "56e3b40272d24142bcecb2979b19ec8d8c1a14036cb3cea09ce8fb8a4a43bcde" dependencies = [ "ahash", "bitflags 2.4.1", @@ -1236,9 +1255,9 @@ dependencies = [ [[package]] name = "polars-ops" -version = "0.38.0" +version = "0.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93cf6f38ce1ad766c1b3e37b5c4910818e4cbfc692cfee1dfcca527221a74fb6" +checksum = "cd5bad61c2fa1977eb65bb719f12d4f68b908edf1106b91b3ab9615f9df8843e" dependencies = [ "ahash", "argminmax", @@ -1266,9 +1285,9 @@ dependencies = [ [[package]] name = "polars-parquet" -version = "0.38.0" +version = "0.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd23a03bcc64270734cec1cc52305a7613cd8b8b74cb050b547614a6325d5446" +checksum = "06d84fb9b005a19ca523406df371d9329466ae87df48922d0d3d8955072502a4" dependencies = [ "ahash", "async-stream", @@ -1292,9 +1311,9 @@ dependencies = [ [[package]] name = "polars-pipe" -version = "0.38.0" +version = "0.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0677b3c7ebbf47094de3a6649e6bad94deb7aa153782009bd7ae307718e274f1" +checksum = "58097bef7208a5b833c4d832d948026854917b4a219d55ab1779eb36b59fac0f" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -1317,9 +1336,9 @@ dependencies = [ [[package]] name = "polars-plan" -version = "0.38.0" +version = "0.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eea6273d49a3fc223d533582cb2b682fb7f26cdff7848b919fa51fa62a9385da" +checksum = "56493c0e13aaccfcae59985db34da30cd4893e57edc9715d8688c96d7e911d47" dependencies = [ "ahash", "bytemuck", @@ -1342,9 +1361,9 @@ dependencies = [ [[package]] name = "polars-row" -version = "0.38.0" +version = "0.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebb7331549d15fe84ea829b95fb325aabb7d7b3977b402859dc8008d36163c96" +checksum = "7def6f9fc14fbfc0550bad615a757f3e1d86c00983c5ff23166fcdf205438d51" dependencies = [ "bytemuck", "polars-arrow", @@ -1354,9 +1373,9 @@ dependencies = [ [[package]] name = "polars-sql" -version = "0.38.0" +version = "0.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8523d71083965b5b07b7f7b97606b9f89ee54b3fcaf8e0edda92a190f2ece6a7" +checksum = "4f9d7de9dca8170a20b6c4cb7bafaf724abe88e807646bc3c2e98f13a34a7c4c" dependencies = [ "hex", "polars-arrow", @@ -1372,9 +1391,9 @@ dependencies = [ [[package]] name = "polars-time" -version = "0.38.0" +version = "0.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ecb651345ea747a22c1960fe15bb3dc7de812e9e4dbf1874f8546a588c1598d" +checksum = "162c815c3cb0f859da40f056c8a0a9c4247900e1275702ae399192ea60acac2a" dependencies = [ "atoi", "chrono", @@ -1392,9 +1411,9 @@ dependencies = [ [[package]] name = "polars-utils" -version = "0.38.0" +version = "0.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2cbba48cee3aa2633a29a8f07290c2f1a8036dcfb2588091b5258db84fdb876" +checksum = "7fca7938ee789314ac92a0bf6c1c4e5eaeb5e428241df2519fd70f21dba49194" dependencies = [ "ahash", "bytemuck", @@ -1421,6 +1440,7 @@ dependencies = [ "polars", "pyo3", "pyo3-polars", + "url", ] [[package]] @@ -1590,9 +1610,9 @@ dependencies = [ [[package]] name = "rayon" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1" +checksum = "e4963ed1bc86e4f3ee217022bd855b297cef07fb9eac5dfa1f788b220b49b3bd" dependencies = [ "either", "rayon-core", @@ -1600,9 +1620,9 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.12.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ "crossbeam-deque", "crossbeam-utils", @@ -1897,6 +1917,21 @@ dependencies = [ "syn 2.0.48", ] +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "tokio" version = "1.36.0" @@ -1926,12 +1961,27 @@ dependencies = [ "tokio", ] +[[package]] +name = "unicode-bidi" +version = "0.3.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" + [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "unicode-normalization" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +dependencies = [ + "tinyvec", +] + [[package]] name = "unicode-reverse" version = "1.0.8" @@ -1959,6 +2009,17 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" +[[package]] +name = "url" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + [[package]] name = "uuid" version = "1.7.0" diff --git a/Cargo.toml b/Cargo.toml index 040eb29..f78bf1d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,9 +12,10 @@ crate-type = ["cdylib"] [dependencies] pyo3 = {version = "*", features = ["extension-module", "abi3-py38"]} pyo3-polars = {version = "0.12", features = ["derive"]} -polars = {version = "0.38", features = ["performant", "lazy", "nightly", "parquet"]} +polars = {version = "0.38.1", features = ["performant", "lazy", "nightly", "parquet"]} iban_validate = "4.0.1" isin = "0.1.18" +url = "2.5.0" [target.'cfg(any(not(target_os = "linux"), use_mimalloc))'.dependencies] mimalloc = { version = "0.1", default-features = false } diff --git a/README.md b/README.md index 7d82ffa..d67e060 100644 --- a/README.md +++ b/README.md @@ -13,9 +13,10 @@ # The Project -Processing IBAN, URLs, etc., and other standard format data in Polars. +Processing IBAN, ISINs, URLs, etc., and other standard format data in Polars. # Acknowledgements 1. Iban is powered by [iban_validate](https://crates.io/crates/iban_validate) 2. Isin is powered by [isin_rs](https://docs.rs/isin/latest/isin/) +3. URL is powered by [url](https://crates.io/crates/url) \ No newline at end of file diff --git a/examples/basics.ipynb b/examples/basics.ipynb index 06dea49..df13d8d 100644 --- a/examples/basics.ipynb +++ b/examples/basics.ipynb @@ -266,10 +266,92 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "ba5a84bb", + "metadata": {}, + "source": [ + "# URL" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "86bd1835", + "metadata": {}, + "outputs": [], + "source": [ + "df = pl.DataFrame(\n", + " {\n", + " \"url\": [\n", + " \"https://example.com/data.csv#row=4\",\n", + " \"google.com\", \n", + " \"ww.google.com\", \n", + " \"abc123@email.com\", \n", + " \"https://127.0.0.1/\", \n", + " \"https://example.com/\",\n", + " \"file:///tmp/foo\",\n", + " \"https://example.com/products?page=2&sort=desc\",\n", + " None,\n", + " ]\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6d9a2823", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (8, 1)
url
str
"example.com"
null
null
null
"127.0.0.1"
"example.com"
null
null
" + ], + "text/plain": [ + "shape: (8, 1)\n", + "┌─────────────┐\n", + "│ url │\n", + "│ --- │\n", + "│ str │\n", + "╞═════════════╡\n", + "│ example.com │\n", + "│ null │\n", + "│ null │\n", + "│ null │\n", + "│ 127.0.0.1 │\n", + "│ example.com │\n", + "│ null │\n", + "│ null │\n", + "└─────────────┘" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.select(\n", + " pl.col(\"url\").url.host().alias(\"host\"),\n", + " pl.col(\"url\").url.domain().alias(\"domain\"),\n", + " pl.col(\"url\").url.host().alias(\"\"),\n", + " pl.col(\"url\").url.host().alias(\"host\"),\n", + ")" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "9d5b6cbc", + "id": "385d3a3d", "metadata": {}, "outputs": [], "source": [] diff --git a/python/polars_istr/__init__.py b/python/polars_istr/__init__.py index d9256a3..f2b0d5f 100644 --- a/python/polars_istr/__init__.py +++ b/python/polars_istr/__init__.py @@ -1,4 +1,6 @@ from .iban import IbanExt # noqa: E402 from .isin import IsinExt # noqa: E402 +from .url import UrlExt # noqa: E402 + __version__ = "0.1.0" -__all__ = ["IbanExt", "IsinExt"] +__all__ = ["IbanExt", "IsinExt", "UrlExt"] diff --git a/python/polars_istr/iban.py b/python/polars_istr/iban.py index 29c9d84..029aae6 100644 --- a/python/polars_istr/iban.py +++ b/python/polars_istr/iban.py @@ -77,7 +77,7 @@ def is_valid(self) -> pl.Expr: """ return self._expr.register_plugin( lib=_lib, - symbol="pl_iban_valid", + symbol="pl_iban_is_valid", is_elementwise=True, ) diff --git a/python/polars_istr/isin.py b/python/polars_istr/isin.py index 605f1d2..3457be4 100644 --- a/python/polars_istr/isin.py +++ b/python/polars_istr/isin.py @@ -4,13 +4,14 @@ _lib = _get_shared_lib_location(__file__) + @pl.api.register_expr_namespace("isin") class IsinExt: """ This class contains tools for parsing ISIN format data. - Polars Namespace: isin + Polars Namespace: isin Example: pl.col("isin_str").isin.country_code() """ @@ -37,7 +38,7 @@ def check_digit(self) -> pl.Expr: symbol="pl_isin_check_digit", is_elementwise=True, ) - + def security_id(self) -> pl.Expr: """ Returns the 9-digit security identifier of the ISIN, or null if it cannot @@ -55,7 +56,7 @@ def is_valid(self) -> pl.Expr: """ return self._expr.register_plugin( lib=_lib, - symbol="pl_isin_valid", + symbol="pl_isin_is_valid", is_elementwise=True, ) @@ -69,6 +70,3 @@ def extract_all(self) -> pl.Expr: symbol="pl_isin_full", is_elementwise=True, ) - - - diff --git a/python/polars_istr/url.py b/python/polars_istr/url.py new file mode 100644 index 0000000..b27ec6e --- /dev/null +++ b/python/polars_istr/url.py @@ -0,0 +1,100 @@ +from __future__ import annotations +import polars as pl +from polars.utils.udfs import _get_shared_lib_location + +_lib = _get_shared_lib_location(__file__) + + +@pl.api.register_expr_namespace("url") +class UrlExt: + + """ + This class contains tools for parsing URL strings. + + Polars Namespace: url + + Example: pl.col("url_str").url.query() + """ + + def __init__(self, expr: pl.Expr): + self._expr: pl.Expr = expr + + def is_special(self) -> pl.Expr: + """ + Returns a boolean indicating whether the URL has a special scheme or not. + """ + return self._expr.register_plugin( + lib=_lib, + symbol="pl_url_is_special", + is_elementwise=True, + ) + + def host(self) -> pl.Expr: + """ + Returns the host of the URL, if possible. + """ + return self._expr.register_plugin( + lib=_lib, + symbol="pl_url_host", + is_elementwise=True, + ) + + def path(self) -> pl.Expr: + """ + Returns the path part of the URL, if possible. + """ + return self._expr.register_plugin( + lib=_lib, + symbol="pl_url_path", + is_elementwise=True, + ) + + def domain(self) -> pl.Expr: + """ + Returns the domain of the URL, if possible. + """ + return self._expr.register_plugin( + lib=_lib, + symbol="pl_url_domain", + is_elementwise=True, + ) + + def fragment(self) -> pl.Expr: + """ + Returns the fragment of the URL, if possible. + """ + return self._expr.register_plugin( + lib=_lib, + symbol="pl_url_fragment", + is_elementwise=True, + ) + + def query(self) -> pl.Expr: + """ + Returns the query part of the URL, if possible. + """ + return self._expr.register_plugin( + lib=_lib, + symbol="pl_url_query", + is_elementwise=True, + ) + + def is_valid(self) -> pl.Expr: + """ + Returns a boolean indicating whether the string is a valid URL string. + """ + return self._expr.register_plugin( + lib=_lib, + symbol="pl_url_is_valid", + is_elementwise=True, + ) + + def check(self) -> pl.Expr: + """ + Returns a string that explains whether the URL string is valid or not. + """ + return self._expr.register_plugin( + lib=_lib, + symbol="pl_url_check", + is_elementwise=True, + ) diff --git a/src/iban_parsing/mod.rs b/src/iban_parsing/mod.rs index ad5e8e8..8d38116 100644 --- a/src/iban_parsing/mod.rs +++ b/src/iban_parsing/mod.rs @@ -159,7 +159,7 @@ fn pl_iban_bban(inputs: &[Series]) -> PolarsResult { } #[polars_expr(output_type=Boolean)] -fn pl_iban_valid(inputs: &[Series]) -> PolarsResult { +fn pl_iban_is_valid(inputs: &[Series]) -> PolarsResult { let ca = inputs[0].str()?; let out: BooleanChunked = ca.apply_values_generic(|s| s.parse::().is_ok()); Ok(out.into_series()) @@ -169,7 +169,7 @@ fn pl_iban_valid(inputs: &[Series]) -> PolarsResult { fn pl_iban_check(inputs: &[Series]) -> PolarsResult { let ca = inputs[0].str()?; let out = ca.apply_to_buffer(|s, buf| { - let s = match Iban::from_str(s) { + let ss = match Iban::from_str(s) { Ok(_) => "ok".to_string(), Err(e) => match e { iban::ParseIbanError::InvalidBaseIban { source } => match source { @@ -182,7 +182,7 @@ fn pl_iban_check(inputs: &[Series]) -> PolarsResult { iban::ParseIbanError::UnknownCountry(_) => "Invalid country code".to_string(), }, }; - write!(buf, "{}", s).unwrap() + write!(buf, "{}", ss).unwrap() }); Ok(out.into_series()) } diff --git a/src/isin_parsing/mod.rs b/src/isin_parsing/mod.rs index f05b8ad..553a7f5 100644 --- a/src/isin_parsing/mod.rs +++ b/src/isin_parsing/mod.rs @@ -111,7 +111,7 @@ fn pl_isin_check_digit(inputs: &[Series]) -> PolarsResult { } #[polars_expr(output_type=Boolean)] -fn pl_isin_valid(inputs: &[Series]) -> PolarsResult { +fn pl_isin_is_valid(inputs: &[Series]) -> PolarsResult { let ca = inputs[0].str()?; let mut builder = BooleanChunkedBuilder::new("isin_valid", ca.len()); diff --git a/src/lib.rs b/src/lib.rs index 6f94f7d..9dab5b2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ mod iban_parsing; mod isin_parsing; +mod url_parsing; mod utils; use pyo3::{pymodule, types::PyModule, PyResult, Python}; diff --git a/src/url_parsing/mod.rs b/src/url_parsing/mod.rs new file mode 100644 index 0000000..9b054c7 --- /dev/null +++ b/src/url_parsing/mod.rs @@ -0,0 +1,144 @@ +use polars::prelude::*; +use pyo3_polars::derive::polars_expr; +use std::fmt::Write; +use url::Url; + +#[polars_expr(output_type=String)] +fn pl_url_host(inputs: &[Series]) -> PolarsResult { + let ca = inputs[0].str()?; + let mut builder = StringChunkedBuilder::new("host", ca.len()); + + ca.into_iter().for_each(|op_s| { + if let Some(s) = op_s { + if let Ok(u) = Url::parse(s) { + builder.append_option(u.host_str()) + } else { + builder.append_null(); + } + } else { + builder.append_null(); + } + }); + let out = builder.finish(); + Ok(out.into_series()) +} + +#[polars_expr(output_type=String)] +fn pl_url_domain(inputs: &[Series]) -> PolarsResult { + let ca = inputs[0].str()?; + let mut builder = StringChunkedBuilder::new("domain", ca.len()); + + ca.into_iter().for_each(|op_s| { + if let Some(s) = op_s { + if let Ok(u) = Url::parse(s) { + builder.append_option(u.domain()) + } else { + builder.append_null(); + } + } else { + builder.append_null(); + } + }); + let out = builder.finish(); + Ok(out.into_series()) +} + +#[polars_expr(output_type=String)] +fn pl_url_fragment(inputs: &[Series]) -> PolarsResult { + let ca = inputs[0].str()?; + let mut builder = StringChunkedBuilder::new("fragment", ca.len()); + + ca.into_iter().for_each(|op_s| { + if let Some(s) = op_s { + if let Ok(u) = Url::parse(s) { + builder.append_option(u.fragment()) + } else { + builder.append_null(); + } + } else { + builder.append_null(); + } + }); + let out = builder.finish(); + Ok(out.into_series()) +} + +#[polars_expr(output_type=String)] +fn pl_url_path(inputs: &[Series]) -> PolarsResult { + let ca = inputs[0].str()?; + let mut builder = StringChunkedBuilder::new("path", ca.len()); + + ca.into_iter().for_each(|op_s| { + if let Some(s) = op_s { + if let Ok(u) = Url::parse(s) { + builder.append_value(u.path()) + } else { + builder.append_null(); + } + } else { + builder.append_null(); + } + }); + let out = builder.finish(); + Ok(out.into_series()) +} + +#[polars_expr(output_type=String)] +fn pl_url_query(inputs: &[Series]) -> PolarsResult { + let ca = inputs[0].str()?; + let mut builder = StringChunkedBuilder::new("query", ca.len()); + + ca.into_iter().for_each(|op_s| { + if let Some(s) = op_s { + if let Ok(u) = Url::parse(s) { + builder.append_option(u.query()) + } else { + builder.append_null(); + } + } else { + builder.append_null(); + } + }); + let out = builder.finish(); + Ok(out.into_series()) +} + +#[polars_expr(output_type=Boolean)] +fn pl_url_is_special(inputs: &[Series]) -> PolarsResult { + let ca = inputs[0].str()?; + let mut builder = BooleanChunkedBuilder::new("is_special", ca.len()); + + ca.into_iter().for_each(|op_s| { + if let Some(s) = op_s { + if let Ok(u) = Url::parse(s) { + builder.append_value(u.is_special()); + } else { + builder.append_null(); + } + } else { + builder.append_null(); + } + }); + let out = builder.finish(); + Ok(out.into_series()) +} + +#[polars_expr(output_type=Boolean)] +fn pl_url_is_valid(inputs: &[Series]) -> PolarsResult { + let ca = inputs[0].str()?; + let out: BooleanChunked = ca.apply_values_generic(|s| s.parse::().is_ok()); + Ok(out.into_series()) +} + +#[polars_expr(output_type=String)] +fn pl_url_check(inputs: &[Series]) -> PolarsResult { + let ca = inputs[0].str()?; + let out = ca.apply_to_buffer(|s, buf| { + let ss = match Url::parse(s) { + Ok(_) => "ok".to_string(), + Err(e) => e.to_string(), + }; + write!(buf, "{}", ss).unwrap() + }); + Ok(out.into_series()) +} diff --git a/src/utils/mod.rs b/src/utils/mod.rs index e4f98a3..8b13789 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -1,35 +1 @@ -// use polars::{ -// datatypes::{DataType, Field}, -// error::PolarsResult, -// // frame::DataFrame, -// lazy::dsl::FieldsMapper, -// // series::Series, -// }; -// // ------------------------------------------------------------------------------- -// // Common, Resuable Functions -// // ------------------------------------------------------------------------------- - -// // Shared splitting method -// pub fn split_offsets(len: usize, n: usize) -> Vec<(usize, usize)> { -// if n == 1 { -// vec![(0, len)] -// } else { -// let chunk_size = len / n; -// (0..n) -// .map(|partition| { -// let offset = partition * chunk_size; -// let len = if partition == (n - 1) { -// len - offset -// } else { -// chunk_size -// }; -// (partition * chunk_size, len) -// }) -// .collect() -// } -// } - -// // ------------------------------------------------------------------------------- -// // Common Output Types -// // ------------------------------------------------------------------------------- diff --git a/tests/test_correctness.py b/tests/test_correctness.py index d8e6e48..e0b15bc 100644 --- a/tests/test_correctness.py +++ b/tests/test_correctness.py @@ -6,7 +6,7 @@ import polars_istr # noqa: F401 from polars.testing import assert_frame_equal -from typing import List +from typing import List, Optional @pytest.mark.parametrize( @@ -35,13 +35,13 @@ ) def test_iban1( df: pl.DataFrame, - cc: List[str], - cd: List[str], - reason: List[str], - is_valid: List[bool], - bban: List[str], - bank_id: List[str], - branch_id: List[str], + cc: List[Optional[str]], + cd: List[Optional[str]], + reason: List[Optional[str]], + is_valid: List[Optional[bool]], + bban: List[Optional[str]], + bank_id: List[Optional[str]], + branch_id: List[Optional[str]], ): test1 = df.select( pl.col("iban").iban.country_code().alias("country_code"), @@ -83,6 +83,100 @@ def test_iban1( assert_frame_equal(test2, ans) +@pytest.mark.parametrize( + "df, host, domain, fragment, path, query, check, is_valid, is_special", + [ + ( + pl.DataFrame( + { + "url": [ + "https://example.com/data.csv#row=4", + "google.com", + "ww.google.com", + "abc123@email.com", + "https://127.0.0.1/", + "https://test.com/", + "file:///tmp/foo", + "https://example.com/products?page=2&sort=desc", + None, + ] + } + ), + ["example.com", None, None, None, "127.0.0.1", "test.com", None, "example.com", None], + ["example.com", None, None, None, None, "test.com", None, "example.com", None], + ["row=4", None, None, None, None, None, None, None, None], + ["/data.csv", None, None, None, "/", "/", "/tmp/foo", "/products", None], + [None, None, None, None, None, None, None, "page=2&sort=desc", None], + [ + "ok", + "relative URL without a base", + "relative URL without a base", + "relative URL without a base", + "ok", + "ok", + "ok", + "ok", + None, + ], + [True, False, False, False, True, True, True, True, None], + [True, None, None, None, True, True, True, True, None], + ) + ], +) +def test_url1( + df: pl.DataFrame, + host: List[Optional[str]], + domain: List[Optional[str]], + fragment: List[Optional[str]], + path: List[Optional[str]], + query: List[Optional[str]], + check: List[Optional[str]], + is_valid: List[Optional[bool]], + is_special: List[Optional[bool]], +): + test1 = df.select( + pl.col("url").url.host().alias("host"), + pl.col("url").url.domain().alias("domain"), + pl.col("url").url.fragment().alias("fragment"), + pl.col("url").url.path().alias("path"), + pl.col("url").url.query().alias("query"), + pl.col("url").url.check().alias("check"), + pl.col("url").url.is_valid().alias("is_valid"), + pl.col("url").url.is_special().alias("is_special"), + ) + + test2 = ( + df.lazy() + .select( + pl.col("url").url.host().alias("host"), + pl.col("url").url.domain().alias("domain"), + pl.col("url").url.fragment().alias("fragment"), + pl.col("url").url.path().alias("path"), + pl.col("url").url.query().alias("query"), + pl.col("url").url.check().alias("check"), + pl.col("url").url.is_valid().alias("is_valid"), + pl.col("url").url.is_special().alias("is_special"), + ) + .collect() + ) + + ans = pl.DataFrame( + { + "host": host, + "domain": domain, + "fragment": fragment, + "path": path, + "query": query, + "check": check, + "is_valid": is_valid, + "is_special": is_special, + } + ) + + assert_frame_equal(test1, ans) + assert_frame_equal(test2, ans) + + @pytest.mark.parametrize( "df, cc, cd, sec_id, is_valid", [ @@ -108,10 +202,10 @@ def test_iban1( ) def test_isin1( df: pl.DataFrame, - cc: List[str], - cd: List[str], - sec_id: List[str], - is_valid: List[str], + cc: List[Optional[str]], + cd: List[Optional[str]], + sec_id: List[Optional[str]], + is_valid: List[Optional[str]], ): test1 = df.select( pl.col("isin").isin.country_code().alias("country_code"), From 491e53d0191de9c6cc8bd5b5716f180fc35ef459 Mon Sep 17 00:00:00 2001 From: abstractqqq Date: Sun, 3 Mar 2024 22:02:24 -0500 Subject: [PATCH 2/2] update examples --- examples/basics.ipynb | 62 ++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/examples/basics.ipynb b/examples/basics.ipynb index df13d8d..8577fc2 100644 --- a/examples/basics.ipynb +++ b/examples/basics.ipynb @@ -289,7 +289,7 @@ " \"ww.google.com\", \n", " \"abc123@email.com\", \n", " \"https://127.0.0.1/\", \n", - " \"https://example.com/\",\n", + " \"https://test.com/\",\n", " \"file:///tmp/foo\",\n", " \"https://example.com/products?page=2&sort=desc\",\n", " None,\n", @@ -314,24 +314,36 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (8, 1)
url
str
"example.com"
null
null
null
"127.0.0.1"
"example.com"
null
null
" + "shape: (9, 8)
hostdomainfragmentpathquerycheckis_validis_special
strstrstrstrstrstrboolbool
"example.com""example.com""row=4""/data.csv"null"ok"truetrue
nullnullnullnullnull"relative URL w…falsenull
nullnullnullnullnull"relative URL w…falsenull
nullnullnullnullnull"relative URL w…falsenull
"127.0.0.1"nullnull"/"null"ok"truetrue
"test.com""test.com"null"/"null"ok"truetrue
nullnullnull"/tmp/foo"null"ok"truetrue
"example.com""example.com"null"/products""page=2&sort=de…"ok"truetrue
nullnullnullnullnullnullnullnull
" ], "text/plain": [ - "shape: (8, 1)\n", - "┌─────────────┐\n", - "│ url │\n", - "│ --- │\n", - "│ str │\n", - "╞═════════════╡\n", - "│ example.com │\n", - "│ null │\n", - "│ null │\n", - "│ null │\n", - "│ 127.0.0.1 │\n", - "│ example.com │\n", - "│ null │\n", - "│ null │\n", - "└─────────────┘" + "shape: (9, 8)\n", + "┌────────────┬────────────┬──────────┬───────────┬────────────┬────────────┬──────────┬────────────┐\n", + "│ host ┆ domain ┆ fragment ┆ path ┆ query ┆ check ┆ is_valid ┆ is_special │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str ┆ str ┆ str ┆ str ┆ bool ┆ bool │\n", + "╞════════════╪════════════╪══════════╪═══════════╪════════════╪════════════╪══════════╪════════════╡\n", + "│ example.co ┆ example.co ┆ row=4 ┆ /data.csv ┆ null ┆ ok ┆ true ┆ true │\n", + "│ m ┆ m ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ null ┆ null ┆ null ┆ null ┆ null ┆ relative ┆ false ┆ null │\n", + "│ ┆ ┆ ┆ ┆ ┆ URL ┆ ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ without a ┆ ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ base ┆ ┆ │\n", + "│ null ┆ null ┆ null ┆ null ┆ null ┆ relative ┆ false ┆ null │\n", + "│ ┆ ┆ ┆ ┆ ┆ URL ┆ ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ without a ┆ ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ base ┆ ┆ │\n", + "│ null ┆ null ┆ null ┆ null ┆ null ┆ relative ┆ false ┆ null │\n", + "│ ┆ ┆ ┆ ┆ ┆ URL ┆ ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ without a ┆ ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ base ┆ ┆ │\n", + "│ 127.0.0.1 ┆ null ┆ null ┆ / ┆ null ┆ ok ┆ true ┆ true │\n", + "│ test.com ┆ test.com ┆ null ┆ / ┆ null ┆ ok ┆ true ┆ true │\n", + "│ null ┆ null ┆ null ┆ /tmp/foo ┆ null ┆ ok ┆ true ┆ true │\n", + "│ example.co ┆ example.co ┆ null ┆ /products ┆ page=2&sor ┆ ok ┆ true ┆ true │\n", + "│ m ┆ m ┆ ┆ ┆ t=desc ┆ ┆ ┆ │\n", + "│ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n", + "└────────────┴────────────┴──────────┴───────────┴────────────┴────────────┴──────────┴────────────┘" ] }, "execution_count": 8, @@ -343,18 +355,14 @@ "df.select(\n", " pl.col(\"url\").url.host().alias(\"host\"),\n", " pl.col(\"url\").url.domain().alias(\"domain\"),\n", - " pl.col(\"url\").url.host().alias(\"\"),\n", - " pl.col(\"url\").url.host().alias(\"host\"),\n", + " pl.col(\"url\").url.fragment().alias(\"fragment\"),\n", + " pl.col(\"url\").url.path().alias(\"path\"),\n", + " pl.col(\"url\").url.query().alias(\"query\"),\n", + " pl.col(\"url\").url.check().alias(\"check\"),\n", + " pl.col(\"url\").url.is_valid().alias(\"is_valid\"),\n", + " pl.col(\"url\").url.is_special().alias(\"is_special\"),\n", ")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "385d3a3d", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {