intermediary work

tidypyverse · Feb 28, 2024 · 52da1e3 · 52da1e3
1 parent 462a3d1
commit 52da1e3
Show file tree

Hide file tree

Showing 7 changed files with 2,921 additions and 908 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # Changelog
 
+## v0.3.1 (24th Jan 2024)
+- Handled deprecation [warnings](https://github.com/tidypyverse/tidypandas/issues/51) created by pandas >= 2.0.0. Use of `group_modify` still produces a warning. This is due to the fact that the `include_groups` argument needs to be set to False by default.
+- `unnest` method allows a list of numpy ndarrays (as column to be unnested)
+- `unnest_wider` method is introduced
+- `fill_na` reimplemented to be performant (using Series.groupby.xfill), no user facing change
+
+
 ## v0.3.0 (16th Aug 2023)
 - `tidyselect` is introduced in a few verbs (methods) which support `starts_with`, `ends_with` and `contains`.
 - `summarise` now supports returning an iterable within a list as an output, no longer restricts the output to a scalar.

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,18 +1,18 @@
 [tool.poetry]
 name = "tidypandas"
-version = "0.3.0"
+version = "0.3.1"
 description = "A grammar of data manipulation for pandas inspired by tidyverse"
 authors = ["Srikanth Komala Sheshachala <[email protected]>", "Ashish Raj <[email protected]>"]
 maintainers = ["Srikanth Komala Sheshachala <[email protected]>", "Ashish Raj <[email protected]>"]
 license = "MIT"
 readme = "README.md"
-homepage = "https://talegari.github.io/tidypandas/"
-repository = "https://github.com/talegari/tidypandas"
-documentation = "https://talegari.github.io/tidypandas/_build/html/autoapi/index.html"
+homepage = "https://tidypyverse.github.io/tidypandas/"
+repository = "https://github.com/tidypyverse/tidypandas"
+documentation = "https://tidypyverse.github.io/tidypandas/_build/html/autoapi/index.html"
 classifiers = ["Development Status :: 4 - Beta"]
 
 [tool.poetry.dependencies]
-python = "^3.8"
+python = "^3.9"
 pandas = ">=1.0.0"
 collections-extended = ">=2.0.2"
 skimpy = {version = ">=0.0.5", optional = true}

diff --git a/src/tidypandas/tidy_accessor.py b/src/tidypandas/tidy_accessor.py
@@ -555,6 +555,38 @@ def unnest(self, nest_column_name = 'data'):
         tf = tidyframe(self._obj, copy = False, check = False)
         return tf.unnest(nest_column_name = nest_column_name).to_pandas(copy = False)
 
+    def unnest_wider(self, nest_column_name, names_sep = None):
+        '''
+        Unnest a column of dicts into multiple columns 
+        
+        Parameters
+        ----------
+        nest_column_name: str
+            Name of the column to be unnested
+        
+        Returns
+        -------
+        pandas.DataFrame
+        
+        Notes
+        -----
+        1. unnest_wider is helpful when nested input, typically parsed from
+        JSON, needs to be unnested.
+        
+        Examples
+        --------
+        >>> from tidypandas.tidy_accessor import tp
+        >>> import pandas as pd
+        >>> df = pd.DataFrame({'x': [1,2]})
+        >>> df['y'] = pd.Series([{"a": 1, 'b': 2}, {'a': 3, 'b': 4}])
+        >>> df.tp.unnest_wider('y')
+        '''
+
+        tf  = tidyframe(self._obj, copy = False, check = True)
+        res = tf.unnest_wider(nest_column_name = nest_column_name,
+                              names_sep = names_sep
+                              )
+        return res.to_pandas(copy = False)
 
     def split(self, by):
         tf = tidyframe(self._obj, copy = False, check = False)

diff --git a/src/tidypandas/tidy_utils.py b/src/tidypandas/tidy_utils.py
@@ -15,9 +15,9 @@
 # simplify
 # -----------------------------------------------------------------------------
 
-def simplify(pdf
-             , sep = "__"
-             , verbose = False
+def simplify(pdf,
+             sep = "__",
+             verbose = False
              ):
     '''
     simplify(pdf)
@@ -184,7 +184,7 @@ def is_simple(pdf, verbose = False):
     
         1. Column names (x.columns) are an unnamed pd.Index object of unique 
            strings. Column names do not start with "_".
-        2. Row names (x.index) is a numeric index (x.indx.is_numeric() is True).
+        2. Row names (x.index) is a numeric index (x.index.is_numeric() is True).
         
     Returns
     -------
@@ -207,8 +207,9 @@ def is_simple(pdf, verbose = False):
     col_flag = not isinstance(pdf.columns, pd.MultiIndex)
 
     # check if row index is numeric
+    from pandas.api.types import is_numeric_dtype
     flag_numeric_index = False
-    if pdf.index.is_numeric():
+    if is_numeric_dtype(pdf.index):
         flag_numeric_index = True 
 
     # check if all column names are strings