From ec28e789c2b69b54d36caa3459b389c182bd81c0 Mon Sep 17 00:00:00 2001
From: "Zheng, Yuhang" <YZheng83@MassMutual.com>
Date: Wed, 31 Jul 2019 14:48:24 -0400
Subject: [PATCH 1/6] add extra dtype attribute to define the dtype of
 dataframe

---
 impyute/dataset/corrupt.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/impyute/dataset/corrupt.py b/impyute/dataset/corrupt.py
index 33d1b8b..0b75705 100644
--- a/impyute/dataset/corrupt.py
+++ b/impyute/dataset/corrupt.py
@@ -23,10 +23,10 @@ class Corruptor:
         Overwrite values with MNAR placed NaN's.
 
     """
-    def __init__(self, data, thr=0.2):
+    def __init__(self, data, thr=0.2, dtype=np.float):
         self.dtype = data.dtype
         self.shape = np.shape(data)
-        self.data = data.astype(np.float)
+        self.data = data.astype(dtype)
         self.thr = thr
 
     def mcar(self):

From c10a4feb6ab4bf5af4a9a33f8078c5b3af491823 Mon Sep 17 00:00:00 2001
From: "Zheng, Yuhang" <YZheng83@MassMutual.com>
Date: Wed, 31 Jul 2019 14:49:13 -0400
Subject: [PATCH 2/6] add randc function to randomly generate dataset with
 categorical data

---
 impyute/dataset/base.py | 57 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 52 insertions(+), 5 deletions(-)

diff --git a/impyute/dataset/base.py b/impyute/dataset/base.py
index f492a8e..5f2fe9c 100644
--- a/impyute/dataset/base.py
+++ b/impyute/dataset/base.py
@@ -1,6 +1,11 @@
 """ Shared functions to load/generate data """
 import numpy as np
+import string
+import random
+import math
+import itertools
 from impyute.dataset.corrupt import Corruptor
+from impyute.util import BadInputError
 
 def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"):
     """ Return randomly generated dataset of numbers with uniformly
@@ -15,8 +20,8 @@ def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int")
     shape:tuple(optional)
         Size of the randomly generated data
     missingness: ('mcar', 'mar', 'mnar')
-        Type of missigness you want in your dataset
-    th: float between [0,1]
+        Type of missingness you want in your dataset
+    thr: float between [0,1]
         Percentage of missing data in generated data
     dtype: ('int','float')
         Type of data
@@ -45,8 +50,8 @@ def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"
     shape:tuple(optional)
         Size of the randomly generated data
     missingness: ('mcar', 'mar', 'mnar')
-        Type of missigness you want in your dataset
-    th: float between [0,1]
+        Type of missingness you want in your dataset
+    thr: float between [0,1]
         Percentage of missing data in generated data
     dtype: ('int','float')
         Type of data
@@ -65,6 +70,48 @@ def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"
     raw_data = getattr(corruptor, missingness)()
     return raw_data
 
+def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2):
+    """ Return randomly generated dataset with uniformly distributed categorical data (alphabetic character)
+
+    Parameters
+    ----------
+    :param nlevels: int
+        Specify the number of different categories in the dataset
+    :param shape: tuple(optional)
+        Size of the randomly generated data
+    :param missingness: string in ('mcar', 'mar', 'mnar')
+        Type of missingness you want in your dataset
+    :param thr: float between [0,1]
+        Percentage of missing data in generated data
+    :return:
+    """
+    if shape[0]*shape[1] < nlevels:
+        raise BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape")
+
+    length = len(string.ascii_lowercase)
+    n_fold = math.floor(math.log(nlevels, length))
+    cat_pool = list(string.ascii_lowercase)
+
+    # when nlevel > 26, the alphabetical character is used up, need to generate extra strings as categorical data
+    if n_fold > 0:
+        for i in range(2, n_fold+2):
+            pool_candidate = list(itertools.product(string.ascii_lowercase, repeat=i))
+            cat_pool.extend([''.join(w) for w in pool_candidate])
+            if len(cat_pool) > nlevels:
+                break
+
+    cat = random.sample(cat_pool, nlevels)
+    data = np.random.choice(cat, shape, replace=True)
+
+    # make sure the data frame has nlevel different categories
+    while len(np.unique(data)) != nlevels:
+        data = np.random.choice(cat, shape, replace=True)
+
+    corruptor = Corruptor(data, thr=thr, dtype=np.str)
+    raw_data = getattr(corruptor, missingness)()
+    return raw_data
+
+
 
 def test_data(mask=np.zeros((3, 3), dtype=bool)):
     """ Returns a dataset to use with tests (INTERNAL USE - FOR UNIT TESTING)
@@ -98,4 +145,4 @@ def mnist(missingness="mcar", thr=0.2):
     dataset = fetch_mldata('MNIST original')
     corruptor = Corruptor(dataset.data, thr=thr)
     data = getattr(corruptor, missingness)()
-    return {"X": data, "Y": dataset.target}
+    return {"X": data, "Y": dataset.target}
\ No newline at end of file

From 8731d10ef4e079179358d1e539a16a203fb922f9 Mon Sep 17 00:00:00 2001
From: "Zheng, Yuhang" <YZheng83@MassMutual.com>
Date: Wed, 31 Jul 2019 14:49:41 -0400
Subject: [PATCH 3/6] add test cases for randc function

---
 test/dataset/test_randc.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 test/dataset/test_randc.py

diff --git a/test/dataset/test_randc.py b/test/dataset/test_randc.py
new file mode 100644
index 0000000..b65b1d4
--- /dev/null
+++ b/test/dataset/test_randc.py
@@ -0,0 +1,26 @@
+import numpy as np
+import pytest
+from impyute.dataset.base import randc
+from impyute.util import BadInputError
+
+def test_raise_error_nlevel_exceed_shape():
+    with pytest.raises(BadInputError) as e:
+        randc(shape=(2, 2))
+    expected = "nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape"
+    assert str(e.value) == expected
+
+@pytest.mark.parametrize("nlevels, shape", [(5, (5,5)), (9, (3,4)), (100, (20, 20))])
+def test_nlevel_categories(nlevels, shape):
+    """Ideally the returned matrix has nlevels + 1 distinct values; the +1 is the np.nan introduced by the Corruptor class.
+       However, if the missing values introduced by Corruptor happen to replace every occurrence of some category, the
+       number of distinct values will be < nlevels + 1.
+    """
+    dataframe = randc(nlevels, shape)
+    assert len(np.unique(dataframe)) <= nlevels + 1
+
+
+@pytest.mark.parametrize("nlevels, shape", [(5, (5,5)), (9, (3, 4)), (100, (20, 20))])
+def test_dataframe_shape(nlevels, shape):
+    """test if the returned data frame has desired shape"""
+    dataframe = randc(nlevels, shape)
+    assert dataframe.shape == shape

From f1f9337afa52f7280b727b23f5a3444d9cc6e4d3 Mon Sep 17 00:00:00 2001
From: "Zheng, Yuhang" <YZheng83@MassMutual.com>
Date: Thu, 1 Aug 2019 10:59:54 -0400
Subject: [PATCH 4/6] explicitly cast math.floor() into int for python 2
 compatibility

---
 impyute/dataset/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/impyute/dataset/base.py b/impyute/dataset/base.py
index 5f2fe9c..8c887dc 100644
--- a/impyute/dataset/base.py
+++ b/impyute/dataset/base.py
@@ -89,7 +89,7 @@ def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2):
         raise BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape")
 
     length = len(string.ascii_lowercase)
-    n_fold = math.floor(math.log(nlevels, length))
+    n_fold = int(math.floor(math.log(nlevels, length)))
     cat_pool = list(string.ascii_lowercase)
 
     # when nlevel > 26, the alphabetical character is used up, need to generate extra strings as categorical data

From 4a0d6a9e0db1e8b39b81093b58f9c64e9f744667 Mon Sep 17 00:00:00 2001
From: "Zheng, Yuhang" <YZheng83@MassMutual.com>
Date: Thu, 1 Aug 2019 11:33:15 -0400
Subject: [PATCH 5/6] convert the randc docstring to NumPy style

---
 impyute/dataset/base.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/impyute/dataset/base.py b/impyute/dataset/base.py
index 8c887dc..5f21b4e 100644
--- a/impyute/dataset/base.py
+++ b/impyute/dataset/base.py
@@ -75,15 +75,18 @@ def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2):
 
     Parameters
     ----------
-    :param nlevels: int
+    nlevels: int
         Specify the number of different categories in the dataset
-    :param shape: tuple(optional)
+    shape: tuple(optional)
         Size of the randomly generated data
-    :param missingness: string in ('mcar', 'mar', 'mnar')
+    missingness: string in ('mcar', 'mar', 'mnar')
         Type of missingness you want in your dataset
-    :param thr: float between [0,1]
+    thr: float between [0,1]
         Percentage of missing data in generated data
-    :return:
+
+    Returns
+    -------
+    numpy.ndarray
     """
     if shape[0]*shape[1] < nlevels:
         raise BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape")

From b1ac20cca5a628ebd4a8c992d638185f63939146 Mon Sep 17 00:00:00 2001
From: "Zheng, Yuhang" <YZheng83@MassMutual.com>
Date: Thu, 1 Aug 2019 11:33:32 -0400
Subject: [PATCH 6/6] rename th to thr in the Corruptor docstring

---
 impyute/dataset/corrupt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/impyute/dataset/corrupt.py b/impyute/dataset/corrupt.py
index 0b75705..846b27c 100644
--- a/impyute/dataset/corrupt.py
+++ b/impyute/dataset/corrupt.py
@@ -9,7 +9,7 @@ class Corruptor:
     ----------
     data: np.ndarray
         Matrix of values with no NaN's that you want to add NaN's to.
-    th: float (optional)
+    thr: float (optional)
         The percentage of null values you want in your dataset, a number
         between 0 and 1.