From ec28e789c2b69b54d36caa3459b389c182bd81c0 Mon Sep 17 00:00:00 2001 From: "Zheng, Yuhang" Date: Wed, 31 Jul 2019 14:48:24 -0400 Subject: [PATCH 1/6] add extra dtype attribute to define the dtype of dataframe --- impyute/dataset/corrupt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/impyute/dataset/corrupt.py b/impyute/dataset/corrupt.py index 33d1b8b..0b75705 100644 --- a/impyute/dataset/corrupt.py +++ b/impyute/dataset/corrupt.py @@ -23,10 +23,10 @@ class Corruptor: Overwrite values with MNAR placed NaN's. """ - def __init__(self, data, thr=0.2): + def __init__(self, data, thr=0.2, dtype=np.float): self.dtype = data.dtype self.shape = np.shape(data) - self.data = data.astype(np.float) + self.data = data.astype(dtype) self.thr = thr def mcar(self): From c10a4feb6ab4bf5af4a9a33f8078c5b3af491823 Mon Sep 17 00:00:00 2001 From: "Zheng, Yuhang" Date: Wed, 31 Jul 2019 14:49:13 -0400 Subject: [PATCH 2/6] add randc function to randomly generate dataset with categorical data --- impyute/dataset/base.py | 57 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/impyute/dataset/base.py b/impyute/dataset/base.py index f492a8e..5f2fe9c 100644 --- a/impyute/dataset/base.py +++ b/impyute/dataset/base.py @@ -1,6 +1,11 @@ """ Shared functions to load/generate data """ import numpy as np +import string +import random +import math +import itertools from impyute.dataset.corrupt import Corruptor +from impyute.util import BadInputError def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"): """ Return randomly generated dataset of numbers with uniformly @@ -15,8 +20,8 @@ def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int") shape:tuple(optional) Size of the randomly generated data missingness: ('mcar', 'mar', 'mnar') - Type of missigness you want in your dataset - th: float between [0,1] + Type of missingness you want in your dataset + thr: float between [0,1] Percentage of missing data in generated data dtype: ('int','float') Type of data @@ -45,8 +50,8 @@ def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float" shape:tuple(optional) Size of the randomly generated data missingness: ('mcar', 'mar', 'mnar') - Type of missigness you want in your dataset - th: float between [0,1] + Type of missingness you want in your dataset + thr: float between [0,1] Percentage of missing data in generated data dtype: ('int','float') Type of data @@ -65,6 +70,48 @@ def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float" raw_data = getattr(corruptor, missingness)() return raw_data +def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2): + """ Return randomly generated dataset with uniformly distributed categorical data (alphabetic character) + + Parameters + ---------- + :param nlevels: int + Specify the number of different categories in the dataset + :param shape: tuple(optional) + Size of the randomly generated data + :param missingness: string in ('mcar', 'mar', 'mnar') + Type of missingness you want in your dataset + :param thr: float between [0,1] + Percentage of missing data in generated data + :return: + """ + if shape[0]*shape[1] < nlevels: + raise BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape") + + length = len(string.ascii_lowercase) + n_fold = math.floor(math.log(nlevels, length)) + cat_pool = list(string.ascii_lowercase) + + # when nlevel > 26, the alphabetical character is used up, need to generate extra strings as categorical data + if n_fold > 0: + for i in range(2, n_fold+2): + pool_candidate = list(itertools.product(string.ascii_lowercase, repeat=i)) + cat_pool.extend([''.join(w) for w in pool_candidate]) + if len(cat_pool) > nlevels: + break + + cat = random.sample(cat_pool, nlevels) + data = np.random.choice(cat, shape, replace=True) + + # make sure the data frame has nlevel different categories + while len(np.unique(data)) != nlevels: + data = np.random.choice(cat, shape, replace=True) + + corruptor = Corruptor(data, thr=thr, dtype=np.str) + raw_data = getattr(corruptor, missingness)() + return raw_data + + def test_data(mask=np.zeros((3, 3), dtype=bool)): """ Returns a dataset to use with tests (INTERNAL USE - FOR UNIT TESTING) @@ -98,4 +145,4 @@ def mnist(missingness="mcar", thr=0.2): dataset = fetch_mldata('MNIST original') corruptor = Corruptor(dataset.data, thr=thr) data = getattr(corruptor, missingness)() - return {"X": data, "Y": dataset.target} + return {"X": data, "Y": dataset.target} \ No newline at end of file From 8731d10ef4e079179358d1e539a16a203fb922f9 Mon Sep 17 00:00:00 2001 From: "Zheng, Yuhang" Date: Wed, 31 Jul 2019 14:49:41 -0400 Subject: [PATCH 3/6] add test cases for randc function --- test/dataset/test_randc.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 test/dataset/test_randc.py diff --git a/test/dataset/test_randc.py b/test/dataset/test_randc.py new file mode 100644 index 0000000..b65b1d4 --- /dev/null +++ b/test/dataset/test_randc.py @@ -0,0 +1,26 @@ +import numpy as np +import pytest +from impyute.dataset.base import randc +from impyute.util import BadInputError + +def test_raise_error_nlevel_exceed_shape(): + with pytest.raises(BadInputError) as e: + randc(shape=(2, 2)) + expected = "nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape" + assert str(e.value) == expected + +@pytest.mark.parametrize("nlevels, shape", [(5, (5,5)), (9, (3,4)), (100, (20, 20))]) +def test_nlevel_categories(nlevels, shape): + """ideally the returned matrix should have nlevel+1 different categories, +1 because the Corrupt class introduce np.nan + however, if the missing value introduced by Corrupt class happens to replace a group of categories, the unique + category number would be < nlevel + 1 + """ + dataframe = randc(nlevels, shape) + assert len(np.unique(dataframe)) <= nlevels + 1 + + +@pytest.mark.parametrize("nlevels, shape", [(5, (5,5)), (9, (3, 4)), (100, (20, 20))]) +def test_dataframe_shape(nlevels, shape): + """test if the returned data frame has desired shape""" + dataframe = randc(nlevels, shape) + assert dataframe.shape == shape From f1f9337afa52f7280b727b23f5a3444d9cc6e4d3 Mon Sep 17 00:00:00 2001 From: "Zheng, Yuhang" Date: Thu, 1 Aug 2019 10:59:54 -0400 Subject: [PATCH 4/6] explicitly cast math.floor() into int for python 2 compatibility --- impyute/dataset/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/impyute/dataset/base.py b/impyute/dataset/base.py index 5f2fe9c..8c887dc 100644 --- a/impyute/dataset/base.py +++ b/impyute/dataset/base.py @@ -89,7 +89,7 @@ def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2): raise BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape") length = len(string.ascii_lowercase) - n_fold = math.floor(math.log(nlevels, length)) + n_fold = int(math.floor(math.log(nlevels, length))) cat_pool = list(string.ascii_lowercase) # when nlevel > 26, the alphabetical character is used up, need to generate extra strings as categorical data From 4a0d6a9e0db1e8b39b81093b58f9c64e9f744667 Mon Sep 17 00:00:00 2001 From: "Zheng, Yuhang" Date: Thu, 1 Aug 2019 11:33:15 -0400 Subject: [PATCH 5/6] update the docstring --- impyute/dataset/base.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/impyute/dataset/base.py b/impyute/dataset/base.py index 8c887dc..5f21b4e 100644 --- a/impyute/dataset/base.py +++ b/impyute/dataset/base.py @@ -75,15 +75,18 @@ def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2): Parameters ---------- - :param nlevels: int + nlevels: int Specify the number of different categories in the dataset - :param shape: tuple(optional) + shape: tuple(optional) Size of the randomly generated data - :param missingness: string in ('mcar', 'mar', 'mnar') + missingness: string in ('mcar', 'mar', 'mnar') Type of missingness you want in your dataset - :param thr: float between [0,1] + thr: float between [0,1] Percentage of missing data in generated data - :return: + + Returns + ------- + numpy.ndarray """ if shape[0]*shape[1] < nlevels: raise BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape") From b1ac20cca5a628ebd4a8c992d638185f63939146 Mon Sep 17 00:00:00 2001 From: "Zheng, Yuhang" Date: Thu, 1 Aug 2019 11:33:32 -0400 Subject: [PATCH 6/6] update the docstring --- impyute/dataset/corrupt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/impyute/dataset/corrupt.py b/impyute/dataset/corrupt.py index 0b75705..846b27c 100644 --- a/impyute/dataset/corrupt.py +++ b/impyute/dataset/corrupt.py @@ -9,7 +9,7 @@ class Corruptor: ---------- data: np.ndarray Matrix of values with no NaN's that you want to add NaN's to. - th: float (optional) + thr: float (optional) The percentage of null values you want in your dataset, a number between 0 and 1.