Skip to content

Commit

Permalink
Merge pull request #86 from xyz8983/DDFG-add-randc-function
Browse files Browse the repository at this point in the history
Ddfg add randc function
  • Loading branch information
eltonlaw committed Aug 1, 2019
2 parents 45412a4 + b1ac20c commit 8569a8d
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 8 deletions.
60 changes: 55 additions & 5 deletions impyute/dataset/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
""" Shared functions to load/generate data """
import numpy as np
import string
import random
import math
import itertools
from impyute.dataset.corrupt import Corruptor
from impyute.util import BadInputError

def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"):
""" Return randomly generated dataset of numbers with uniformly
Expand All @@ -15,8 +20,8 @@ def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int")
shape:tuple(optional)
Size of the randomly generated data
missingness: ('mcar', 'mar', 'mnar')
Type of missigness you want in your dataset
th: float between [0,1]
Type of missingness you want in your dataset
thr: float between [0,1]
Percentage of missing data in generated data
dtype: ('int','float')
Type of data
Expand Down Expand Up @@ -45,8 +50,8 @@ def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"
shape:tuple(optional)
Size of the randomly generated data
missingness: ('mcar', 'mar', 'mnar')
Type of missigness you want in your dataset
th: float between [0,1]
Type of missingness you want in your dataset
thr: float between [0,1]
Percentage of missing data in generated data
dtype: ('int','float')
Type of data
Expand All @@ -65,6 +70,51 @@ def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"
raw_data = getattr(corruptor, missingness)()
return raw_data

def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2):
    """ Return randomly generated dataset with categorical data
    (lowercase alphabetic strings) drawn uniformly from `nlevels` categories

    Every one of the `nlevels` categories is guaranteed to appear at least
    once in the generated data before the missingness is applied.

    Parameters
    ----------
    nlevels: int
        Specify the number of different categories in the dataset
    shape: tuple(optional)
        Size of the randomly generated data
    missingness: string in ('mcar', 'mar', 'mnar')
        Type of missingness you want in your dataset
    thr: float between [0,1]
        Percentage of missing data in generated data

    Returns
    -------
    numpy.ndarray

    Raises
    ------
    ValueError
        If `nlevels` is smaller than 1.
    BadInputError
        If `nlevels` is larger than the number of cells, shape[0]*shape[1].
    """
    if nlevels < 1:
        raise ValueError("nlevels must be a positive integer")
    if shape[0]*shape[1] < nlevels:
        raise BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape")

    alphabet = string.ascii_lowercase
    cat_pool = list(alphabet)

    # Single letters only cover 26 levels; keep adding all words that are one
    # letter longer until the pool holds at least `nlevels` candidates.
    word_len = 1
    while len(cat_pool) < nlevels:
        word_len += 1
        cat_pool.extend(
            "".join(letters)
            for letters in itertools.product(alphabet, repeat=word_len)
        )

    cat = random.sample(cat_pool, nlevels)
    data = np.random.choice(cat, shape, replace=True)

    # Make sure the data has exactly `nlevels` different categories: write
    # each category into its own randomly chosen cell. This is a single pass,
    # whereas resampling the whole matrix until every level shows up almost
    # never finishes when nlevels is close to the number of cells.
    slots = np.random.choice(data.size, nlevels, replace=False)
    np.put(data, slots, cat)

    # The builtin `str` replaces the `np.str` alias, which was removed from NumPy.
    corruptor = Corruptor(data, thr=thr, dtype=str)
    raw_data = getattr(corruptor, missingness)()
    return raw_data



def mnist(missingness="mcar", thr=0.2):
""" Loads corrupted MNIST
Expand All @@ -84,4 +134,4 @@ def mnist(missingness="mcar", thr=0.2):
dataset = fetch_mldata('MNIST original')
corruptor = Corruptor(dataset.data, thr=thr)
data = getattr(corruptor, missingness)()
return {"X": data, "Y": dataset.target}
return {"X": data, "Y": dataset.target}
6 changes: 3 additions & 3 deletions impyute/dataset/corrupt.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class Corruptor:
----------
data: np.ndarray
Matrix of values with no NaN's that you want to add NaN's to.
th: float (optional)
thr: float (optional)
The percentage of null values you want in your dataset, a number
between 0 and 1.
Expand All @@ -23,10 +23,10 @@ class Corruptor:
Overwrite values with MNAR placed NaN's.
"""
def __init__(self, data, thr=0.2):
def __init__(self, data, thr=0.2, dtype=np.float):
self.dtype = data.dtype
self.shape = np.shape(data)
self.data = data.astype(np.float)
self.data = data.astype(dtype)
self.thr = thr

def mcar(self):
Expand Down
26 changes: 26 additions & 0 deletions test/dataset/test_randc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import numpy as np
import pytest
from impyute.dataset.base import randc
from impyute.util import BadInputError

def test_raise_error_nlevel_exceed_shape():
    """randc rejects a 2x2 shape, which has fewer cells than the default 5 levels"""
    message = "nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape"
    with pytest.raises(BadInputError) as excinfo:
        randc(shape=(2, 2))
    assert str(excinfo.value) == message

@pytest.mark.parametrize(
    "nlevels, shape",
    [(5, (5, 5)), (9, (3, 4)), (100, (20, 20))],
)
def test_nlevel_categories(nlevels, shape):
    """The corrupted matrix holds at most nlevels + 1 distinct values.

    The extra one is the missing-value marker introduced by the corruption
    step. The count can be lower than nlevels + 1 when the missing values
    happen to overwrite every occurrence of some category, hence the
    upper-bound check rather than an equality.
    """
    corrupted = randc(nlevels, shape)
    distinct_values = np.unique(corrupted)
    assert len(distinct_values) <= nlevels + 1


@pytest.mark.parametrize(
    "nlevels, shape",
    [(5, (5, 5)), (9, (3, 4)), (100, (20, 20))],
)
def test_dataframe_shape(nlevels, shape):
    """The matrix returned by randc has exactly the requested shape"""
    corrupted = randc(nlevels, shape)
    assert corrupted.shape == shape

0 comments on commit 8569a8d

Please sign in to comment.