Skip to content

Commit

Permalink
Support 'unknown' sdtype (#1532)
Browse files Browse the repository at this point in the history
* def support unknown

* add test

* not pii 2
  • Loading branch information
R-Palazzo authored and amontanez24 committed Sep 27, 2023
1 parent 53bb4a3 commit 7ef67b1
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 5 deletions.
9 changes: 9 additions & 0 deletions sdv/data_processing/data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,15 @@ def _create_config(self, data, columns_created_by_constraints):
)
sdtypes[column] = 'pii'

elif sdtype == 'unknown':
transformers[column] = AnonymizedFaker(
function_name='bothify',
)
transformers[column].function_kwargs = {
'text': 'sdv-pii-?????',
'letters': '0123456789abcdefghijklmnopqrstuvwxyz'
}

elif pii:
enforce_uniqueness = bool(column in self._keys)
transformers[column] = self.create_anonymized_transformer(
Expand Down
1 change: 1 addition & 0 deletions sdv/metadata/single_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class SingleTableMetadata:
'categorical': frozenset(['order', 'order_by']),
'boolean': frozenset([]),
'id': frozenset(['regex_format']),
'unknown': frozenset([]),
}

_DTYPES_TO_SDTYPES = {
Expand Down
22 changes: 22 additions & 0 deletions tests/integration/single_table/test_copulas.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,3 +401,25 @@ def test_categorical_column_with_numbers():
unique_values = synthetic_data['category_col'].unique()
assert np.isnan(unique_values).sum() == 1
assert set(unique_values[~np.isnan(unique_values)]) == {1, 2}


def test_unknown_sdtype():
    """Fit and sample a synthesizer on data containing an ``unknown`` sdtype column.

    The column is first auto-detected and then explicitly updated to the ``unknown``
    sdtype. After fitting and sampling, every sampled value in that column is
    expected to start with the ``sdv-pii-`` prefix.
    """
    # Setup
    column_values = {
        'unknown': ['a', 'b', 'c'],
        'numerical_col': np.random.rand(3),
    }
    data = pd.DataFrame(column_values)

    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data)
    # Override whatever sdtype detection picked for this column.
    metadata.update_column('unknown', sdtype='unknown')

    synthesizer = GaussianCopulaSynthesizer(metadata)

    # Run
    synthesizer.fit(data)
    sampled = synthesizer.sample(5)

    # Assert
    has_prefix = sampled['unknown'].str.startswith('sdv-pii-')
    assert has_prefix.all()
30 changes: 25 additions & 5 deletions tests/unit/data_processing/test_data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1056,7 +1056,9 @@ def test__create_config(self):
'id_no_regex': ['ID_001', 'ID_002', 'ID_003'],
'id_numeric': [0, 1, 2],
'id_column': ['ID_999', 'ID_999', 'ID_007'],
'date': ['2021-02-01', '2022-03-05', '2023-01-31']
'date': ['2021-02-01', '2022-03-05', '2023-01-31'],
'unknown': ['a', 'b', 'c'],
'address': ['123 Main St', '456 Main St', '789 Main St']
})
dp = DataProcessor(SingleTableMetadata(), locales=locales)
dp.metadata = Mock()
Expand All @@ -1081,7 +1083,9 @@ def test__create_config(self):
'id_no_regex': {'sdtype': 'id'},
'id_numeric': {'sdtype': 'id'},
'id_column': {'sdtype': 'id'},
'date': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'}
'date': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
'unknown': {'sdtype': 'unknown'},
'address': {'sdtype': 'address', 'pii': False},
}

# Run
Expand All @@ -1104,7 +1108,9 @@ def test__create_config(self):
'id_no_regex': 'text',
'id_numeric': 'text',
'id_column': 'pii',
'date': 'datetime'
'date': 'datetime',
'unknown': 'pii',
'address': 'categorical'
}

int_transformer = config['transformers']['created_int']
Expand All @@ -1124,13 +1130,15 @@ def test__create_config(self):

assert isinstance(config['transformers']['int'], FloatFormatter)
assert isinstance(config['transformers']['float'], FloatFormatter)

anonymized_transformer = config['transformers']['email']
primary_regex_generator = config['transformers']['id']
assert anonymized_transformer == 'AnonymizedFaker'

primary_regex_generator = config['transformers']['id']
assert primary_regex_generator == 'RegexGenerator'

first_name_transformer = config['transformers']['first_name']
assert first_name_transformer == 'AnonymizedFaker'
assert primary_regex_generator == 'RegexGenerator'

datetime_transformer = config['transformers']['date']
assert isinstance(datetime_transformer, UnixTimestampEncoder)
Expand Down Expand Up @@ -1166,6 +1174,18 @@ def test__create_config(self):
False
)

expected_kwargs = {
'text': 'sdv-pii-?????',
'letters': '0123456789abcdefghijklmnopqrstuvwxyz'
}
unknown_transformer = config['transformers']['unknown']
assert isinstance(unknown_transformer, AnonymizedFaker)
assert unknown_transformer.function_name == 'bothify'
assert unknown_transformer.function_kwargs == expected_kwargs

address_column_transformer = config['transformers']['address']
assert isinstance(address_column_transformer, UniformEncoder)

def test_update_transformers_not_fitted(self):
"""Test when ``self._hyper_transformer`` is ``None`` raises a ``NotFittedError``."""
# Setup
Expand Down

0 comments on commit 7ef67b1

Please sign in to comment.