diff --git a/sdv/data_processing/data_processor.py b/sdv/data_processing/data_processor.py index bdbb02209..9bda9644e 100644 --- a/sdv/data_processing/data_processor.py +++ b/sdv/data_processing/data_processor.py @@ -483,6 +483,15 @@ def _create_config(self, data, columns_created_by_constraints): ) sdtypes[column] = 'pii' + elif sdtype == 'unknown': + transformers[column] = AnonymizedFaker( + function_name='bothify', + ) + transformers[column].function_kwargs = { + 'text': 'sdv-pii-?????', + 'letters': '0123456789abcdefghijklmnopqrstuvwxyz' + } + elif pii: enforce_uniqueness = bool(column in self._keys) transformers[column] = self.create_anonymized_transformer( diff --git a/sdv/metadata/single_table.py b/sdv/metadata/single_table.py index d56864806..18b2b3ebb 100644 --- a/sdv/metadata/single_table.py +++ b/sdv/metadata/single_table.py @@ -32,6 +32,7 @@ class SingleTableMetadata: 'categorical': frozenset(['order', 'order_by']), 'boolean': frozenset([]), 'id': frozenset(['regex_format']), + 'unknown': frozenset([]), } _DTYPES_TO_SDTYPES = { diff --git a/tests/integration/single_table/test_copulas.py b/tests/integration/single_table/test_copulas.py index 6b361ea02..ba5db4cd1 100644 --- a/tests/integration/single_table/test_copulas.py +++ b/tests/integration/single_table/test_copulas.py @@ -394,3 +394,25 @@ def test_categorical_column_with_numbers(): unique_values = synthetic_data['category_col'].unique() assert np.isnan(unique_values).sum() == 1 assert set(unique_values[~np.isnan(unique_values)]) == {1, 2} + + +def test_unknown_sdtype(): + """Test the ``unknown`` sdtype handling end to end.""" + # Setup + data = pd.DataFrame({ + 'unknown': ['a', 'b', 'c'], + 'numerical_col': np.random.rand(3), + }) + + metadata = SingleTableMetadata() + metadata.detect_from_dataframe(data) + metadata.update_column('unknown', sdtype='unknown') + + synthesizer = GaussianCopulaSynthesizer(metadata) + + # Run + synthesizer.fit(data) + synthetic_data = synthesizer.sample(5) + + # Assert + assert synthetic_data['unknown'].str.startswith('sdv-pii-').all() diff --git a/tests/unit/data_processing/test_data_processor.py b/tests/unit/data_processing/test_data_processor.py index a2ae15b92..862c977a7 100644 --- a/tests/unit/data_processing/test_data_processor.py +++ b/tests/unit/data_processing/test_data_processor.py @@ -1056,7 +1056,9 @@ def test__create_config(self): 'id_no_regex': ['ID_001', 'ID_002', 'ID_003'], 'id_numeric': [0, 1, 2], 'id_column': ['ID_999', 'ID_999', 'ID_007'], - 'date': ['2021-02-01', '2022-03-05', '2023-01-31'] + 'date': ['2021-02-01', '2022-03-05', '2023-01-31'], + 'unknown': ['a', 'b', 'c'], + 'address': ['123 Main St', '456 Main St', '789 Main St'] }) dp = DataProcessor(SingleTableMetadata(), locales=locales) dp.metadata = Mock() @@ -1081,7 +1083,9 @@ def test__create_config(self): 'id_no_regex': {'sdtype': 'id'}, 'id_numeric': {'sdtype': 'id'}, 'id_column': {'sdtype': 'id'}, - 'date': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'} + 'date': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'}, + 'unknown': {'sdtype': 'unknown'}, + 'address': {'sdtype': 'address', 'pii': False}, } # Run @@ -1104,7 +1108,9 @@ def test__create_config(self): 'id_no_regex': 'text', 'id_numeric': 'text', 'id_column': 'pii', - 'date': 'datetime' + 'date': 'datetime', + 'unknown': 'pii', + 'address': 'categorical' } int_transformer = config['transformers']['created_int'] @@ -1124,13 +1130,15 @@ def test__create_config(self): assert isinstance(config['transformers']['int'], FloatFormatter) assert isinstance(config['transformers']['float'], FloatFormatter) + anonymized_transformer = config['transformers']['email'] - primary_regex_generator = config['transformers']['id'] assert anonymized_transformer == 'AnonymizedFaker' + primary_regex_generator = config['transformers']['id'] + assert primary_regex_generator == 'RegexGenerator' + first_name_transformer = config['transformers']['first_name'] assert first_name_transformer == 'AnonymizedFaker' - assert primary_regex_generator == 'RegexGenerator' datetime_transformer = config['transformers']['date'] assert isinstance(datetime_transformer, UnixTimestampEncoder) @@ -1166,6 +1174,18 @@ def test__create_config(self): False ) + expected_kwargs = { + 'text': 'sdv-pii-?????', + 'letters': '0123456789abcdefghijklmnopqrstuvwxyz' + } + unknown_transformer = config['transformers']['unknown'] + assert isinstance(unknown_transformer, AnonymizedFaker) + assert unknown_transformer.function_name == 'bothify' + assert unknown_transformer.function_kwargs == expected_kwargs + + address_column_transformer = config['transformers']['address'] + assert isinstance(address_column_transformer, UniformEncoder) + def test_update_transformers_not_fitted(self): """Test when ``self._hyper_transformer`` is ``None`` raises a ``NotFittedError``.""" # Setup