-
Notifications
You must be signed in to change notification settings - Fork 303
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Use IDGenerator for ID columns (#1538)
* Use IDGenerator for key columns without specified regex patterns * lint * temporarily point to RDT master * point to RDT release candidate * use UniformEncoder instead of LabelEncoder * comment * add newlines
- Loading branch information
Showing
6 changed files
with
84 additions
and
41 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,7 +6,8 @@ | |
import pytest | ||
from rdt.errors import ConfigNotSetError | ||
from rdt.errors import NotFittedError as RDTNotFittedError | ||
from rdt.transformers import FloatFormatter, LabelEncoder, UnixTimestampEncoder | ||
from rdt.transformers import ( | ||
AnonymizedFaker, FloatFormatter, IDGenerator, UniformEncoder, UnixTimestampEncoder) | ||
|
||
from sdv.constraints.errors import MissingConstraintColumnError | ||
from sdv.constraints.tabular import Positive, ScalarRange | ||
|
@@ -73,8 +74,8 @@ def test___init__( | |
|
||
mock_default_transformers.return_value = { | ||
'numerical': 'FloatFormatter()', | ||
'categorical': 'LabelEncoder(add_noise=True)', | ||
'boolean': 'LabelEncoder(add_noise=True)', | ||
'categorical': 'UniformEncoder()', | ||
'boolean': 'UniformEncoder()', | ||
'datetime': 'UnixTimestampEncoder()', | ||
'text': 'RegexGenerator()', | ||
'pii': 'AnonymizedFaker()', | ||
|
@@ -114,8 +115,8 @@ def test___init__( | |
|
||
expected_default_transformers = { | ||
'numerical': 'FloatFormatter()', | ||
'categorical': 'LabelEncoder(add_noise=True)', | ||
'boolean': 'LabelEncoder(add_noise=True)', | ||
'categorical': 'UniformEncoder()', | ||
'boolean': 'UniformEncoder()', | ||
'datetime': 'UnixTimestampEncoder()', | ||
'id': 'RegexGenerator()', | ||
'pii': 'AnonymizedFaker()', | ||
|
@@ -758,7 +759,7 @@ def test__update_transformers_by_sdtypes(self): | |
# Setup | ||
instance = Mock() | ||
instance._transformers_by_sdtype = { | ||
'categorical': 'labelencoder', | ||
'categorical': 'UniformEncoder', | ||
'numerical': 'float', | ||
'boolean': None | ||
} | ||
|
@@ -924,7 +925,7 @@ def test__get_transformer_instance_no_kwargs(self): | |
dp = Mock() | ||
dp._transformers_by_sdtype = { | ||
'numerical': 'FloatFormatter', | ||
'categorical': 'LabelEncoder' | ||
'categorical': 'UniformEncoder' | ||
} | ||
|
||
# Run | ||
|
@@ -968,7 +969,7 @@ def test__update_constraint_transformers(self, mock_rdt, mock_log): | |
'map_col#cat_col': ['z#a', 'x#b'], | ||
'low#high': [0.2, 0.5] | ||
}) | ||
dp._get_transformer_instance = Mock(return_value='LabelEncoder') | ||
dp._get_transformer_instance = Mock(return_value='UniformEncoder') | ||
mock_rdt.transformers.FloatFormatter.return_value = 'FloatFormatter' | ||
|
||
config = { | ||
|
@@ -977,8 +978,8 @@ def test__update_constraint_transformers(self, mock_rdt, mock_log): | |
'pii_col': 'FloatFormatter', | ||
'low': 'FloatFormatter', | ||
'high': 'FloatFormatter', | ||
'cat_col': 'LabelEncoder', | ||
'map_col': 'LabelEncoder', | ||
'cat_col': 'UniformEncoder', | ||
'map_col': 'UniformEncoder', | ||
}, | ||
'sdtypes': { | ||
'id_col': 'numerical', | ||
|
@@ -1001,8 +1002,8 @@ def test__update_constraint_transformers(self, mock_rdt, mock_log): | |
'pii_col': 'FloatFormatter', | ||
'low': 'FloatFormatter', | ||
'high': 'FloatFormatter', | ||
'map_col': 'LabelEncoder', | ||
'map_col#cat_col': 'LabelEncoder', | ||
'map_col': 'UniformEncoder', | ||
'map_col#cat_col': 'UniformEncoder', | ||
'low#high': 'FloatFormatter' | ||
}, | ||
'sdtypes': { | ||
|
@@ -1052,17 +1053,23 @@ def test__create_config(self): | |
'email': ['[email protected]', '[email protected]', '[email protected]'], | ||
'first_name': ['John', 'Doe', 'Johanna'], | ||
'id': ['ID_001', 'ID_002', 'ID_003'], | ||
'id_no_regex': ['ID_001', 'ID_002', 'ID_003'], | ||
'id_numeric': [0, 1, 2], | ||
'id_column': ['ID_999', 'ID_999', 'ID_007'], | ||
'date': ['2021-02-01', '2022-03-05', '2023-01-31'] | ||
}) | ||
dp = DataProcessor(SingleTableMetadata(), locales=locales) | ||
dp.metadata = Mock() | ||
dp.create_anonymized_transformer = Mock() | ||
dp.create_regex_generator = Mock() | ||
dp.create_id_generator = Mock() | ||
dp.create_anonymized_transformer.return_value = 'AnonymizedFaker' | ||
dp.create_regex_generator.return_value = 'RegexGenerator' | ||
dp.create_id_generator.return_value = 'IDGenerator' | ||
dp.metadata.primary_key = 'id' | ||
dp.metadata.alternate_keys = ['id_no_regex', 'id_numeric'] | ||
dp._primary_key = 'id' | ||
dp._keys = ['id'] | ||
dp._keys = ['id', 'id_no_regex', 'id_numeric'] | ||
dp.metadata.columns = { | ||
'int': {'sdtype': 'numerical'}, | ||
'float': {'sdtype': 'numerical'}, | ||
|
@@ -1071,6 +1078,9 @@ def test__create_config(self): | |
'email': {'sdtype': 'email', 'pii': True}, | ||
'first_name': {'sdtype': 'first_name'}, | ||
'id': {'sdtype': 'id', 'regex_format': 'ID_\\d{3}[0-9]'}, | ||
'id_no_regex': {'sdtype': 'id'}, | ||
'id_numeric': {'sdtype': 'id'}, | ||
'id_column': {'sdtype': 'id'}, | ||
'date': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'} | ||
} | ||
|
||
|
@@ -1091,6 +1101,9 @@ def test__create_config(self): | |
'email': 'pii', | ||
'first_name': 'pii', | ||
'id': 'text', | ||
'id_no_regex': 'text', | ||
'id_numeric': 'text', | ||
'id_column': 'pii', | ||
'date': 'datetime' | ||
} | ||
|
||
|
@@ -1104,10 +1117,10 @@ def test__create_config(self): | |
assert float_transformer.missing_value_replacement == 'mean' | ||
assert float_transformer.missing_value_generation == 'random' | ||
|
||
assert isinstance(config['transformers']['bool'], LabelEncoder) | ||
assert isinstance(config['transformers']['created_bool'], LabelEncoder) | ||
assert isinstance(config['transformers']['categorical'], LabelEncoder) | ||
assert isinstance(config['transformers']['created_categorical'], LabelEncoder) | ||
assert isinstance(config['transformers']['bool'], UniformEncoder) | ||
assert isinstance(config['transformers']['created_bool'], UniformEncoder) | ||
assert isinstance(config['transformers']['categorical'], UniformEncoder) | ||
assert isinstance(config['transformers']['created_categorical'], UniformEncoder) | ||
|
||
assert isinstance(config['transformers']['int'], FloatFormatter) | ||
assert isinstance(config['transformers']['float'], FloatFormatter) | ||
|
@@ -1126,6 +1139,21 @@ def test__create_config(self): | |
assert datetime_transformer.datetime_format == '%Y-%m-%d' | ||
assert dp._primary_key == 'id' | ||
|
||
id_no_regex_transformer = config['transformers']['id_no_regex'] | ||
assert isinstance(id_no_regex_transformer, IDGenerator) | ||
assert id_no_regex_transformer.prefix == 'sdv-id-' | ||
assert id_no_regex_transformer.starting_value == 0 | ||
|
||
id_numeric_transformer = config['transformers']['id_numeric'] | ||
assert isinstance(id_numeric_transformer, IDGenerator) | ||
assert id_numeric_transformer.prefix is None | ||
assert id_numeric_transformer.starting_value == 0 | ||
|
||
id_column_transformer = config['transformers']['id_column'] | ||
assert isinstance(id_column_transformer, AnonymizedFaker) | ||
assert id_column_transformer.function_name == 'bothify' | ||
assert id_column_transformer.function_kwargs == {'text': '#####'} | ||
|
||
dp.create_anonymized_transformer.calls == [ | ||
call('email', {'sdtype': 'email', 'pii': True, 'locales': locales}), | ||
call('first_name', {'sdtype': 'first_name', 'locales': locales}) | ||
|