diff --git a/tests/integration/data_processing/test_data_processor.py b/tests/integration/data_processing/test_data_processor.py index 65f865d2f..bb87029b4 100644 --- a/tests/integration/data_processing/test_data_processor.py +++ b/tests/integration/data_processing/test_data_processor.py @@ -53,7 +53,7 @@ def test_with_anonymized_columns(self): data, metadata = download_demo('single_table', 'adult') # Add anonymized field - metadata.update_column('adult', 'occupation', sdtype='job', pii=True) + metadata.update_column('occupation', 'adult', sdtype='job', pii=True) # Instance ``DataProcessor`` dp = DataProcessor(metadata._convert_to_single_table()) @@ -101,11 +101,11 @@ def test_with_anonymized_columns_and_primary_key(self): data, metadata = download_demo('single_table', 'adult') # Add anonymized field - metadata.update_column('adult', 'occupation', sdtype='job', pii=True) + metadata.update_column('occupation', 'adult', sdtype='job', pii=True) # Add primary key field - metadata.add_column('adult', 'id', sdtype='id', regex_format='ID_\\d{4}[0-9]') - metadata.set_primary_key('adult', 'id') + metadata.add_column('id', 'adult', sdtype='id', regex_format='ID_\\d{4}[0-9]') + metadata.set_primary_key('id', 'adult') # Add id size = len(data) @@ -159,8 +159,8 @@ def test_with_primary_key_numerical(self): adult_metadata = Metadata.detect_from_dataframes({'adult': data}) # Add primary key field - adult_metadata.add_column('adult', 'id', sdtype='id') - adult_metadata.set_primary_key('adult', 'id') + adult_metadata.add_column('id', 'adult', sdtype='id') + adult_metadata.set_primary_key('id', 'adult') # Add id size = len(data) @@ -198,13 +198,13 @@ def test_with_alternate_keys(self): adult_metadata = Metadata.detect_from_dataframes({'adult': data}) # Add primary key field - adult_metadata.add_column('adult', 'id', sdtype='id') - adult_metadata.set_primary_key('adult', 'id') + adult_metadata.add_column('id', 'adult', sdtype='id') + adult_metadata.set_primary_key('id', 'adult') - adult_metadata.add_column('adult', 'secondary_id', sdtype='id') - adult_metadata.update_column('adult', 'fnlwgt', sdtype='id', regex_format='ID_\\d{4}[0-9]') + adult_metadata.add_column('secondary_id', 'adult', sdtype='id') + adult_metadata.update_column('fnlwgt', 'adult', sdtype='id', regex_format='ID_\\d{4}[0-9]') - adult_metadata.add_alternate_keys('adult', ['secondary_id', 'fnlwgt']) + adult_metadata.add_alternate_keys(['secondary_id', 'fnlwgt'], 'adult') # Add id size = len(data) @@ -345,7 +345,7 @@ def test_localized_anonymized_columns(self): """Test data processor uses the default locale for anonymized columns.""" # Setup data, metadata = download_demo('single_table', 'adult') - metadata.update_column('adult', 'occupation', sdtype='job', pii=True) + metadata.update_column('occupation', 'adult', sdtype='job', pii=True) dp = DataProcessor(metadata._convert_to_single_table(), locales=['en_CA', 'fr_CA']) diff --git a/tests/integration/evaluation/test_single_table.py b/tests/integration/evaluation/test_single_table.py index 8b3cf23fa..d943b02d3 100644 --- a/tests/integration/evaluation/test_single_table.py +++ b/tests/integration/evaluation/test_single_table.py @@ -12,7 +12,7 @@ def test_evaluation(): data = pd.DataFrame({'col': [1, 2, 3]}) metadata = Metadata() metadata.add_table('table') - metadata.add_column('table', 'col', sdtype='numerical') + metadata.add_column('col', 'table', sdtype='numerical') synthesizer = GaussianCopulaSynthesizer(metadata, default_distribution='truncnorm') # Run and Assert diff --git a/tests/integration/metadata/test_metadata.py b/tests/integration/metadata/test_metadata.py index 5d460e1c9..fd5909b24 100644 --- a/tests/integration/metadata/test_metadata.py +++ b/tests/integration/metadata/test_metadata.py @@ -419,54 +419,46 @@ def test_any_metadata_update_single_table(method, args, kwargs): metadata.update_column( table_name='fake_hotel_guests', column_name='billing_address', sdtype='street_address' ) - metadata_kwargs = deepcopy(metadata) - metadata_args = deepcopy(metadata) - metadata_kwargs_with_table_name = deepcopy(metadata) - metadata_args_with_table_name = deepcopy(metadata) + parameter = [kwargs[arg] for arg in args] + remaining_kwargs = {key: value for key, value in kwargs.items() if key not in args} + metadata_before = deepcopy(metadata).to_dict() # Run - result = getattr(metadata_kwargs, method)(**kwargs) - getattr(metadata_kwargs_with_table_name, method)(table_name='fake_hotel_guests', **kwargs) - arg_values = [kwargs[arg] for arg in args] - extra_param = {key: value for key, value in kwargs.items() if key not in args} - getattr(metadata_args, method)(*arg_values, **extra_param) - getattr(metadata_args_with_table_name, method)('fake_hotel_guests', *arg_values, **extra_param) + result = getattr(metadata, method)(*parameter, **remaining_kwargs) # Assert - expected_dict = metadata_kwargs.to_dict() + expected_dict = metadata.to_dict() if method != 'get_column_names': - assert expected_dict != metadata.to_dict() + assert expected_dict != metadata_before else: assert result == ['checkin_date', 'checkout_date'] - other_metadata = [metadata_args, metadata_kwargs_with_table_name, metadata_args_with_table_name] - for metadata_obj in other_metadata: - assert expected_dict == metadata_obj.to_dict() - @pytest.mark.parametrize('method, args, kwargs', params) def test_any_metadata_update_multi_table(method, args, kwargs): """Test that any method that updates metadata works for multi-table case.""" # Setup - args.insert(0, 'table_name') - kwargs['table_name'] = 'guests' _, metadata = download_demo('multi_table', 'fake_hotels') metadata.update_column( table_name='guests', column_name='billing_address', sdtype='street_address' ) - metadata_kwargs = deepcopy(metadata) - metadata_args = deepcopy(metadata) + parameter = [kwargs[arg] for arg in args] + remaining_kwargs = {key: value for key, value in kwargs.items() if key not in args} + metadata_before = deepcopy(metadata).to_dict() + expected_error = re.escape( + 'Metadata contains more than one table, please specify the `table_name`.' + ) # Run - result = getattr(metadata_kwargs, method)(**kwargs) - arg_values = [kwargs[arg] for arg in args] - extra_param = {key: value for key, value in kwargs.items() if key not in args} - getattr(metadata_args, method)(*arg_values, **extra_param) + with pytest.raises(ValueError, match=expected_error): + getattr(metadata, method)(*parameter, **remaining_kwargs) + + parameter.append('guests') + result = getattr(metadata, method)(*parameter, **remaining_kwargs) # Assert - expected_dict = metadata_kwargs.to_dict() - assert expected_dict == metadata_args.to_dict() + expected_dict = metadata.to_dict() if method != 'get_column_names': - assert expected_dict != metadata.to_dict() + assert expected_dict != metadata_before else: assert result == ['checkin_date', 'checkout_date'] diff --git a/tests/integration/metadata/test_multi_table.py b/tests/integration/metadata/test_multi_table.py index 1384eaf74..23e16de42 100644 --- a/tests/integration/metadata/test_multi_table.py +++ b/tests/integration/metadata/test_multi_table.py @@ -34,11 +34,11 @@ def _validate_sdtypes(cls, columns_to_sdtypes): mock_rdt_transformers.address.RandomLocationGenerator = RandomLocationGeneratorMock _, instance = download_demo('multi_table', 'fake_hotels') - instance.update_column('hotels', 'city', sdtype='city') - instance.update_column('hotels', 'state', sdtype='state') + instance.update_column('city', 'hotels', sdtype='city') + instance.update_column('state', 'hotels', sdtype='state') # Run - instance.add_column_relationship('hotels', 'address', ['city', 'state']) + instance.add_column_relationship('address', ['city', 'state'], 'hotels') # Assert instance.validate() @@ -303,9 +303,9 @@ def test_get_table_metadata(): """Test the ``get_table_metadata`` method.""" # Setup metadata = get_multi_table_metadata() - metadata.add_column('nesreca', 'latitude', sdtype='latitude') - metadata.add_column('nesreca', 'longitude', sdtype='longitude') - metadata.add_column_relationship('nesreca', 'gps', ['latitude', 'longitude']) + metadata.add_column('latitude', 'nesreca', sdtype='latitude') + metadata.add_column('longitude', 'nesreca', sdtype='longitude') + metadata.add_column_relationship('gps', ['latitude', 'longitude'], 'nesreca') # Run table_metadata = metadata.get_table_metadata('nesreca') diff --git a/tests/integration/metadata/test_visualization.py b/tests/integration/metadata/test_visualization.py index 2d801ea5d..23a7aee17 100644 --- a/tests/integration/metadata/test_visualization.py +++ b/tests/integration/metadata/test_visualization.py @@ -26,9 +26,9 @@ def test_visualize_graph_for_multi_table(): data2 = pd.DataFrame({'\\|=/bla@#$324%^,"&*()><...': ['a', 'b', 'c']}) tables = {'1': data1, '2': data2} metadata = Metadata.detect_from_dataframes(tables) - metadata.update_column('1', '\\|=/bla@#$324%^,"&*()><...', sdtype='id') - metadata.update_column('2', '\\|=/bla@#$324%^,"&*()><...', sdtype='id') - metadata.set_primary_key('1', '\\|=/bla@#$324%^,"&*()><...') + metadata.update_column('\\|=/bla@#$324%^,"&*()><...', '1', sdtype='id') + metadata.update_column('\\|=/bla@#$324%^,"&*()><...', '2', sdtype='id') + metadata.set_primary_key('\\|=/bla@#$324%^,"&*()><...', '1') metadata.add_relationship( '1', '2', '\\|=/bla@#$324%^,"&*()><...', '\\|=/bla@#$324%^,"&*()><...' ) diff --git a/tests/integration/multi_table/test_hma.py b/tests/integration/multi_table/test_hma.py index e4ee8e083..e7f2cb2c2 100644 --- a/tests/integration/multi_table/test_hma.py +++ b/tests/integration/multi_table/test_hma.py @@ -87,8 +87,8 @@ def test_hma_reset_sampling(self): faker = Faker() data, metadata = download_demo('multi_table', 'got_families') metadata.add_column( - 'characters', 'ssn', + 'characters', sdtype='ssn', ) data['characters']['ssn'] = [faker.lexify() for _ in range(len(data['characters']))] @@ -126,7 +126,7 @@ def test_get_info(self): today = datetime.datetime.today().strftime('%Y-%m-%d') metadata = Metadata() metadata.add_table('tab') - metadata.add_column('tab', 'col', sdtype='numerical') + metadata.add_column('col', 'tab', sdtype='numerical') synthesizer = HMASynthesizer(metadata) # Run @@ -221,12 +221,12 @@ def get_custom_constraint_data_and_metadata(self): metadata = Metadata() metadata.detect_table_from_dataframe('parent', parent_data) - metadata.update_column('parent', 'primary_key', sdtype='id') + metadata.update_column('primary_key', 'parent', sdtype='id') metadata.detect_table_from_dataframe('child', child_data) - metadata.update_column('child', 'user_id', sdtype='id') - metadata.update_column('child', 'id', sdtype='id') - metadata.set_primary_key('parent', 'primary_key') - metadata.set_primary_key('child', 'id') + metadata.update_column('user_id', 'child', sdtype='id') + metadata.update_column('id', 'child', sdtype='id') + metadata.set_primary_key('primary_key', 'parent') + metadata.set_primary_key('id', 'child') metadata.add_relationship( parent_primary_key='primary_key', parent_table_name='parent', @@ -361,10 +361,10 @@ def test_hma_with_inequality_constraint(self): metadata = Metadata() metadata.detect_table_from_dataframe(table_name='parent_table', data=parent_table) - metadata.update_column('parent_table', 'id', sdtype='id') + metadata.update_column('id', 'parent_table', sdtype='id') metadata.detect_table_from_dataframe(table_name='child_table', data=child_table) - metadata.update_column('child_table', 'id', sdtype='id') - metadata.update_column('child_table', 'parent_id', sdtype='id') + metadata.update_column('id', 'child_table', sdtype='id') + metadata.update_column('parent_id', 'child_table', sdtype='id') metadata.set_primary_key(table_name='parent_table', column_name='id') metadata.set_primary_key(table_name='child_table', column_name='id') @@ -452,14 +452,14 @@ def test_hma_primary_key_and_foreign_key_only(self): for table_name, table in data.items(): metadata.detect_table_from_dataframe(table_name, table) - metadata.update_column('users', 'user_id', sdtype='id') - metadata.update_column('sessions', 'session_id', sdtype='id') - metadata.update_column('games', 'game_id', sdtype='id') - metadata.update_column('games', 'session_id', sdtype='id') - metadata.update_column('games', 'user_id', sdtype='id') - metadata.set_primary_key('users', 'user_id') - metadata.set_primary_key('sessions', 'session_id') - metadata.set_primary_key('games', 'game_id') + metadata.update_column('user_id', 'users', sdtype='id') + metadata.update_column('session_id', 'sessions', sdtype='id') + metadata.update_column('game_id', 'games', sdtype='id') + metadata.update_column('session_id', 'games', sdtype='id') + metadata.update_column('user_id', 'games', sdtype='id') + metadata.set_primary_key('user_id', 'users') + metadata.set_primary_key('session_id', 'sessions') + metadata.set_primary_key('game_id', 'games') metadata.add_relationship('users', 'games', 'user_id', 'user_id') metadata.add_relationship('sessions', 'games', 'session_id', 'session_id') @@ -1351,25 +1351,25 @@ def test_null_foreign_keys(self): metadata = Metadata() metadata.add_table('parent_table1') - metadata.add_column('parent_table1', 'id', sdtype='id') - metadata.set_primary_key('parent_table1', 'id') + metadata.add_column('id', 'parent_table1', sdtype='id') + metadata.set_primary_key('id', 'parent_table1') metadata.add_table('parent_table2') - metadata.add_column('parent_table2', 'id', sdtype='id') - metadata.set_primary_key('parent_table2', 'id') + metadata.add_column('id', 'parent_table2', sdtype='id') + metadata.set_primary_key('id', 'parent_table2') metadata.add_table('child_table1') - metadata.add_column('child_table1', 'id', sdtype='id') - metadata.set_primary_key('child_table1', 'id') - metadata.add_column('child_table1', 'fk1', sdtype='id') - metadata.add_column('child_table1', 'fk2', sdtype='id') + metadata.add_column('id', 'child_table1', sdtype='id') + metadata.set_primary_key('id', 'child_table1') + metadata.add_column('fk1', 'child_table1', sdtype='id') + metadata.add_column('fk2', 'child_table1', sdtype='id') metadata.add_table('child_table2') - metadata.add_column('child_table2', 'id', sdtype='id') - metadata.set_primary_key('child_table2', 'id') - metadata.add_column('child_table2', 'fk1', sdtype='id') - metadata.add_column('child_table2', 'fk2', sdtype='id') - metadata.add_column('child_table2', 'cat_type', sdtype='categorical') + metadata.add_column('id', 'child_table2', sdtype='id') + metadata.set_primary_key('id', 'child_table2') + metadata.add_column('fk1', 'child_table2', sdtype='id') + metadata.add_column('fk2', 'child_table2', sdtype='id') + metadata.add_column('cat_type', 'child_table2', sdtype='categorical') metadata.add_relationship( parent_table_name='parent_table1', @@ -1842,7 +1842,7 @@ def test_fit_and_sample_numerical_col_names(): } ] metadata = Metadata.load_from_dict(metadata_dict) - metadata.set_primary_key('0', '1') + metadata.set_primary_key('1', '0') # Run synth = HMASynthesizer(metadata) @@ -1875,11 +1875,11 @@ def test_detect_from_dataframe_numerical_col(): metadata = Metadata() metadata.detect_table_from_dataframe('parent_data', parent_data) metadata.detect_table_from_dataframe('child_data', child_data) - metadata.update_column('parent_data', '1', sdtype='id') - metadata.update_column('child_data', '3', sdtype='id') - metadata.update_column('child_data', '4', sdtype='id') - metadata.set_primary_key('parent_data', '1') - metadata.set_primary_key('child_data', '4') + metadata.update_column('1', 'parent_data', sdtype='id') + metadata.update_column('3', 'child_data', sdtype='id') + metadata.update_column('4', 'child_data', sdtype='id') + metadata.set_primary_key('1', 'parent_data') + metadata.set_primary_key('4', 'child_data') metadata.add_relationship( parent_primary_key='1', parent_table_name='parent_data', @@ -1888,11 +1888,11 @@ def test_detect_from_dataframe_numerical_col(): ) test_metadata = Metadata.detect_from_dataframes(data) - test_metadata.update_column('parent_data', '1', sdtype='id') - test_metadata.update_column('child_data', '3', sdtype='id') - test_metadata.update_column('child_data', '4', sdtype='id') - test_metadata.set_primary_key('parent_data', '1') - test_metadata.set_primary_key('child_data', '4') + test_metadata.update_column('1', 'parent_data', sdtype='id') + test_metadata.update_column('3', 'child_data', sdtype='id') + test_metadata.update_column('4', 'child_data', sdtype='id') + test_metadata.set_primary_key('1', 'parent_data') + test_metadata.set_primary_key('4', 'child_data') test_metadata.add_relationship( parent_primary_key='1', parent_table_name='parent_data', @@ -2005,13 +2005,13 @@ def test_hma_synthesizer_with_fixed_combinations(): # Creating metadata for the dataset metadata = Metadata.detect_from_dataframes(data) - metadata.update_column('users', 'user_id', sdtype='id') - metadata.update_column('records', 'record_id', sdtype='id') - metadata.update_column('records', 'user_id', sdtype='id') - metadata.update_column('records', 'location_id', sdtype='id') - metadata.update_column('locations', 'location_id', sdtype='id') - metadata.set_primary_key('users', 'user_id') - metadata.set_primary_key('locations', 'location_id') + metadata.update_column('user_id', 'users', sdtype='id') + metadata.update_column('record_id', 'records', sdtype='id') + metadata.update_column('user_id', 'records', sdtype='id') + metadata.update_column('location_id', 'records', sdtype='id') + metadata.update_column('location_id', 'locations', sdtype='id') + metadata.set_primary_key('user_id', 'users') + metadata.set_primary_key('location_id', 'locations') metadata.add_relationship('users', 'records', 'user_id', 'user_id') metadata.add_relationship('locations', 'records', 'location_id', 'location_id') @@ -2053,8 +2053,8 @@ def test_fit_int_primary_key_regex_includes_zero(regex): 'child_data': child_data, } metadata = Metadata.detect_from_dataframes(data) - metadata.update_column('parent_data', 'parent_id', sdtype='id', regex_format=regex) - metadata.set_primary_key('parent_data', 'parent_id') + metadata.update_column('parent_id', 'parent_data', sdtype='id', regex_format=regex) + metadata.set_primary_key('parent_id', 'parent_data') # Run and Assert instance = HMASynthesizer(metadata) diff --git a/tests/integration/sequential/test_par.py b/tests/integration/sequential/test_par.py index 6ae0af54e..620a95cf2 100644 --- a/tests/integration/sequential/test_par.py +++ b/tests/integration/sequential/test_par.py @@ -22,9 +22,11 @@ def _get_par_data_and_metadata(): 'context': ['a', 'a', 'b', 'b'], }) metadata = Metadata.detect_from_dataframes({'table': data}) - metadata.update_column('table', 'entity', sdtype='id') - metadata.set_sequence_key('table', 'entity') - metadata.set_sequence_index('table', 'date') + metadata.update_column('entity', 'table', sdtype='id') + metadata.set_sequence_key('entity', 'table') + + metadata.set_sequence_index('date', 'table') + return data, metadata @@ -34,9 +36,11 @@ def test_par(): data = load_demo() data['date'] = pd.to_datetime(data['date']) metadata = Metadata.detect_from_dataframes({'table': data}) - metadata.update_column('table', 'store_id', sdtype='id') - metadata.set_sequence_key('table', 'store_id') - metadata.set_sequence_index('table', 'date') + metadata.update_column('store_id', 'table', sdtype='id') + metadata.set_sequence_key('store_id', 'table') + + metadata.set_sequence_index('date', 'table') + model = PARSynthesizer( metadata=metadata, context_columns=['region'], @@ -67,9 +71,10 @@ def test_column_after_date_simple(): 'col2': ['hello', 'world'], }) metadata = Metadata.detect_from_dataframes({'table': data}) - metadata.update_column('table', 'col', sdtype='id') - metadata.set_sequence_key('table', 'col') - metadata.set_sequence_index('table', 'date') + metadata.update_column('col', 'table', sdtype='id') + metadata.set_sequence_key('col', 'table') + + metadata.set_sequence_index('date', 'table') # Run model = PARSynthesizer(metadata=metadata, epochs=1) @@ -347,8 +352,10 @@ def test_par_unique_sequence_index_with_enforce_min_max(): ) metadata = Metadata.detect_from_dataframes({'table': test_df}) metadata.update_column(table_name='table', column_name='s_key', sdtype='id') - metadata.set_sequence_key('table', 's_key') - metadata.set_sequence_index('table', 'visits') + metadata.set_sequence_key('s_key', 'table') + + metadata.set_sequence_index('visits', 'table') + synthesizer = PARSynthesizer( metadata, enforce_min_max_values=True, enforce_rounding=False, epochs=100, verbose=True ) @@ -441,7 +448,7 @@ def test_par_categorical_column_represented_by_floats(): # Setup data, metadata = download_demo('sequential', 'nasdaq100_2019') data['category'] = [100.0 if i % 2 == 0 else 50.0 for i in data.index] - metadata.add_column('nasdaq100_2019', 'category', sdtype='categorical') + metadata.add_column('category', 'nasdaq100_2019', sdtype='categorical') # Run synth = PARSynthesizer(metadata) diff --git a/tests/integration/single_table/test_base.py b/tests/integration/single_table/test_base.py index 7ab9bb27c..88265ec1f 100644 --- a/tests/integration/single_table/test_base.py +++ b/tests/integration/single_table/test_base.py @@ -93,9 +93,9 @@ def test_sample_from_conditions_with_batch_size(): metadata = Metadata() metadata.add_table('table') - metadata.add_column('table', 'column1', sdtype='numerical') - metadata.add_column('table', 'column2', sdtype='numerical') - metadata.add_column('table', 'column3', sdtype='numerical') + metadata.add_column('column1', 'table', sdtype='numerical') + metadata.add_column('column2', 'table', sdtype='numerical') + metadata.add_column('column3', 'table', sdtype='numerical') model = GaussianCopulaSynthesizer(metadata) model.fit(data) @@ -120,9 +120,9 @@ def test_sample_from_conditions_negative_float(): metadata = Metadata() metadata.add_table('table') - metadata.add_column('table', 'column1', sdtype='numerical') - metadata.add_column('table', 'column2', sdtype='numerical') - metadata.add_column('table', 'column3', sdtype='numerical') + metadata.add_column('column1', 'table', sdtype='numerical') + metadata.add_column('column2', 'table', sdtype='numerical') + metadata.add_column('column3', 'table', sdtype='numerical') model = GaussianCopulaSynthesizer(metadata) model.fit(data) @@ -203,7 +203,7 @@ def test_sample_keys_are_scrambled(): """Test that the keys are scrambled in the sampled data.""" # Setup data, metadata = download_demo(modality='single_table', dataset_name='fake_hotel_guests') - metadata.update_column('fake_hotel_guests', 'guest_email', sdtype='id', regex_format='[A-Z]{3}') + metadata.update_column('guest_email', 'fake_hotel_guests', sdtype='id', regex_format='[A-Z]{3}') synthesizer = GaussianCopulaSynthesizer(metadata) synthesizer.fit(data) @@ -234,9 +234,9 @@ def test_multiple_fits(): }) metadata = Metadata() metadata.add_table('table') - metadata.add_column('table', 'city', sdtype='categorical') - metadata.add_column('table', 'state', sdtype='categorical') - metadata.add_column('table', 'measurement', sdtype='numerical') + metadata.add_column('city', 'table', sdtype='categorical') + metadata.add_column('state', 'table', sdtype='categorical') + metadata.add_column('measurement', 'table', sdtype='numerical') constraint = { 'constraint_class': 'FixedCombinations', 'constraint_parameters': {'column_names': ['city', 'state']}, @@ -338,7 +338,7 @@ def test_transformers_correctly_auto_assigned(): metadata.update_column( table_name='table', column_name='primary_key', sdtype='id', regex_format='user-[0-9]{3}' ) - metadata.set_primary_key('table', 'primary_key') + metadata.set_primary_key('primary_key', 'table') metadata.update_column(table_name='table', column_name='pii_col', sdtype='address', pii=True) synthesizer = GaussianCopulaSynthesizer( metadata, enforce_min_max_values=False, enforce_rounding=False @@ -428,7 +428,7 @@ def test_auto_assign_transformers_and_update_with_pii(): # Run metadata.update_column(table_name='table', column_name='id', sdtype='first_name') metadata.update_column(table_name='table', column_name='name', sdtype='name') - metadata.set_primary_key('table', 'id') + metadata.set_primary_key('id', 'table') synthesizer = GaussianCopulaSynthesizer(metadata) synthesizer.auto_assign_transformers(data) @@ -457,8 +457,8 @@ def test_refitting_a_model(): metadata = Metadata.detect_from_dataframes({'table': data}) metadata.update_column(table_name='table', column_name='name', sdtype='name') - metadata.update_column('table', 'id', sdtype='id') - metadata.set_primary_key('table', 'id') + metadata.update_column('id', 'table', sdtype='id') + metadata.set_primary_key('id', 'table') synthesizer = GaussianCopulaSynthesizer(metadata) synthesizer.fit(data) @@ -483,7 +483,7 @@ def test_get_info(): today = datetime.datetime.today().strftime('%Y-%m-%d') metadata = Metadata() metadata.add_table('table') - metadata.add_column('table', 'col', sdtype='numerical') + metadata.add_column('col', 'table', sdtype='numerical') synthesizer = GaussianCopulaSynthesizer(metadata) # Run @@ -628,7 +628,7 @@ def test_metadata_updated_no_warning(mock__fit, tmp_path): # Run 3 instance = BaseSingleTableSynthesizer(metadata_detect) - metadata_detect.update_column('mock_table', 'col 1', sdtype='categorical') + metadata_detect.update_column('col 1', 'mock_table', sdtype='categorical') file_name = tmp_path / 'singletable_2.json' metadata_detect.save_to_json(file_name) with warnings.catch_warnings(record=True) as captured_warnings: diff --git a/tests/integration/single_table/test_constraints.py b/tests/integration/single_table/test_constraints.py index 3477d4557..6e7a8c9e0 100644 --- a/tests/integration/single_table/test_constraints.py +++ b/tests/integration/single_table/test_constraints.py @@ -74,9 +74,9 @@ def test_fit_with_unique_constraint_on_data_with_only_index_column(): metadata = Metadata() metadata.add_table('table') - metadata.add_column('table', 'key', sdtype='id') - metadata.add_column('table', 'index', sdtype='categorical') - metadata.set_primary_key('table', 'key') + metadata.add_column('key', 'table', sdtype='id') + metadata.add_column('index', 'table', sdtype='categorical') + metadata.set_primary_key('key', 'table') model = GaussianCopulaSynthesizer(metadata) constraint = { @@ -139,10 +139,10 @@ def test_fit_with_unique_constraint_on_data_which_has_index_column(): metadata = Metadata() metadata.add_table('table') - metadata.add_column('table', 'key', sdtype='id') - metadata.add_column('table', 'index', sdtype='categorical') - metadata.add_column('table', 'test_column', sdtype='categorical') - metadata.set_primary_key('table', 'key') + metadata.add_column('key', 'table', sdtype='id') + metadata.add_column('index', 'table', sdtype='categorical') + metadata.add_column('test_column', 'table', sdtype='categorical') + metadata.set_primary_key('key', 'table') model = GaussianCopulaSynthesizer(metadata) constraint = { @@ -198,9 +198,9 @@ def test_fit_with_unique_constraint_on_data_subset(): metadata = Metadata() metadata.add_table('table') - metadata.add_column('table', 'key', sdtype='id') - metadata.add_column('table', 'test_column', sdtype='categorical') - metadata.set_primary_key('table', 'key') + metadata.add_column('key', 'table', sdtype='id') + metadata.add_column('test_column', 'table', sdtype='categorical') + metadata.set_primary_key('key', 'table') test_df = test_df.iloc[[1, 3, 4]] constraint = { @@ -295,9 +295,9 @@ def test_conditional_sampling_constraint_uses_reject_sampling(gm_mock, isinstanc metadata = Metadata() metadata.add_table('table') - metadata.add_column('table', 'city', sdtype='categorical') - metadata.add_column('table', 'state', sdtype='categorical') - metadata.add_column('table', 'age', sdtype='numerical') + metadata.add_column('city', 'table', sdtype='categorical') + metadata.add_column('state', 'table', sdtype='categorical') + metadata.add_column('age', 'table', sdtype='numerical') model = GaussianCopulaSynthesizer(metadata) @@ -815,8 +815,8 @@ def reverse_transform(column_names, data): 'other': [7, 8, 9], }) metadata = Metadata.detect_from_dataframes({'table': data}) - metadata.update_column('table', 'key', sdtype='id', regex_format=r'\w_\d') - metadata.set_primary_key('table', 'key') + metadata.update_column('key', 'table', sdtype='id', regex_format=r'\w_\d') + metadata.set_primary_key('key', 'table') synth = GaussianCopulaSynthesizer(metadata) synth.add_custom_constraint_class(custom_constraint, 'custom') @@ -845,8 +845,8 @@ def test_timezone_aware_constraints(): metadata = Metadata() metadata.add_table('table') - metadata.add_column('table', 'col1', sdtype='datetime') - metadata.add_column('table', 'col2', sdtype='datetime') + metadata.add_column('col1', 'table', sdtype='datetime') + metadata.add_column('col2', 'table', sdtype='datetime') my_constraint = { 'constraint_class': 'Inequality', diff --git a/tests/integration/single_table/test_ctgan.py b/tests/integration/single_table/test_ctgan.py index 9f892f878..0f87c5fe7 100644 --- a/tests/integration/single_table/test_ctgan.py +++ b/tests/integration/single_table/test_ctgan.py @@ -17,12 +17,12 @@ def test__estimate_num_columns(): # Setup metadata = Metadata() metadata.add_table('table') - metadata.add_column('table', 'numerical', sdtype='numerical') - metadata.add_column('table', 'categorical', sdtype='categorical') - metadata.add_column('table', 'categorical2', sdtype='categorical') - metadata.add_column('table', 'categorical3', sdtype='categorical') - metadata.add_column('table', 'datetime', sdtype='datetime') - metadata.add_column('table', 'boolean', sdtype='boolean') + metadata.add_column('numerical', 'table', sdtype='numerical') + metadata.add_column('categorical', 'table', sdtype='categorical') + metadata.add_column('categorical2', 'table', sdtype='categorical') + metadata.add_column('categorical3', 'table', sdtype='categorical') + metadata.add_column('datetime', 'table', sdtype='datetime') + metadata.add_column('boolean', 'table', sdtype='boolean') data = pd.DataFrame({ 'numerical': [0.1, 0.2, 0.3], 'datetime': ['2020-01-01', '2020-01-02', '2020-01-03'],