sdv-dev · fealho · Aug 22, 2024 · Aug 19, 2024 · Aug 19, 2024 · Aug 21, 2024
@@ -497,8 +497,14 @@ def _validate_all_tables_connected(self, parent_map, child_map):
                 f'The relationships in the dataset are disjointed. {table_msg}'
             )
 
-    def _detect_relationships(self):
-        """Automatically detect relationships between tables."""
+    def _detect_relationships(self, data=None):
+        """Automatically detect relationships between tables.
+
+        Args:
+            data (dict):
+                Dictionary of table names to dataframes.
+                NOTE: this is only used in SDV-Enterprise.
+        """
         for parent_candidate in self.tables.keys():
             primary_key = self.tables[parent_candidate].primary_key
             for child_candidate in self.tables.keys() - {parent_candidate}:
@@ -552,7 +558,7 @@ def detect_from_dataframes(self, data):
         for table_name, dataframe in data.items():
             self.detect_table_from_dataframe(table_name, dataframe)
 
-        self._detect_relationships()
+        self._detect_relationships(data)
 
     def detect_table_from_csv(self, table_name, filepath, read_csv_parameters=None):
         """Detect the metadata for a table from a csv file.
@@ -579,7 +585,9 @@ def detect_from_csvs(self, folder_name, read_csv_parameters=None):
         Args:
             folder_name (str):
                 Name of the folder to detect the metadata from.
-
+            read_csv_parameters (dict):
+                A python dictionary with string and value accepted by ``pandas.read_csv``
+                function. Defaults to ``None``.
         """
         folder_path = Path(folder_name)
 
@@ -591,11 +599,13 @@ def detect_from_csvs(self, folder_name, read_csv_parameters=None):
         if not csv_files:
             raise ValueError(f"No CSV files detected in the folder '{folder_name}'.")
 
+        data = {}
         for csv_file in csv_files:
             table_name = csv_file.stem
-            self.detect_table_from_csv(table_name, str(csv_file), read_csv_parameters)
+            data[table_name] = _load_data_from_csv(csv_file, read_csv_parameters)
+            self.detect_table_from_dataframe(table_name, data[table_name])
 
-        self._detect_relationships()
+        self._detect_relationships(data)
 
     def set_primary_key(self, table_name, column_name):
         """Set the primary key of a table.

@@ -2380,14 +2380,15 @@ def test_detect_table_from_csv_table_already_exists(self):
         with pytest.raises(InvalidMetadataError, match=error_message):
             metadata.detect_table_from_csv('table', 'path.csv')
 
-    def test_detect_from_csvs(self, tmp_path):
+    @patch('sdv.metadata.multi_table._load_data_from_csv')
+    def test_detect_from_csvs(self, load_data_mock, tmp_path):
         """Test the ``detect_from_csvs`` method.
 
         The method should call ``detect_table_from_csv`` for each csv in the folder.
         """
         # Setup
         instance = MultiTableMetadata()
-        instance.detect_table_from_csv = Mock()
+        instance.detect_table_from_dataframe = Mock()
         instance._detect_relationships = Mock()
 
         data1 = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
@@ -2398,6 +2399,14 @@ def test_detect_from_csvs(self, tmp_path):
         data1.to_csv(filepath1, index=False)
         data2.to_csv(filepath2, index=False)
 
+        def load_data_side_effect(filepath, _):
+            if filepath.name == 'table1.csv':
+                return data1
+            elif filepath.name == 'table2.csv':
+                return data2
+
+        load_data_mock.side_effect = load_data_side_effect
+
         json_filepath = tmp_path / 'not_csv.json'
         with open(json_filepath, 'w') as json_file:
             json_file.write('{"key": "value"}')
@@ -2406,15 +2415,24 @@ def test_detect_from_csvs(self, tmp_path):
         instance.detect_from_csvs(tmp_path)
 
         # Assert
-        expected_calls = [
-            call('table1', str(filepath1), None),
-            call('table2', str(filepath2), None),
+        expected_calls_load_data = [
+            call(filepath1, None),
+            call(filepath2, None),
         ]
+        load_data_mock.assert_has_calls(expected_calls_load_data, any_order=True)
 
-        instance.detect_table_from_csv.assert_has_calls(expected_calls, any_order=True)
-        assert instance.detect_table_from_csv.call_count == 2
+        expected_detect_calls = [
+            call('table1', data1),
+            call('table2', data2),
+        ]
+        instance.detect_table_from_dataframe.assert_has_calls(expected_detect_calls, any_order=True)
+        assert instance.detect_table_from_dataframe.call_count == 2
 
         instance._detect_relationships.assert_called_once()
+        table1 = instance._detect_relationships.call_args[0][0]['table1']
+        table2 = instance._detect_relationships.call_args[0][0]['table2']
+        pd.testing.assert_frame_equal(table1, data1)
+        pd.testing.assert_frame_equal(table2, data2)
 
     def test_detect_from_csvs_no_csv(self, tmp_path):
         """Test the ``detect_from_csvs`` method with no csv file in the folder."""
@@ -2426,13 +2444,11 @@ def test_detect_from_csvs_no_csv(self, tmp_path):
             json_file.write('{"key": "value"}')
 
         # Run and Assert
-        expected_message = re.escape("No CSV files detected in the folder '{}'.".format(tmp_path))
+        expected_message = re.escape(f"No CSV files detected in the folder '{tmp_path}'.")
         with pytest.raises(ValueError, match=expected_message):
             instance.detect_from_csvs(tmp_path)
 
-        expected_message_folder = re.escape(
-            "The folder '{}' does not exist.".format('not_a_folder')
-        )
+        expected_message_folder = re.escape(f"The folder '{'not_a_folder'}' does not exist.")
         with pytest.raises(ValueError, match=expected_message_folder):
             instance.detect_from_csvs('not_a_folder')
 
@@ -2516,14 +2532,15 @@ def test_detect_from_dataframes(self):
 
         guests_table = pd.DataFrame()
         hotels_table = pd.DataFrame()
+        data = {'guests': guests_table, 'hotels': hotels_table}
 
         # Run
-        metadata.detect_from_dataframes(data={'guests': guests_table, 'hotels': hotels_table})
+        metadata.detect_from_dataframes(data)
 
         # Assert
         metadata.detect_table_from_dataframe.assert_any_call('guests', guests_table)
         metadata.detect_table_from_dataframe.assert_any_call('hotels', hotels_table)
-        metadata._detect_relationships.assert_called_once()
+        metadata._detect_relationships.assert_called_once_with(data)
 
     def test_detect_from_dataframes_no_dataframes(self):
         """Test ``detect_from_dataframes`` with no dataframes in the input.