Skip to content

Commit

Permalink
Split gdrive utils
Browse files Browse the repository at this point in the history
  • Loading branch information
pvk-developer committed Sep 10, 2024
1 parent ecbc660 commit e270e16
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 115 deletions.
1 change: 1 addition & 0 deletions tests/_external/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""External utility functions."""
110 changes: 110 additions & 0 deletions tests/_external/gdrive_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
"""Google Drive utils."""

import io
import json
import os
import pathlib
import tempfile
from functools import lru_cache

import pandas as pd
import yaml
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

PYDRIVE_CREDENTIALS = 'PYDRIVE_CREDENTIALS'


def _get_drive_client():
    """Build an authenticated ``GoogleDrive`` client.

    When the ``PYDRIVE_CREDENTIALS`` environment variable is set, its JSON
    content is written to a temporary settings file and used for a
    settings-based OAuth flow; otherwise an interactive local-webserver
    authentication is started.
    """
    tmp_credentials = os.getenv(PYDRIVE_CREDENTIALS)
    if not tmp_credentials:
        # No stored credentials: fall back to interactive authentication.
        gauth = GoogleAuth()
        gauth.LocalWebserverAuth()
        return GoogleDrive(gauth)

    with tempfile.TemporaryDirectory() as tempdir:
        workdir = pathlib.Path(tempdir)
        credentials_file_path = workdir / 'credentials.json'
        credentials_file_path.write_text(tmp_credentials)

        credentials = json.loads(tmp_credentials)
        settings = {
            'client_config_backend': 'settings',
            'client_config': {
                'client_id': credentials['client_id'],
                'client_secret': credentials['client_secret'],
            },
            'save_credentials': True,
            'save_credentials_backend': 'file',
            'save_credentials_file': str(credentials_file_path),
            'get_refresh_token': True,
        }
        settings_file = workdir / 'settings.yaml'
        settings_file.write_text(yaml.safe_dump(settings))

        gauth = GoogleAuth(str(settings_file))
        gauth.LocalWebserverAuth()

    return GoogleDrive(gauth)


@lru_cache()
def read_excel(file_id):
    """Read a file as an XLSX from Google Drive.

    Args:
        file_id (str):
            The ID of the file to load.

    Returns:
        dict[str, pd.DataFrame]:
            One DataFrame per sheet, keyed by sheet name (``sheet_name=None``
            always produces a dict, even for single-sheet workbooks).
    """
    xlsx_mime = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    drive = _get_drive_client()
    spreadsheet = drive.CreateFile({'id': file_id})
    spreadsheet.FetchContent(mimetype=xlsx_mime)
    return pd.read_excel(spreadsheet.content, sheet_name=None)


def _set_column_width(writer, results, sheet_name):
for column in results:
column_width = max(results[column].astype(str).map(len).max(), len(column))
col_idx = results.columns.get_loc(column)
writer.sheets[sheet_name].set_column(col_idx, col_idx, column_width + 2)


def save_to_gdrive(output_folder, results, output_filename=None):
    """Save results to a google drive folder as ``xlsx`` (spreadsheet).

    Given the output folder id (google drive folder id), store the given
    ``results`` as a ``spreadsheet``.

    Args:
        output_folder (str):
            String representing a google drive folder id.
        results (dict[str, pd.DataFrame]):
            Dictionary mapping sheet names to dataframes, stored in a single
            ``xlsx`` file. NOTE(review): a bare DataFrame would iterate
            per-column through ``.items()`` and write one sheet per column —
            pass a dict.
        output_filename (str, optional):
            Filename to be used for the results spreadsheet. If None, the
            file is uploaded without a title. Defaults to None.

    Returns:
        str:
            Google drive file id of the uploaded file.
    """
    # Build the workbook entirely in memory; nothing touches the local disk.
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:  # pylint: disable=E0110
        for sheet_name, data in results.items():
            data.to_excel(writer, sheet_name=sheet_name, index=False)
            _set_column_width(writer, data, sheet_name)

    file_config = {'title': output_filename, 'parents': [{'id': output_folder}]}
    drive = _get_drive_client()
    drive_file = drive.CreateFile(file_config)
    drive_file.content = output
    # convert=True asks Drive to convert the upload into a Google Sheet.
    drive_file.Upload({'convert': True})
    return drive_file['id']
20 changes: 16 additions & 4 deletions tests/benchmark/supported_dtypes_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,10 @@ def test_metadata_detection(dtype, data):
assert result == previous_result, assertion_message


@pytest.mark.parametrize('dtype, data', {**PANDAS_DTYPES, **NUMPY_DTYPES}.items())
@pytest.mark.parametrize(
'dtype, data',
{**PANDAS_DTYPES, **NUMPY_DTYPES, **PYARROW_DTYPES}.items()
)
def test_metadata_validate_data(dtype, data):
"""Test the validation of data using `SingleTableMetadata`.
Expand Down Expand Up @@ -406,7 +409,10 @@ def test_metadata_validate_data(dtype, data):
assert result == previous_result, assertion_message


@pytest.mark.parametrize('dtype, data', {**PANDAS_DTYPES, **NUMPY_DTYPES}.items())
@pytest.mark.parametrize(
'dtype, data',
{**PANDAS_DTYPES, **NUMPY_DTYPES, **PYARROW_DTYPES}.items()
)
def test_fit_and_sample_synthesizer(dtype, data):
"""Test fitting and sampling a synthesizer for different data types.
Expand Down Expand Up @@ -567,7 +573,10 @@ def _create_multi_column_constraint_data_and_metadata(constraint, data, dtype, s
@pytest.mark.parametrize(
'constraint_name, constraint', SINGLE_COLUMN_PREDEFINED_CONSTRAINTS.items()
)
@pytest.mark.parametrize('dtype, data', {**PANDAS_DTYPES, **NUMPY_DTYPES}.items())
@pytest.mark.parametrize(
'dtype, data',
{**PANDAS_DTYPES, **NUMPY_DTYPES, **PYARROW_DTYPES}.items()
)
def test_fit_and_sample_single_column_constraints(constraint_name, constraint, dtype, data):
"""Test fitting and sampling with single-column constraints for various data types.
Expand Down Expand Up @@ -638,7 +647,10 @@ def test_fit_and_sample_single_column_constraints(constraint_name, constraint, d


@pytest.mark.parametrize('constraint_name, constraint', MULTI_COLUMN_PREDEFINED_CONSTRAINTS.items())
@pytest.mark.parametrize('dtype, data', {**PANDAS_DTYPES, **NUMPY_DTYPES}.items())
@pytest.mark.parametrize(
'dtype, data',
{**PANDAS_DTYPES, **NUMPY_DTYPES, **PYARROW_DTYPES}.items()
)
def test_fit_and_sample_multi_column_constraints(constraint_name, constraint, dtype, data):
"""Test fitting and sampling with multi-column constraints for various data types.
Expand Down
116 changes: 5 additions & 111 deletions tests/benchmark/utils.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,13 @@
"""Google Drive utils."""

import io
import json
import os
import pathlib
"""Utility functions for the benchmarking."""
import sys
import tempfile
from functools import lru_cache

import pandas as pd
import yaml
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from tests._external.gdrive_utils import read_excel

PYDRIVE_CREDENTIALS = 'PYDRIVE_CREDENTIALS'
BENCHMARK_FILE_ID = '1mrvIok6G5P0x88m2_TjOtqcQ-p4iEAk8PTuG6Hpp5Uk'


def get_python_version():
    """Get the current python version as a ``'major.minor'`` string."""
    info = sys.version_info
    return f'{info.major}.{info.minor}'
Expand All @@ -27,103 +16,8 @@ def detect_python_version():
def get_previous_result(dtype, method):
    """Return the previous benchmark result for a given ``dtype`` and method.

    Args:
        dtype (str):
            Name of the data type row to look up.
        method (str):
            Name of the column holding the method's result.

    Returns:
        The stored value for ``method`` in the row matching ``dtype``.
    """
    data = read_excel(BENCHMARK_FILE_ID)
    # Sheets in the benchmark workbook are keyed by python version.
    python_version = get_python_version()
    df = data[python_version]
    filtered_row = df[df['dtype'] == dtype]
    # NOTE(review): raises IndexError if ``dtype`` has no row — confirm
    # callers always pass a known dtype.
    value = filtered_row[method].to_numpy()[0]
    return value


def _get_drive_client():
    """Create an authenticated ``GoogleDrive`` client.

    Credentials are taken from the ``PYDRIVE_CREDENTIALS`` environment
    variable when available; otherwise an interactive local-webserver
    OAuth flow is started.
    """
    stored = os.getenv(PYDRIVE_CREDENTIALS)
    if stored:
        with tempfile.TemporaryDirectory() as workdir:
            workpath = pathlib.Path(workdir)
            credentials_path = workpath / 'credentials.json'
            credentials_path.write_text(stored)

            parsed = json.loads(stored)
            config = {
                'client_config_backend': 'settings',
                'client_config': {
                    'client_id': parsed['client_id'],
                    'client_secret': parsed['client_secret'],
                },
                'save_credentials': True,
                'save_credentials_backend': 'file',
                'save_credentials_file': str(credentials_path),
                'get_refresh_token': True,
            }
            settings_path = workpath / 'settings.yaml'
            settings_path.write_text(yaml.safe_dump(config))

            gauth = GoogleAuth(str(settings_path))
            gauth.LocalWebserverAuth()
    else:
        gauth = GoogleAuth()
        gauth.LocalWebserverAuth()

    return GoogleDrive(gauth)


@lru_cache()
def read_excel(file_id):
    """Download a Google Drive file and parse it as an XLSX workbook.

    Args:
        file_id (str):
            The ID of the file to load.

    Returns:
        dict[str, pd.DataFrame]:
            One DataFrame per sheet, keyed by sheet name.
    """
    mimetype = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    gdrive = _get_drive_client()
    remote_file = gdrive.CreateFile({'id': file_id})
    remote_file.FetchContent(mimetype=mimetype)
    return pd.read_excel(remote_file.content, sheet_name=None)


def _set_column_width(writer, results, sheet_name):
for column in results:
column_width = max(results[column].astype(str).map(len).max(), len(column))
col_idx = results.columns.get_loc(column)
writer.sheets[sheet_name].set_column(col_idx, col_idx, column_width + 2)


def save_to_gdrive(output_folder, results, output_filename=None):
    """Upload ``results`` to a Google Drive folder as an ``xlsx`` spreadsheet.

    Args:
        output_folder (str):
            Google drive folder id to upload into.
        results (dict[str, pd.DataFrame]):
            Mapping of sheet names to dataframes written into one ``xlsx``
            file.
        output_filename (str, optional):
            Name for the uploaded spreadsheet. Defaults to None.

    Returns:
        str:
            Google drive file id of the uploaded file.
    """
    buffer = io.BytesIO()
    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:  # pylint: disable=E0110
        for name, frame in results.items():
            frame.to_excel(writer, sheet_name=name, index=False)
            _set_column_width(writer, frame, name)

    client = _get_drive_client()
    upload = client.CreateFile({'title': output_filename, 'parents': [{'id': output_folder}]})
    upload.content = buffer
    upload.Upload({'convert': True})
    return upload['id']

0 comments on commit e270e16

Please sign in to comment.