Skip to content

Commit

Permalink
Split gdrive utils
Browse files Browse the repository at this point in the history
  • Loading branch information
pvk-developer committed Sep 10, 2024
1 parent ecbc660 commit e270e16
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 115 deletions.
1 change: 1 addition & 0 deletions tests/_external/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""External utility functions."""
110 changes: 110 additions & 0 deletions tests/_external/gdrive_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
"""Google Drive utils."""

import io
import json
import os
import pathlib
import tempfile
from functools import lru_cache

import pandas as pd
import yaml
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

PYDRIVE_CREDENTIALS = 'PYDRIVE_CREDENTIALS'


def _get_drive_client():
    """Build an authenticated ``GoogleDrive`` client.

    When the ``PYDRIVE_CREDENTIALS`` environment variable is set, its JSON
    content is written to a temporary settings file and used for a
    settings-based OAuth flow; otherwise an interactive local-webserver
    authentication is started.
    """
    tmp_credentials = os.getenv(PYDRIVE_CREDENTIALS)
    if not tmp_credentials:
        # No stored credentials: fall back to interactive authentication.
        gauth = GoogleAuth()
        gauth.LocalWebserverAuth()
        return GoogleDrive(gauth)

    with tempfile.TemporaryDirectory() as tempdir:
        workdir = pathlib.Path(tempdir)
        credentials_file_path = workdir / 'credentials.json'
        credentials_file_path.write_text(tmp_credentials)

        credentials = json.loads(tmp_credentials)
        settings = {
            'client_config_backend': 'settings',
            'client_config': {
                'client_id': credentials['client_id'],
                'client_secret': credentials['client_secret'],
            },
            'save_credentials': True,
            'save_credentials_backend': 'file',
            'save_credentials_file': str(credentials_file_path),
            'get_refresh_token': True,
        }
        settings_file = workdir / 'settings.yaml'
        settings_file.write_text(yaml.safe_dump(settings))

        gauth = GoogleAuth(str(settings_file))
        gauth.LocalWebserverAuth()

    return GoogleDrive(gauth)


@lru_cache()
def read_excel(file_id):
    """Read a file as an XLSX from Google Drive.

    Args:
        file_id (str):
            The ID of the file to load.

    Returns:
        dict[str, pd.DataFrame]:
            One DataFrame per sheet, keyed by sheet name (``sheet_name=None``
            always produces a dict, even for single-sheet workbooks).
    """
    xlsx_mime = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    drive = _get_drive_client()
    spreadsheet = drive.CreateFile({'id': file_id})
    spreadsheet.FetchContent(mimetype=xlsx_mime)
    return pd.read_excel(spreadsheet.content, sheet_name=None)


def _set_column_width(writer, results, sheet_name):
for column in results:
column_width = max(results[column].astype(str).map(len).max(), len(column))
col_idx = results.columns.get_loc(column)
writer.sheets[sheet_name].set_column(col_idx, col_idx, column_width + 2)


def save_to_gdrive(output_folder, results, output_filename=None):
    """Save results to a google drive folder as ``xlsx`` (spreadsheet).

    Given the output folder id (google drive folder id), store the given
    ``results`` as a ``spreadsheet``.

    Args:
        output_folder (str):
            String representing a google drive folder id.
        results (dict[str, pd.DataFrame]):
            Dictionary mapping sheet names to dataframes, stored in a single
            ``xlsx`` file. NOTE(review): a bare DataFrame would iterate
            per-column through ``.items()`` and write one sheet per column —
            pass a dict.
        output_filename (str, optional):
            Filename to be used for the results spreadsheet. If None, the
            file is uploaded without a title. Defaults to None.

    Returns:
        str:
            Google drive file id of the uploaded file.
    """
    # Build the workbook entirely in memory; nothing touches the local disk.
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:  # pylint: disable=E0110
        for sheet_name, data in results.items():
            data.to_excel(writer, sheet_name=sheet_name, index=False)
            _set_column_width(writer, data, sheet_name)

    file_config = {'title': output_filename, 'parents': [{'id': output_folder}]}
    drive = _get_drive_client()
    drive_file = drive.CreateFile(file_config)
    drive_file.content = output
    # convert=True asks Drive to convert the upload into a Google Sheet.
    drive_file.Upload({'convert': True})
    return drive_file['id']
20 changes: 16 additions & 4 deletions tests/benchmark/supported_dtypes_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,10 @@ def test_metadata_detection(dtype, data):
assert result == previous_result, assertion_message


@pytest.mark.parametrize('dtype, data', {**PANDAS_DTYPES, **NUMPY_DTYPES}.items())
@pytest.mark.parametrize(
'dtype, data',
{**PANDAS_DTYPES, **NUMPY_DTYPES, **PYARROW_DTYPES}.items()
)
def test_metadata_validate_data(dtype, data):
"""Test the validation of data using `SingleTableMetadata`.
Expand Down Expand Up @@ -406,7 +409,10 @@ def test_metadata_validate_data(dtype, data):
assert result == previous_result, assertion_message


@pytest.mark.parametrize('dtype, data', {**PANDAS_DTYPES, **NUMPY_DTYPES}.items())
@pytest.mark.parametrize(
'dtype, data',
{**PANDAS_DTYPES, **NUMPY_DTYPES, **PYARROW_DTYPES}.items()
)
def test_fit_and_sample_synthesizer(dtype, data):
"""Test fitting and sampling a synthesizer for different data types.
Expand Down Expand Up @@ -567,7 +573,10 @@ def _create_multi_column_constraint_data_and_metadata(constraint, data, dtype, s
@pytest.mark.parametrize(
'constraint_name, constraint', SINGLE_COLUMN_PREDEFINED_CONSTRAINTS.items()
)
@pytest.mark.parametrize('dtype, data', {**PANDAS_DTYPES, **NUMPY_DTYPES}.items())
@pytest.mark.parametrize(
'dtype, data',
{**PANDAS_DTYPES, **NUMPY_DTYPES, **PYARROW_DTYPES}.items()
)
def test_fit_and_sample_single_column_constraints(constraint_name, constraint, dtype, data):
"""Test fitting and sampling with single-column constraints for various data types.
Expand Down Expand Up @@ -638,7 +647,10 @@ def test_fit_and_sample_single_column_constraints(constraint_name, constraint, d


@pytest.mark.parametrize('constraint_name, constraint', MULTI_COLUMN_PREDEFINED_CONSTRAINTS.items())
@pytest.mark.parametrize('dtype, data', {**PANDAS_DTYPES, **NUMPY_DTYPES}.items())
@pytest.mark.parametrize(
'dtype, data',
{**PANDAS_DTYPES, **NUMPY_DTYPES, **PYARROW_DTYPES}.items()
)
def test_fit_and_sample_multi_column_constraints(constraint_name, constraint, dtype, data):
"""Test fitting and sampling with multi-column constraints for various data types.
Expand Down
116 changes: 5 additions & 111 deletions tests/benchmark/utils.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,13 @@
"""Google Drive utils."""

import io
import json
import os
import pathlib
"""Utility functions for the benchmarking."""
import sys
import tempfile
from functools import lru_cache

import pandas as pd
import yaml
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from tests._external.gdrive_utils import read_excel

PYDRIVE_CREDENTIALS = 'PYDRIVE_CREDENTIALS'
BENCHMARK_FILE_ID = '1mrvIok6G5P0x88m2_TjOtqcQ-p4iEAk8PTuG6Hpp5Uk'


def get_python_version():
    """Get the current python version as a ``'major.minor'`` string."""
    info = sys.version_info
    return f'{info.major}.{info.minor}'
Expand All @@ -27,103 +16,8 @@ def detect_python_version():
def get_previous_result(dtype, method):
    """Return the previous benchmark result for a given ``dtype`` and method.

    Args:
        dtype (str):
            Name of the data type row to look up.
        method (str):
            Name of the column holding the method's result.

    Returns:
        The stored value for ``method`` in the row matching ``dtype``.
    """
    data = read_excel(BENCHMARK_FILE_ID)
    # Sheets in the benchmark workbook are keyed by python version.
    python_version = get_python_version()
    df = data[python_version]
    filtered_row = df[df['dtype'] == dtype]
    # NOTE(review): raises IndexError if ``dtype`` has no row — confirm
    # callers always pass a known dtype.
    value = filtered_row[method].to_numpy()[0]
    return value


def _get_drive_client():
    """Create an authenticated ``GoogleDrive`` client.

    Credentials are taken from the ``PYDRIVE_CREDENTIALS`` environment
    variable when available; otherwise an interactive local-webserver
    OAuth flow is started.
    """
    stored = os.getenv(PYDRIVE_CREDENTIALS)
    if stored:
        with tempfile.TemporaryDirectory() as workdir:
            workpath = pathlib.Path(workdir)
            credentials_path = workpath / 'credentials.json'
            credentials_path.write_text(stored)

            parsed = json.loads(stored)
            config = {
                'client_config_backend': 'settings',
                'client_config': {
                    'client_id': parsed['client_id'],
                    'client_secret': parsed['client_secret'],
                },
                'save_credentials': True,
                'save_credentials_backend': 'file',
                'save_credentials_file': str(credentials_path),
                'get_refresh_token': True,
            }
            settings_path = workpath / 'settings.yaml'
            settings_path.write_text(yaml.safe_dump(config))

            gauth = GoogleAuth(str(settings_path))
            gauth.LocalWebserverAuth()
    else:
        gauth = GoogleAuth()
        gauth.LocalWebserverAuth()

    return GoogleDrive(gauth)


@lru_cache()
def read_excel(file_id):
    """Download a Google Drive file and parse it as an XLSX workbook.

    Args:
        file_id (str):
            The ID of the file to load.

    Returns:
        dict[str, pd.DataFrame]:
            One DataFrame per sheet, keyed by sheet name.
    """
    mimetype = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    gdrive = _get_drive_client()
    remote_file = gdrive.CreateFile({'id': file_id})
    remote_file.FetchContent(mimetype=mimetype)
    return pd.read_excel(remote_file.content, sheet_name=None)


def _set_column_width(writer, results, sheet_name):
for column in results:
column_width = max(results[column].astype(str).map(len).max(), len(column))
col_idx = results.columns.get_loc(column)
writer.sheets[sheet_name].set_column(col_idx, col_idx, column_width + 2)


def save_to_gdrive(output_folder, results, output_filename=None):
    """Upload ``results`` to a Google Drive folder as an ``xlsx`` spreadsheet.

    Args:
        output_folder (str):
            Google drive folder id to upload into.
        results (dict[str, pd.DataFrame]):
            Mapping of sheet names to dataframes written into one ``xlsx``
            file.
        output_filename (str, optional):
            Name for the uploaded spreadsheet. Defaults to None.

    Returns:
        str:
            Google drive file id of the uploaded file.
    """
    buffer = io.BytesIO()
    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:  # pylint: disable=E0110
        for name, frame in results.items():
            frame.to_excel(writer, sheet_name=name, index=False)
            _set_column_width(writer, frame, name)

    client = _get_drive_client()
    upload = client.CreateFile({'title': output_filename, 'parents': [{'id': output_folder}]})
    upload.content = buffer
    upload.Upload({'convert': True})
    return upload['id']

0 comments on commit e270e16

Please sign in to comment.