Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Benchmark Supported Data Types #2206

Open
wants to merge 35 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
6ae829d
Add benchmarking for data types
pvk-developer Sep 6, 2024
c7f08b0
Add dependency
pvk-developer Sep 6, 2024
014a68f
Fix lint
pvk-developer Sep 6, 2024
1934059
Add to integration workflow
pvk-developer Sep 6, 2024
633447e
Fix constraints evaluating int or float instances
pvk-developer Sep 6, 2024
ba10af8
Update benchmark workflow to ignore errors
pvk-developer Sep 9, 2024
1aff4d9
Fix lint
pvk-developer Sep 9, 2024
ecbc660
Fix lint from ruff
pvk-developer Sep 9, 2024
e270e16
Split gdrive utils
pvk-developer Sep 10, 2024
e3f337a
Fix lint
pvk-developer Sep 10, 2024
3e218b6
Split data, reorganize code base and add slack messaging
pvk-developer Sep 16, 2024
1cdf943
Fix: install the dependencies when running the report
pvk-developer Sep 16, 2024
e94a4bf
Use args for utils
pvk-developer Sep 17, 2024
dae1d47
Fix lint
pvk-developer Sep 17, 2024
c484891
Fix parseargs
pvk-developer Sep 17, 2024
76dc7ef
Add credentials
pvk-developer Sep 17, 2024
23cfbb6
Improve slack message
pvk-developer Sep 17, 2024
0db3da5
Fix workflow uploading spreadsheet
pvk-developer Sep 17, 2024
0e6ab21
Fix message formatting
pvk-developer Sep 17, 2024
5c6e272
Fix: Typo code
pvk-developer Sep 17, 2024
c8fdecc
Add summary and improve slack messaging
pvk-developer Sep 18, 2024
b25a9ce
Rename method
pvk-developer Sep 18, 2024
b53cb5a
Rename method in utility.py
pvk-developer Sep 18, 2024
6f2ff18
Fix report generation
pvk-developer Sep 18, 2024
4bafbda
Change artifact download order
pvk-developer Sep 18, 2024
f99e13e
Fix saving to csv / json. Improve messaging
pvk-developer Sep 20, 2024
a6ae49e
Improve summary report
pvk-developer Sep 20, 2024
7b3bbfa
Add excluded tests
pvk-developer Sep 20, 2024
5059b7b
Reduce workflows while testing
pvk-developer Sep 20, 2024
e4cd822
Add py38 failing combinations
pvk-developer Sep 20, 2024
eb24dc3
Remove args
pvk-developer Sep 20, 2024
0b637e4
Remove summary for now
pvk-developer Sep 20, 2024
3269b8e
Try channel with #
pvk-developer Sep 20, 2024
de9b4ee
Do one slack message instead
pvk-developer Sep 20, 2024
bd52d6e
Remove response from comparing dfs
pvk-developer Sep 20, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,9 @@ jobs:
python -m pip install --upgrade pip
python -m pip install invoke .[test]
- name: Run integration tests
run: invoke integration
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}

run: |
invoke integration
invoke benchmark-dtypes
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ test = [
'rundoc>=0.4.3,<0.5',
'pytest-runner >= 2.11.1',
'tomli>=2.0.0,<3',
'pydrive',
]
pomegranate = ['pomegranate>=0.14.3,<0.15']
dev = [
Expand Down
9 changes: 9 additions & 0 deletions sdv/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pathlib import Path

import pandas as pd
from pandas.api.types import is_float, is_integer
from pandas.core.tools.datetimes import _guess_datetime_format_for_array
from rdt.transformers.utils import _GENERATORS

Expand Down Expand Up @@ -439,3 +440,11 @@ def get_possible_chars(regex, num_subpatterns=None):
possible_chars += _get_chars_for_option(option, params)

return possible_chars


def _is_numerical(value):
"""Determine if the input is a numerical type or not."""
try:
return is_integer(value) or is_float(value)
except Exception:
return False
12 changes: 5 additions & 7 deletions sdv/constraints/tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
import numpy as np
import pandas as pd

from sdv._utils import _convert_to_timedelta, _create_unique_name, _is_datetime_type
from sdv._utils import _convert_to_timedelta, _create_unique_name, _is_datetime_type, _is_numerical
from sdv.constraints.base import Constraint
from sdv.constraints.errors import (
AggregateConstraintsError,
Expand Down Expand Up @@ -604,7 +604,7 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs):
sdtype = metadata.columns.get(column_name, {}).get('sdtype')
value = kwargs.get('value')
if sdtype == 'numerical':
if not isinstance(value, (int, float)):
if not _is_numerical(value):
raise ConstraintMetadataError("'value' must be an int or float.")

elif sdtype == 'datetime':
Expand Down Expand Up @@ -632,7 +632,7 @@ def _validate_init_inputs(column_name, value, relation):
if relation not in ['>', '>=', '<', '<=']:
raise ValueError('`relation` must be one of the following: `>`, `>=`, `<`, `<=`')

if not (isinstance(value, (int, float)) or value_is_datetime):
if not (_is_numerical(value) or value_is_datetime):
raise ValueError('`value` must be a number or a string that represents a datetime.')

if value_is_datetime and not isinstance(value, str):
Expand Down Expand Up @@ -1071,9 +1071,7 @@ def _validate_init_inputs(low_value, high_value):
if values_are_datetimes and not values_are_strings:
raise ValueError('Datetime must be represented as a string.')

values_are_numerical = bool(
isinstance(low_value, (int, float)) and isinstance(high_value, (int, float))
)
values_are_numerical = bool(_is_numerical(low_value) and _is_numerical(high_value))
if not (values_are_numerical or values_are_datetimes):
raise ValueError(
'``low_value`` and ``high_value`` must be a number or a string that '
Expand All @@ -1092,7 +1090,7 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs):
high_value = kwargs.get('high_value')
low_value = kwargs.get('low_value')
if sdtype == 'numerical':
if not isinstance(high_value, (int, float)) or not isinstance(low_value, (int, float)):
if not _is_numerical(high_value) or not _is_numerical(low_value):
raise ConstraintMetadataError(
"Both 'high_value' and 'low_value' must be ints or floats"
)
Expand Down
5 changes: 5 additions & 0 deletions tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ def integration(c):
c.run('python -m pytest ./tests/integration --reruns 3')


@task
def benchmark_dtypes(c):
c.run('python -m pytest ./tests/benchmark/supported_dtypes_benchmark.py')


def _get_minimum_versions(dependencies, python_version):
min_versions = {}
for dependency in dependencies:
Expand Down
1 change: 1 addition & 0 deletions tests/benchmark/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""SDV benchmarking module."""
Loading
Loading