Benchmark Supported Data Types #2206

Open · wants to merge 35 commits into base: main

Changes from all commits (35 commits)
6ae829d  Add benchmarking for data types (pvk-developer, Sep 6, 2024)
c7f08b0  Add dependency (pvk-developer, Sep 6, 2024)
014a68f  Fix lint (pvk-developer, Sep 6, 2024)
1934059  Add to integration workflow (pvk-developer, Sep 6, 2024)
633447e  Fix constraints evaluating int or float instances (pvk-developer, Sep 6, 2024)
ba10af8  Update benchmark workflow to ignore errors (pvk-developer, Sep 9, 2024)
1aff4d9  Fix lint (pvk-developer, Sep 9, 2024)
ecbc660  Fix lint from ruff (pvk-developer, Sep 9, 2024)
e270e16  Split gdrive utils (pvk-developer, Sep 10, 2024)
e3f337a  Fix lint (pvk-developer, Sep 10, 2024)
3e218b6  Split data, reorganize code base and add slack messaging (pvk-developer, Sep 16, 2024)
1cdf943  Fix: install the dependencies when running the report (pvk-developer, Sep 16, 2024)
e94a4bf  Use args for utils (pvk-developer, Sep 17, 2024)
dae1d47  Fix lint (pvk-developer, Sep 17, 2024)
c484891  Fix parseargs (pvk-developer, Sep 17, 2024)
76dc7ef  Add credentials (pvk-developer, Sep 17, 2024)
23cfbb6  Improve slack message (pvk-developer, Sep 17, 2024)
0db3da5  Fix workflow uploading spreadsheet (pvk-developer, Sep 17, 2024)
0e6ab21  Fix message formatting (pvk-developer, Sep 17, 2024)
5c6e272  Fix: Typo code (pvk-developer, Sep 17, 2024)
c8fdecc  Add summary and improve slack messaging (pvk-developer, Sep 18, 2024)
b25a9ce  Rename method (pvk-developer, Sep 18, 2024)
b53cb5a  Rename method in utility.py (pvk-developer, Sep 18, 2024)
6f2ff18  Fix report generation (pvk-developer, Sep 18, 2024)
4bafbda  Change artifact download order (pvk-developer, Sep 18, 2024)
f99e13e  Fix saving to csv / json. Improve messaging (pvk-developer, Sep 20, 2024)
a6ae49e  Improve summary report (pvk-developer, Sep 20, 2024)
7b3bbfa  Add excluded tests (pvk-developer, Sep 20, 2024)
5059b7b  Reduce workflows while testing (pvk-developer, Sep 20, 2024)
e4cd822  Add py38 failing combinations (pvk-developer, Sep 20, 2024)
eb24dc3  Remove args (pvk-developer, Sep 20, 2024)
0b637e4  Remove summary for now (pvk-developer, Sep 20, 2024)
3269b8e  Try channel with # (pvk-developer, Sep 20, 2024)
de9b4ee  Do one slack message instead (pvk-developer, Sep 20, 2024)
bd52d6e  Remove response from comparing dfs (pvk-developer, Sep 20, 2024)
77 changes: 77 additions & 0 deletions .github/workflows/dtypes_benchmark.yml
@@ -0,0 +1,77 @@
name: Data Types Benchmark

on:
  push:
    branches:
      - main
Contributor commented:
Are we still running the tests every time without updating the sheet?

Member Author (pvk-developer) replied:
Yes, the benchmark only sends a Slack message and updates the Google Drive sheet.

jobs:
  run_dtypes_benchmark:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install invoke .[test]

      - name: Create folder and JSON file
        run: |
          mkdir -p results
          touch results/${{ matrix.python-version }}.json

      # Run the benchmarking
      - name: Benchmark Data Types
        env:
          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
        run: |
          invoke benchmark-dtypes

      # Upload the CSV files as artifacts
      - name: Upload artifacts
        uses: actions/upload-artifact@v3
        with:
          name: results-${{ matrix.python-version }}
          path: results/*.csv

  generate_dtypes_report:
    runs-on: ubuntu-latest
    needs: run_dtypes_benchmark

    steps:
      # Download the artifacts
      - name: Download artifacts
        uses: actions/download-artifact@v3
        with:
          path: results/

      # Set up Python 3.10
      - name: Set up Python 3.10
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install dependencies for report
        run: |
          python -m pip install --upgrade pip
          python -m pip install .[test]

      # Generate the report
      - name: Generate the report
        env:
          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
        run: python -m tests.benchmark.utils
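The report job delegates everything to `python -m tests.benchmark.utils`, which is not included in this diff. As a rough orientation only, here is a minimal sketch of what such an entry point could look like; the file layout, column names, and Slack channel are assumptions, and only the documented `slack_sdk.WebClient.chat_postMessage` call is used. The `PYDRIVE_CREDENTIALS` secret suggests the real module also uploads results to a Google Drive spreadsheet, which is omitted here.

```python
# Minimal sketch of a report entry point similar to `tests.benchmark.utils`.
# The module's real contents are not part of this diff; the file naming,
# DataFrame columns, and Slack channel below are illustrative assumptions.
import os
from pathlib import Path

import pandas as pd
from slack_sdk import WebClient


def build_summary(results_dir='results/'):
    """Concatenate the per-Python-version CSV artifacts into one DataFrame."""
    frames = []
    for csv_path in sorted(Path(results_dir).rglob('*.csv')):
        frame = pd.read_csv(csv_path)
        frame['python_version'] = csv_path.stem  # e.g. '3.10' (assumed naming)
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)


def post_to_slack(summary, channel='#sdv-benchmarks'):  # hypothetical channel
    """Post a short text summary of the benchmark run to Slack."""
    client = WebClient(token=os.environ['SLACK_TOKEN'])
    text = (
        f'Data types benchmark finished: {len(summary)} rows collected '
        f'across {summary["python_version"].nunique()} Python versions.'
    )
    client.chat_postMessage(channel=channel, text=text)


if __name__ == '__main__':
    post_to_slack(build_summary())
```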
9 changes: 7 additions & 2 deletions .github/workflows/integration.yml
@@ -11,7 +11,7 @@ jobs:
strategy:
fail-fast: true
matrix:
python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12']
python-version: ['3.8', '3.12']
os: [ubuntu-latest, windows-latest]
include:
- os: macos-latest
@@ -29,4 +29,9 @@
python -m pip install --upgrade pip
python -m pip install invoke .[test]
- name: Run integration tests
run: invoke integration
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}

run: |
invoke integration
invoke benchmark-dtypes
2 changes: 1 addition & 1 deletion .github/workflows/minimum.yml
@@ -11,7 +11,7 @@ jobs:
strategy:
fail-fast: true
matrix:
python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12']
python-version: ['3.8', '3.12']
os: [ubuntu-latest, windows-latest]
include:
- os: macos-latest
104 changes: 68 additions & 36 deletions .github/workflows/release_notes.yml
@@ -1,52 +1,84 @@
name: Release Notes Generator
name: Data Types Benchmark

on:
workflow_dispatch:
inputs:
branch:
description: 'Branch to merge release notes into.'
run_tests:
description: 'Run integration and unit tests'
required: true
default: 'main'
version:
description:
'Version to use for the release. Must be in format: X.Y.Z.'
date:
description:
'Date of the release. Must be in format YYYY-MM-DD.'
type: boolean
default: true

jobs:
releasenotesgeneration:
run_dtypes_benchmark:
runs-on: ubuntu-latest

strategy:
matrix:
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']

steps:
- uses: actions/checkout@v4
- name: Set up Python 3.10
uses: actions/setup-python@v5
- name: Checkout code
uses: actions/checkout@v3

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: ${{ matrix.python-version }}

- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install requests==2.31.0
python -m pip install --upgrade pip
python -m pip install invoke .[test]

- name: Create folder and JSON file
run: |
mkdir -p results
touch results/${{ matrix.python-version }}.json

- name: Generate release notes
# Run the benchmarking
- name: Benchmark Data Types
env:
GH_ACCESS_TOKEN: ${{ secrets.GH_ACCESS_TOKEN }}
run: >
python scripts/release_notes_generator.py
-v ${{ inputs.version }}
-d ${{ inputs.date }}

- name: Create pull request
id: cpr
uses: peter-evans/create-pull-request@v4
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
run: |
invoke benchmark-dtypes

# Upload the json files as artifacts
- name: Upload artifacts
uses: actions/upload-artifact@v3
with:
name: results-${{ matrix.python-version }}
path: results/*.json

generate_dtypes_report:
runs-on: ubuntu-latest
needs: run_dtypes_benchmark

steps:
- name: Checkout code
uses: actions/checkout@v3

# Set up Python 3.10
- name: Set up Python 3.10
uses: actions/setup-python@v4
with:
python-version: '3.10'

- name: Install dependencies for report
run: |
python -m pip install --upgrade pip
python -m pip install .[test]

# Download the artifacts
- name: Download artifacts
uses: actions/download-artifact@v3
with:
token: ${{ secrets.GH_ACCESS_TOKEN }}
commit-message: Release notes for v${{ inputs.version }}
author: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>"
committer: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>"
title: v${{ inputs.version }} Release Notes
body: "This is an auto-generated PR to update the release notes."
branch: release-notes
branch-suffix: short-commit-hash
base: ${{ inputs.branch }}
path: results/

# Generate the report
- name: Generate the report
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}

run: python -m tests.benchmark.utils
2 changes: 1 addition & 1 deletion .github/workflows/unit.yml
@@ -11,7 +11,7 @@ jobs:
strategy:
fail-fast: true
matrix:
python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12']
python-version: ['3.8', '3.12']
os: [ubuntu-latest, windows-latest]
include:
- os: macos-latest
5 changes: 5 additions & 0 deletions pyproject.toml
@@ -62,6 +62,10 @@ test = [
'rundoc>=0.4.3,<0.5',
'pytest-runner >= 2.11.1',
'tomli>=2.0.0,<3',
'pydrive',
'pyarrow',
'gitpython',
'slack-sdk>=3.23,<4.0',
]
pomegranate = ['pomegranate>=0.14.3,<0.15']
dev = [
@@ -181,6 +185,7 @@ exclude = [
".tox",
".git",
"__pycache__",
"*.ipynb",
".ipynb_checkpoints",
"tasks.py",
]
9 changes: 9 additions & 0 deletions sdv/_utils.py
@@ -9,6 +9,7 @@
from pathlib import Path

import pandas as pd
from pandas.api.types import is_float, is_integer
from pandas.core.tools.datetimes import _guess_datetime_format_for_array
from rdt.transformers.utils import _GENERATORS

@@ -439,3 +440,11 @@ def get_possible_chars(regex, num_subpatterns=None):
possible_chars += _get_chars_for_option(option, params)

return possible_chars


def _is_numerical(value):
    """Determine if the input is a numerical type or not."""
    try:
        return is_integer(value) or is_float(value)
    except Exception:
        return False
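For context on why `_is_numerical` relies on pandas' scalar checks rather than `isinstance`: NumPy scalar types such as `np.int64` and `np.float32` do not subclass Python's built-in `int`/`float`, so the previous `isinstance(value, (int, float))` checks in the constraints rejected them. A quick illustration (not part of the diff):

```python
# Why _is_numerical uses pandas' scalar checks instead of isinstance:
# NumPy scalars are not subclasses of Python's int/float.
import numpy as np
from pandas.api.types import is_float, is_integer

print(isinstance(np.int64(5), (int, float)))      # False: np.int64 is not a Python int
print(is_integer(np.int64(5)))                    # True

print(isinstance(np.float32(1.5), (int, float)))  # False: np.float32 is not a Python float
print(is_float(np.float32(1.5)))                  # True
```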
20 changes: 7 additions & 13 deletions sdv/constraints/tabular.py
@@ -36,7 +36,7 @@
import numpy as np
import pandas as pd

from sdv._utils import _convert_to_timedelta, _create_unique_name, _is_datetime_type
from sdv._utils import _convert_to_timedelta, _create_unique_name, _is_datetime_type, _is_numerical
from sdv.constraints.base import Constraint
from sdv.constraints.errors import (
AggregateConstraintsError,
@@ -604,7 +604,7 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs):
sdtype = metadata.columns.get(column_name, {}).get('sdtype')
value = kwargs.get('value')
if sdtype == 'numerical':
if not isinstance(value, (int, float)):
if not _is_numerical(value):
raise ConstraintMetadataError("'value' must be an int or float.")

elif sdtype == 'datetime':
@@ -632,7 +632,7 @@ def _validate_init_inputs(column_name, value, relation):
if relation not in ['>', '>=', '<', '<=']:
raise ValueError('`relation` must be one of the following: `>`, `>=`, `<`, `<=`')

if not (isinstance(value, (int, float)) or value_is_datetime):
if not (_is_numerical(value) or value_is_datetime):
raise ValueError('`value` must be a number or a string that represents a datetime.')

if value_is_datetime and not isinstance(value, str):
@@ -1071,9 +1071,7 @@ def _validate_init_inputs(low_value, high_value):
if values_are_datetimes and not values_are_strings:
raise ValueError('Datetime must be represented as a string.')

values_are_numerical = bool(
isinstance(low_value, (int, float)) and isinstance(high_value, (int, float))
)
values_are_numerical = bool(_is_numerical(low_value) and _is_numerical(high_value))
if not (values_are_numerical or values_are_datetimes):
raise ValueError(
'``low_value`` and ``high_value`` must be a number or a string that '
@@ -1092,7 +1090,7 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs):
high_value = kwargs.get('high_value')
low_value = kwargs.get('low_value')
if sdtype == 'numerical':
if not isinstance(high_value, (int, float)) or not isinstance(low_value, (int, float)):
if not _is_numerical(high_value) or not _is_numerical(low_value):
raise ConstraintMetadataError(
"Both 'high_value' and 'low_value' must be ints or floats"
)
@@ -1187,11 +1185,7 @@ def is_valid(self, table_data):
self._operator(data, self._high_value),
pd.isna(self._high_value),
)

return np.logical_or(
np.logical_and(satisfy_low_bound, satisfy_high_bound),
pd.isna(data),
)
return (satisfy_low_bound & satisfy_high_bound) | pd.isna(data)

def _transform(self, table_data):
"""Transform the table data.
@@ -1250,7 +1244,7 @@ def _reverse_transform(self, table_data):
table_data[self._column_name] = data.round().astype(self._dtype)

else:
table_data[self._column_name] = data.astype(self._dtype)
table_data[self._column_name] = data.astype(self._dtype, errors='ignore')

table_data = table_data.drop(self._transformed_column, axis=1)
return table_data
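Two notes on the `tabular.py` changes above: the `is_valid` rewrite replaces the nested `np.logical_or`/`np.logical_and` calls with equivalent boolean operators, and `_reverse_transform` now passes `errors='ignore'` to `astype` so a failed cast returns the data unchanged instead of raising. A small equivalence check for the boolean rewrite, using made-up data rather than anything from the PR:

```python
# The rewritten is_valid expression is equivalent to the previous nested
# NumPy calls for boolean inputs; illustrative check with made-up data.
import numpy as np
import pandas as pd

data = pd.Series([1.0, 5.0, np.nan, 10.0])
satisfy_low_bound = data >= 2.0
satisfy_high_bound = data <= 8.0

old = np.logical_or(np.logical_and(satisfy_low_bound, satisfy_high_bound), pd.isna(data))
new = (satisfy_low_bound & satisfy_high_bound) | pd.isna(data)

print((old == new).all())  # True
```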
5 changes: 5 additions & 0 deletions tasks.py
@@ -34,6 +34,11 @@ def integration(c):
c.run('python -m pytest ./tests/integration --reruns 3')


@task
def benchmark_dtypes(c):
    c.run('python -m pytest ./tests/benchmark/supported_dtypes_benchmark.py')


def _get_minimum_versions(dependencies, python_version):
min_versions = {}
for dependency in dependencies:
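The new `benchmark-dtypes` task simply points pytest at `tests/benchmark/supported_dtypes_benchmark.py`, a file that is not part of this excerpt, and the workflows run it with `invoke benchmark-dtypes`. Purely as a hypothetical sketch of the shape such a benchmark could take, with an assumed dtype list, synthesizer, and assertion rather than the PR's actual tests:

```python
# Hypothetical sketch of a supported-dtypes benchmark test; the real
# tests/benchmark/supported_dtypes_benchmark.py is not shown in this diff.
import pandas as pd
import pytest

from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

DTYPES = ['int64', 'Int64', 'float64', 'object', 'bool']  # assumed list


@pytest.mark.parametrize('dtype', DTYPES)
def test_fit_and_sample_dtype(dtype):
    """Fit a synthesizer on a single column of the given dtype and sample."""
    data = pd.DataFrame({'column': pd.Series([0, 1, 0, 1, 1]).astype(dtype)})

    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data)

    synthesizer = GaussianCopulaSynthesizer(metadata)
    synthesizer.fit(data)
    sampled = synthesizer.sample(num_rows=len(data))

    # The benchmark's core question: does the sampled column keep its dtype?
    assert sampled['column'].dtype == data['column'].dtype
```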
1 change: 1 addition & 0 deletions tests/_external/__init__.py
@@ -0,0 +1 @@
"""External utility functions."""