Skip to content

Commit

Permalink
Split data, reorganize code base and add slack messaging
Browse files Browse the repository at this point in the history
  • Loading branch information
pvk-developer committed Sep 16, 2024
1 parent e3f337a commit 3e218b6
Show file tree
Hide file tree
Showing 11 changed files with 617 additions and 255 deletions.
66 changes: 66 additions & 0 deletions .github/workflows/dtypes_benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Workflow: run the data-types benchmark on every push to main, then
# aggregate the per-version results and publish the report
# (Google Drive upload + Slack notification via tests.benchmark.utils).
name: Data Types Benchmark

on:
  push:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        # Quoted so YAML does not read 3.10 as the float 3.1.
        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install invoke .[test]

      - name: Create folder and JSON file
        run: |
          mkdir -p results
          touch results/${{ matrix.python-version }}.json

      # Run the benchmarking
      - name: Benchmark Data Types
        env:
          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
        run: |
          invoke benchmark-dtypes

      # Upload the CSV files as artifacts
      - name: Upload artifacts
        uses: actions/upload-artifact@v3
        with:
          name: results-${{ matrix.python-version }}
          path: results/*.csv

  upload:
    runs-on: ubuntu-latest
    needs: build

    steps:
      # Download the artifacts produced by every matrix build
      - name: Download artifacts
        uses: actions/download-artifact@v3
        with:
          path: results/

      # Generate the report
      - name: Generate the report
        env:
          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
        run: python -m tests.benchmark.utils
90 changes: 54 additions & 36 deletions .github/workflows/release_notes.yml
Original file line number Diff line number Diff line change
@@ -1,52 +1,70 @@
name: Release Notes Generator
name: Data Types Benchmark

on:
workflow_dispatch:
inputs:
branch:
description: 'Branch to merge release notes into.'
run_tests:
description: 'Run integration and unit tests'
required: true
default: 'main'
version:
description:
'Version to use for the release. Must be in format: X.Y.Z.'
date:
description:
'Date of the release. Must be in format YYYY-MM-DD.'
type: boolean
default: true

jobs:
releasenotesgeneration:
build:
runs-on: ubuntu-latest

strategy:
matrix:
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']

steps:
- uses: actions/checkout@v4
- name: Set up Python 3.10
uses: actions/setup-python@v5
- name: Checkout code
uses: actions/checkout@v3

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: ${{ matrix.python-version }}

- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install requests==2.31.0
python -m pip install --upgrade pip
python -m pip install invoke .[test]
- name: Create folder and JSON file
run: |
mkdir -p results
touch results/${{ matrix.python-version }}.json
- name: Generate release notes
# Run the benchmarking
- name: Benchmark Data Types
env:
GH_ACCESS_TOKEN: ${{ secrets.GH_ACCESS_TOKEN }}
run: >
python scripts/release_notes_generator.py
-v ${{ inputs.version }}
-d ${{ inputs.date }}
- name: Create pull request
id: cpr
uses: peter-evans/create-pull-request@v4
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
run: |
invoke benchmark-dtypes
# Upload the CSV files as artifacts
- name: Upload artifacts
uses: actions/upload-artifact@v3
with:
token: ${{ secrets.GH_ACCESS_TOKEN }}
commit-message: Release notes for v${{ inputs.version }}
author: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>"
committer: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>"
title: v${{ inputs.version }} Release Notes
body: "This is an auto-generated PR to update the release notes."
branch: release-notes
branch-suffix: short-commit-hash
base: ${{ inputs.branch }}
name: results-${{ matrix.python-version }}
path: results/*.csv

upload:
runs-on: ubuntu-latest
needs: build

steps:
# Download the artifacts
- name: Download artifacts
uses: actions/download-artifact@v3
with:
path: results/

# Generate the report
- name: Generate the report
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}

run: python -m tests.benchmark.utils
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ test = [
'tomli>=2.0.0,<3',
'pydrive',
'pyarrow',
'gitpython',
'slack-sdk>=3.23,<4.0',
]
pomegranate = ['pomegranate>=0.14.3,<0.15']
dev = [
Expand Down
34 changes: 32 additions & 2 deletions tests/_external/gdrive_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
import os
import pathlib
import tempfile
from functools import lru_cache
from datetime import date

import git
import pandas as pd
import yaml
from pydrive.auth import GoogleAuth
Expand All @@ -15,6 +16,14 @@
PYDRIVE_CREDENTIALS = 'PYDRIVE_CREDENTIALS'


def _generate_filename():
    """Build an output filename from today's date and the current commit SHA.

    Returns:
        str:
            Filename of the form ``YYYY-MM-DD-<hexsha>.xlsx``.
    """
    head_sha = git.Repo(search_parent_directories=True).head.object.hexsha
    return '{}-{}.xlsx'.format(date.today(), head_sha)


def _get_drive_client():
tmp_credentials = os.getenv(PYDRIVE_CREDENTIALS)
if not tmp_credentials:
Expand Down Expand Up @@ -47,7 +56,24 @@ def _get_drive_client():
return GoogleDrive(gauth)


@lru_cache()
def get_latest_file(folder_id):
    """Get the latest file from the given Google Drive folder.

    Args:
        folder_id (str):
            The string Google Drive folder ID.

    Returns:
        pydrive.files.GoogleDriveFile or None:
            The most recently modified, non-trashed file in the folder,
            or ``None`` when the folder is empty.
    """
    drive = _get_drive_client()
    # Drive query-language booleans are lowercase: 'trashed=False' is an
    # invalid query and makes the API call fail.
    drive_query = drive.ListFile({
        'q': f"'{folder_id}' in parents and trashed=false",
        'orderBy': 'modifiedDate desc',
        'maxResults': 1
    })
    file_list = drive_query.GetList()
    if len(file_list) > 0:
        return file_list[0]

    return None


def read_excel(file_id):
"""Read a file as an XLSX from Google Drive.
Expand Down Expand Up @@ -96,7 +122,11 @@ def save_to_gdrive(output_folder, results, output_filename=None):
str:
Google drive file id of uploaded file.
"""
if not output_filename:
output_filename = _generate_filename()

output = io.BytesIO()

with pd.ExcelWriter(output, engine='xlsxwriter') as writer: # pylint: disable=E0110
for sheet_name, data in results.items():
data.to_excel(writer, sheet_name=sheet_name, index=False)
Expand Down
42 changes: 42 additions & 0 deletions tests/_external/slack_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""Utility functions for Slack integration."""

import os

from slack_sdk import WebClient


def _get_slack_client():
    """Create an authenticated Slack client.

    Reads the token from the ``SLACK_TOKEN`` environment variable.

    Returns:
        WebClient:
            An authenticated Slack WebClient instance.
    """
    return WebClient(token=os.getenv('SLACK_TOKEN'))


def post_slack_message(channel, text):
    """Post a message to a Slack channel.

    Args:
        channel (str):
            The name of the channel to post to.
        text (str):
            The message to send to the channel.

    Returns:
        SlackResponse:
            Response from the Slack API call.

    Raises:
        RuntimeError:
            If the Slack API response reports the message was not posted.
    """
    client = _get_slack_client()
    response = client.chat_postMessage(channel=channel, text=text)
    if not response['ok']:
        error = response.get('error', 'unknown_error')
        # Fixed typo in the error message ('occured' -> 'occurred').
        msg = f'{error} occurred trying to post message to {channel}'
        raise RuntimeError(msg)

    return response
87 changes: 87 additions & 0 deletions tests/benchmark/numpy_dtypes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""Representative single-column DataFrames for every benchmarked NumPy dtype."""

import numpy as np
import pandas as pd

# Maps a dtype label to a one-column DataFrame holding three representative
# values of that dtype (typical, negative/secondary and near-boundary).
# NOTE: ``np.string_`` and ``np.unicode_`` were removed in NumPy 2.0; the
# canonical aliases ``np.bytes_`` and ``np.str_`` are the same objects on
# NumPy < 2, so this stays backward compatible.
NUMPY_DTYPES = {
    'np.int8': pd.DataFrame({
        'np.int8': pd.Series([np.int8(1), np.int8(-1), np.int8(127)], dtype='int8')
    }),
    'np.int16': pd.DataFrame({
        'np.int16': pd.Series([np.int16(2), np.int16(-2), np.int16(32767)], dtype='int16')
    }),
    'np.int32': pd.DataFrame({
        'np.int32': pd.Series([np.int32(3), np.int32(-3), np.int32(2147483647)], dtype='int32')
    }),
    'np.int64': pd.DataFrame({
        'np.int64': pd.Series([np.int64(4), np.int64(-4), np.int64(922)], dtype='int64')
    }),
    'np.uint8': pd.DataFrame({
        'np.uint8': pd.Series([np.uint8(5), np.uint8(10), np.uint8(255)], dtype='uint8')
    }),
    'np.uint16': pd.DataFrame({
        'np.uint16': pd.Series([np.uint16(6), np.uint16(20), np.uint16(65535)], dtype='uint16')
    }),
    'np.uint32': pd.DataFrame({
        'np.uint32': pd.Series([np.uint32(7), np.uint32(30), np.uint32(42)], dtype='uint32')
    }),
    'np.uint64': pd.DataFrame({
        'np.uint64': pd.Series([np.uint64(8), np.uint64(40), np.uint64(184467)], dtype='uint64')
    }),
    'np.float16': pd.DataFrame({
        'np.float16': pd.Series(
            [np.float16(9.1), np.float16(-9.1), np.float16(65.0)], dtype='float16'
        )
    }),
    'np.float32': pd.DataFrame({
        'np.float32': pd.Series(
            [np.float32(1.2), np.float32(-1.2), np.float32(3.40)], dtype='float32'
        )
    }),
    'np.float64': pd.DataFrame({
        'np.float64': pd.Series(
            [np.float64(1.3), np.float64(-11.3), np.float64(1.7)], dtype='float64'
        )
    }),
    'np.complex64': pd.DataFrame({
        'np.complex64': pd.Series(
            [np.complex64(12 + 1j), np.complex64(-12 - 1j), np.complex64(3.4e38 + 1j)],
            dtype='complex64',
        )
    }),
    'np.complex128': pd.DataFrame({
        'np.complex128': pd.Series(
            [np.complex128(13 + 2j), np.complex128(-13 - 2j), np.complex128(1.7e308 + 2j)],
            dtype='complex128',
        )
    }),
    'np.bool': pd.DataFrame({
        'np.bool': pd.Series([np.bool_(True), np.bool_(False), np.bool_(True)], dtype='bool')
    }),
    'np.object': pd.DataFrame({
        'np.object': pd.Series(['object1', 'object2', 'object3'], dtype='object')
    }),
    'np.string': pd.DataFrame({
        'np.string': pd.Series([
            np.bytes_('string1'),
            np.bytes_('string2'),
            np.bytes_('string3'),
        ])
    }),
    'np.unicode': pd.DataFrame({
        'np.unicode': pd.Series(
            [np.str_('unicode1'), np.str_('unicode2'), np.str_('unicode3')],
            dtype='string',
        )
    }),
    'np.datetime64': pd.DataFrame({
        'np.datetime64': pd.Series([
            np.datetime64('2023-01-01T00:00:00'),
            np.datetime64('2024-01-01T00:00:00'),
            np.datetime64('2025-01-01T00:00:00'),
        ])
    }),
    'np.timedelta64': pd.DataFrame({
        'np.timedelta64': pd.Series(
            [np.timedelta64(1, 'D'), np.timedelta64(2, 'h'), np.timedelta64(3, 'm')],
        )
    }),
}
Loading

0 comments on commit 3e218b6

Please sign in to comment.