edgi-govdata-archiving · Mr0grog · Aug 29, 2024 · Aug 29, 2024 · Aug 29, 2024 · Aug 29, 2024
diff --git a/scripts/lib/constants.py b/scripts/lib/constants.py
@@ -4,6 +4,12 @@
     'corp': 3,
 }
 
+ZOOM_ROLES = {
+    'owner': 0,
+    'admin': 1,
+    'member': 2,
+}
+
 VIDEO_CATEGORY_IDS = {
     'Film & Animation': 1,
     'Autos & Vehicles': 2,
@@ -36,4 +42,4 @@
     'Shorts': 42,
     'Shows': 43,
     'Trailers': 44,
-}
+}
diff --git a/scripts/upload_zoom_recordings.py b/scripts/upload_zoom_recordings.py
@@ -29,60 +29,140 @@
 import os
 import re
 import requests
+import subprocess
 import sys
 import tempfile
+from typing import Dict
 from urllib.parse import urlparse
 from zoomus import ZoomClient
-from lib.constants import USER_TYPES, VIDEO_CATEGORY_IDS
+from lib.constants import VIDEO_CATEGORY_IDS, ZOOM_ROLES
 from lib.youtube import get_youtube_client, upload_video, add_video_to_playlist, validate_youtube_credentials
 
 YOUTUBE_CREDENTIALS_PATH = '.youtube-upload-credentials.json'
 ZOOM_CLIENT_ID = os.environ['EDGI_ZOOM_CLIENT_ID']
 ZOOM_CLIENT_SECRET = os.environ['EDGI_ZOOM_CLIENT_SECRET']
 ZOOM_ACCOUNT_ID = os.environ['EDGI_ZOOM_ACCOUNT_ID']
 
-def is_truthy(x): return x.lower() in ['true', '1', 'y', 'yes']
-ZOOM_DELETE_AFTER_UPLOAD = is_truthy(os.environ.get('EDGI_ZOOM_DELETE_AFTER_UPLOAD', ''))
-DRY_RUN = is_truthy(os.environ.get('EDGI_DRY_RUN', ''))
-
 MEETINGS_TO_RECORD = ['EDGI Community Standup']
 DEFAULT_YOUTUBE_PLAYLIST = 'Uploads from Zoom'
 DEFAULT_YOUTUBE_CATEGORY = 'Science & Technology'
 DEFAULT_VIDEO_LICENSE = 'creativeCommon'
 DO_FILTER = False
 
-client = ZoomClient(ZOOM_CLIENT_ID, ZOOM_CLIENT_SECRET, ZOOM_ACCOUNT_ID)
+# Ignore users with names that match these patterns when determining if a
+# meeting has any participants and its recordings should be preserved.
+ZOOM_IGNORE_USER_NAMES = (
+    # The Otter.ai notetaker bot is present in most meetings.
+    re.compile(r'Otter\.ai', re.I),
+)
+
+
+def is_truthy(x: str) -> bool:
+    return x.lower() in {'true', '1', 'y', 'yes'}
+
+
+ZOOM_DELETE_AFTER_UPLOAD = is_truthy(os.environ.get('EDGI_ZOOM_DELETE_AFTER_UPLOAD', ''))
+DRY_RUN = is_truthy(os.environ.get('EDGI_DRY_RUN', ''))
+
+
+class ZoomError(Exception):
+    def __init__(self, response, message=None):
+        try:
+            data = response.json()
+        except Exception:
+            data = {}
 
-# Get main account, which should be 'pro'
-pro_users = [user for user in client.user.list().json()['users'] if user['type'] >= USER_TYPES['pro'] ]
-user_id = pro_users[0]['id']
+        if not message:
+            message = data.pop('message', 'Zoom API error!')
 
-def fix_date(date_string):
+        data['http_status'] = response.status_code
+        full_message = f'{message} ({data!r}) Check the docs for details: https://developers.zoom.us/docs/api/.'
+        super().__init__(full_message)
+
+    @classmethod
+    def is_error(cls, response):
+        return response.status_code >= 400
+
+    @classmethod
+    def raise_if_error(cls, response, message=None):
+        if cls.is_error(response):
+            raise cls(response, message)
+
+    @classmethod
+    def parse_or_raise(cls, response, message=None) -> Dict:
+        cls.raise_if_error(response, message)
+        return response.json()
+
+
+def fix_date(date_string: str) -> str:
     date = date_string
     index = date.find('Z')
     date = date[:index] + '.0' + date[index:]
 
     return date
 
-def pretty_date(date_string):
+
+def pretty_date(date_string: str) -> str:
     return datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%SZ').strftime('%b %-d, %Y')
 
-def download_file(url, download_path, query=None):
-    r = requests.get(url, params=query, stream=True)
+
+def download_zoom_file(client: ZoomClient, url: str, download_directory: str) -> str:
+    # Note the token info in the client isn't really *public*, but it's
+    # not explicitly private, either. Use `config[]` syntax instead of
+    # `config.get()` so we get an exception if things have changed and
+    # this data is no longer available.
+    r = requests.get(url, stream=True, headers={
+        'Authorization': f'Bearer {client.config["token"]}'
+    })
     r.raise_for_status()
     resolved_url = r.url
     filename = urlparse(resolved_url).path.split('/')[-1]
-    filepath = os.path.join(download_path, filename)
+    filepath = os.path.join(download_directory, filename)
     if os.path.exists(filepath):
         r.close()
-        return
+        return filepath
     with open(filepath, 'wb') as f:
         for chunk in r.iter_content(chunk_size=1024):
-            if chunk: # filter out keep-alive new chunks
+            if chunk:  # filter out keep-alive new chunks
                 f.write(chunk)
 
     return filepath
 
+
+def meeting_had_no_participants(client: ZoomClient, meeting: Dict) -> bool:
+    """Return True if every participant name matches `ZOOM_IGNORE_USER_NAMES`."""
+    response = client.past_meeting.get_participants(meeting_id=meeting['uuid'])
+    attendees = ZoomError.parse_or_raise(response)['participants']
+    return all(
+        any(pattern.search(person['name']) for pattern in ZOOM_IGNORE_USER_NAMES)
+        for person in attendees)
+
+
+def video_has_audio(file_path: str) -> bool:
+    """Detect whether a video file has a non-silent audio track."""
+    result = subprocess.run([
+        'ffmpeg',
+        '-i', file_path,
+        # The `ebur128=peak` looks for the peak loudness level of the audio.
+        '-af', 'ebur128=peak=true',
+        '-f', 'null',
+        '-'
+    ], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+    # No audio track. Recent ffmpeg prints the size as `KiB`, older ones as `kB`.
+    if re.search(rb'audio:0ki?b', result.stdout.lower()):
+        return False
+
+    # Silent audio. Note that this won't handle things like the low hiss of an
+    # empty room, which will report some low decibel level instead of `-inf`.
+    # In practice, this covers Zoom recordings where a mic was never turned on.
+    # Docs: https://ffmpeg.org/ffmpeg-filters.html#ebur128-1
+    if re.search(rb'Peak:\s+-inf', result.stdout):
+        return False
+
+    return True
+
+
 def main():
     if DRY_RUN:
         print('⚠️ This is a dry run! Videos will not actually be uploaded.\n')
@@ -93,96 +173,108 @@ def main():
         print('Please use `python scripts/auth.py` to re-authorize.')
         return sys.exit(1)
 
+    zoom = ZoomClient(ZOOM_CLIENT_ID, ZOOM_CLIENT_SECRET, ZOOM_ACCOUNT_ID)
+
+    # Official meeting recordings we will upload belong to the account owner.
+    zoom_user_id = ZoomError.parse_or_raise(zoom.user.list(role_id=ZOOM_ROLES['owner']))['users'][0]['id']
+
     with tempfile.TemporaryDirectory() as tmpdirname:
-        print('Creating tmp dir: ' + tmpdirname)
-        meetings = client.recording.list(user_id=user_id).json()['meetings']
+        print(f'Creating tmp dir: {tmpdirname}\n')
+
+        meetings = ZoomError.parse_or_raise(zoom.recording.list(user_id=zoom_user_id))['meetings']
         meetings = sorted(meetings, key=lambda m: m['start_time'])
         # Filter recordings less than 1 minute
         meetings = filter(lambda m: m['duration'] > 1, meetings)
         for meeting in meetings:
-            print(f'Processing meeting: {meeting["topic"]} from {meeting["start_time"]}')
+            print(f'Processing meeting: {meeting["topic"]} from {meeting["start_time"]} (ID: "{meeting["uuid"]}")')
+
             # 3. filter by criteria (no-op for now)
             if meeting['topic'] not in MEETINGS_TO_RECORD and DO_FILTER:
-                print('  Skipping...')
+                print('  Skipping: meeting not in topic list.')
+                continue
+
+            if meeting_had_no_participants(zoom, meeting):
+                print('  Deleting recording: nobody attended this meeting.')
+                if not DRY_RUN:
+                    response = zoom.recording.delete(meeting_id=meeting['uuid'], action='trash')
+                    if response.status_code < 300:
+                        print('  🗑️ Deleted recording.')
+                    else:
+                        print(f'  ❌ {ZoomError(response)}')
                 continue
 
             videos = [file for file in meeting['recording_files']
                       if file['file_type'].lower() == 'mp4']
 
             if len(videos) == 0:
-                print(f'  No videos to upload: {meeting["topic"]}')
+                print('  🔹 Skipping: no videos for meeting')
                 continue
             elif any((file['file_size'] == 0 for file in videos)):
-                print(f'  Meeting still processing: {meeting["topic"]}')
+                print('  🔹 Skipping: meeting still processing')
                 continue
 
-            print('  Recording is permitted for upload!')
+            print(f'  {len(videos)} videos to upload...')
             for file in videos:
                 url = file['download_url']
-                print(f'  Download from {url}...')
-                # Note the token info in the client isn't really *public*, but it's
-                # not explicitly private, either. Use `config[]` syntax instead of
-                # `config.get()` so we get an exception if things have changed and
-                # this data is no longer available.
-                filepath = download_file(url,
-                                         tmpdirname,
-                                         query={"access_token": client.config["token"]})
-
-                recording_date = fix_date(meeting['start_time'])
-                title = f'{meeting["topic"]} - {pretty_date(meeting["start_time"])}'
-
-                print(f'  Uploading {filepath}\n    {title=}\n    {recording_date=}')
-                if not DRY_RUN:
-                    video_id = upload_video(youtube,
-                                            filepath,
-                                            title=title,
-                                            category=VIDEO_CATEGORY_IDS["Science & Technology"],
-                                            license=DEFAULT_VIDEO_LICENSE,
-                                            recording_date=recording_date,
-                                            privacy_status='unlisted')
-
-                # Add all videos to default playlist
-                print('  Adding to main playlist: Uploads from Zoom')
-                if not DRY_RUN:
-                    add_video_to_playlist(youtube, video_id, title=DEFAULT_YOUTUBE_PLAYLIST, privacy='unlisted')
+                print(f'    Download from {url}...')
+                filepath = download_zoom_file(zoom, url, tmpdirname)
 
-                # Add to additional playlists
-                playlist_name = ''
-                if any(x in meeting['topic'].lower() for x in ['web mon', 'website monitoring', 'wm']):
-                    playlist_name = 'Website Monitoring'
+                if video_has_audio(filepath):
+                    recording_date = fix_date(meeting['start_time'])
+                    title = f'{meeting["topic"]} - {pretty_date(meeting["start_time"])}'
 
-                if 'data together' in meeting['topic'].lower():
-                    playlist_name = 'Data Together'
+                    print(f'    Uploading {filepath}\n      {title=}\n      {recording_date=}')
+                    if not DRY_RUN:
+                        video_id = upload_video(youtube,
+                                                filepath,
+                                                title=title,
+                                                category=VIDEO_CATEGORY_IDS["Science & Technology"],
+                                                license=DEFAULT_VIDEO_LICENSE,
+                                                recording_date=recording_date,
+                                                privacy_status='unlisted')
 
-                if 'community call' in meeting['topic'].lower():
-                    playlist_name = 'Community Calls'
+                    # Add all videos to default playlist
+                    print('    Adding to main playlist: Uploads from Zoom')
+                    if not DRY_RUN:
+                        add_video_to_playlist(youtube, video_id, title=DEFAULT_YOUTUBE_PLAYLIST, privacy='unlisted')
 
-                if 'edgi introductions' in meeting['topic'].lower():
-                    playlist_name = 'EDGI Introductions'
+                    # Add to additional playlists
+                    playlist_name = ''
+                    if any(x in meeting['topic'].lower() for x in ['web mon', 'website monitoring', 'wm']):
+                        playlist_name = 'Website Monitoring'
 
-                if 'all-edgi' in meeting['topic'].lower():
-                    playlist_name = 'All-EDGI Meetings'
+                    if 'data together' in meeting['topic'].lower():
+                        playlist_name = 'Data Together'
 
-                if playlist_name:
-                    print(f'  Adding to call playlist: {playlist_name}')
-                    if not DRY_RUN:
-                        add_video_to_playlist(youtube, video_id, title=playlist_name, privacy='unlisted')
+                    if 'community call' in meeting['topic'].lower():
+                        playlist_name = 'Community Calls'
+
+                    if 'edgi introductions' in meeting['topic'].lower():
+                        playlist_name = 'EDGI Introductions'
+
+                    if 'all-edgi' in meeting['topic'].lower():
+                        playlist_name = 'All-EDGI Meetings'
+
+                    if playlist_name:
+                        print(f'    Adding to call playlist: {playlist_name}')
+                        if not DRY_RUN:
+                            add_video_to_playlist(youtube, video_id, title=playlist_name, privacy='unlisted')
+
+                    # TODO: save the chat log transcript in a comment on the video.
+                else:
+                    print('    Skipping upload: video was silent (no mics were on).')
 
                 if ZOOM_DELETE_AFTER_UPLOAD and not DRY_RUN:
                     # Just delete the video for now, since that takes the most storage space.
-                    # We should save the chat log transcript in a comment on the video.
-
-                    # We're using the zoom api directly instead of zoomus, because zoomus only implements
-                    # deleting all recorded files related to the meeting using the v2 API,
-                    # while we still want to retain the audio and chat files for backup.
-                    url = f'https://api.zoom.us/v2/meetings/{file["meeting_id"]}/recordings/{file["id"]}'
-                    querystring = {"action":"trash"}
-                    headers = {'authorization': f'Bearer {client.config["token"]}'}
-                    response = requests.request("DELETE", url, headers=headers, params=querystring)
+                    response = zoom.recording.delete_single_recording(
+                        meeting_id=file['meeting_id'],
+                        recording_id=file['id'],
+                        action='trash'
+                    )
                     if response.status_code == 204:
-                        print(f'  Deleted {file["file_type"]} file from Zoom for recording: {meeting["topic"]}')
+                        print(f'  🗑️ Deleted {file["file_type"]} file from Zoom for recording: {meeting["topic"]}')
                     else:
-                        print(f'  The file could not be deleted. We received this response: {response.status_code}. Please check https://marketplace.zoom.us/docs/api-reference/zoom-api/cloud-recording/recordingdeleteone for what that could mean.')
+                        print(f'  ❌ {ZoomError(response)}')
 
 
 if __name__ == '__main__':