From 5921220bc680f71245a7acfa00052672d47ea459 Mon Sep 17 00:00:00 2001 From: Ankur Duggal <38927181+ankykong@users.noreply.github.com> Date: Fri, 9 Aug 2024 10:10:23 -0700 Subject: [PATCH 1/4] Mute During Speech Implementation --- src/pipecat/services/deepgram.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py index bbe5d300..7f513b1a 100644 --- a/src/pipecat/services/deepgram.py +++ b/src/pipecat/services/deepgram.py @@ -18,7 +18,9 @@ MetricsFrame, StartFrame, SystemFrame, - TranscriptionFrame) + TranscriptionFrame, + BotStartedSpeakingFrame, + BotStoppedSpeakingFrame,) from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import AsyncAIService, TTSService from pipecat.utils.time import time_now_iso8601 @@ -122,6 +124,7 @@ def __init__(self, interim_results=True, smart_format=True, ), + mute_during_speech=False, **kwargs): super().__init__(**kwargs) @@ -131,14 +134,28 @@ def __init__(self, api_key, config=DeepgramClientOptions(url=url, options={"keepalive": "true"})) self._connection = self._client.listen.asynclive.v("1") self._connection.on(LiveTranscriptionEvents.Transcript, self._on_message) + self.mute_during_speech = mute_during_speech + self.bot_speaking = False async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) + # print(f"Is Frame BotStartedSpeakingFrame: {isinstance(Frame, BotStartedSpeakingFrame)}") + # print(f"Frame: {frame}") + if isinstance(frame, BotStartedSpeakingFrame): + print("Bot Speaking") + self.bot_speaking = True + elif isinstance(frame, BotStoppedSpeakingFrame): + print("Bot Stopped Speaking") + self.bot_speaking = False if isinstance(frame, SystemFrame): await self.push_frame(frame, direction) elif isinstance(frame, AudioRawFrame): - await self._connection.send(frame.audio) + # print(f"AUDIO RAW FRAME: {frame}") + if not 
(self.mute_during_speech and self.bot_speaking): + await self._connection.send(frame.audio) + else: + print("Bot Speaking") else: await self.queue_frame(frame, direction) @@ -158,6 +175,7 @@ async def cancel(self, frame: CancelFrame): await self._connection.finish() async def _on_message(self, *args, **kwargs): + # print(f"ON MESSAGE: {args}, {kwargs}") result = kwargs["result"] is_final = result.is_final transcript = result.channel.alternatives[0].transcript From fa188e5e33262ffcce37c57ba4d897982b168931 Mon Sep 17 00:00:00 2001 From: Ankur Duggal <38927181+ankykong@users.noreply.github.com> Date: Tue, 20 Aug 2024 19:27:54 -0400 Subject: [PATCH 2/4] Changelog Update --- CHANGELOG.md | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f285bc01..0ac451b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,45 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- `AudioRawFrame`s are not pushed downstream from the base output +### Changed + +- `DeepgramSTTService` now has a mute during speech functionality. When the bot is +speaking, the user's audio is not transcribed and is thereby essentially muted. + +### Fixed + +### Other + +## [0.0.40] - 2024-08-20 + +### Added + +- VAD parameters can now be dynamically updated using the + `VADParamsUpdateFrame`. + +- `ErrorFrame` has now a `fatal` field to indicate the bot should exit if a + fatal error is pushed upstream (false by default). A new `FatalErrorFrame` + that sets this flag to true has been added. + +- `AnthropicLLMService` now supports function calling and initial support for + prompt caching. + (see https://www.anthropic.com/news/prompt-caching) + +- `ElevenLabsTTSService` can now specify ElevenLabs input parameters such as + `output_format`. + +- `TwilioFrameSerializer` can now specify Twilio's and Pipecat's desired sample + rates to use. 
+
+- Added new `on_participant_updated` event to `DailyTransport`.
+
+- Added `DailyRESTHelper.delete_room_by_name()` and
+  `DailyRESTHelper.delete_room_by_url()`.
+
+- Added LLM and TTS usage metrics. Those are enabled when
+  `PipelineParams.enable_usage_metrics` is True.
+
+- `AudioRawFrame`s are now pushed downstream from the base output transport. This allows capturing the exact words the bot says by adding an STT service at the end of the pipeline. @@ -28,6 +66,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed +- Support RTVI message protocol 0.1. This includes new messages, support for + messages responses, support for actions, configuration, webhooks and a bunch + of new cool stuff. + (see https://docs.rtvi.ai/) + +- `SileroVAD` dependency is now imported via pip's `silero-vad` package. + +- `ElevenLabsTTSService` now uses `eleven_turbo_v2_5` model by default. + - `BotSpeakingFrame` is now a control frame. - `StartFrame` is now a control frame similar to `EndFrame`. @@ -37,6 +84,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- `TTSStartFrame` and `TTSStopFrame` are now sent when TTS really starts and + stops. This allows for knowing when the bot starts and stops speaking even + with asynchronous services (like Cartesia). + +- Fixed `AzureSTTService` transcription frame timestamps. + +- Fixed an issue with `DailyRESTHelper.create_room()` expirations which would + cause this function to stop working after the initial expiration elapsed. + - Improved `EndFrame` and `CancelFrame` handling. `EndFrame` should end things gracefully while a `CancelFrame` should cancel all running tasks as soon as possible. @@ -55,6 +111,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Other +- Added `studypal` example (from the Cartesia folks!). +
+- Most examples now use Cartesia. 
+ +- Added examples `foundational/19a-tools-anthropic.py`, + `foundational/19b-tools-video-anthropic.py` and + `foundational/19a-tools-togetherai.py`. + - Added examples `foundational/18-gstreamer-filesrc.py` and `foundational/18a-gstreamer-videotestsrc.py` that show how to use `GStreamerPipelineSource` @@ -830,4 +894,4 @@ a bit. ## [0.0.2] - 2024-03-12 -Initial public release. +Initial public release. \ No newline at end of file From 01361951bbc05176ff46443c3343539fd74753bb Mon Sep 17 00:00:00 2001 From: Ankur Duggal <38927181+ankykong@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:33:43 -0400 Subject: [PATCH 3/4] Fixing Typo --- src/pipecat/services/deepgram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py index 934afb87..09474792 100644 --- a/src/pipecat/services/deepgram.py +++ b/src/pipecat/services/deepgram.py @@ -18,7 +18,7 @@ StartFrame, SystemFrame, BotStartedSpeakingFrame, - BotStoppedSpeakingFrame,) + BotStoppedSpeakingFrame, TTSStartedFrame, TTSStoppedFrame, TranscriptionFrame) From 634e1da45b470241df7c0905a45e235669cb0b56 Mon Sep 17 00:00:00 2001 From: adburpitt Date: Wed, 21 Aug 2024 12:09:24 -0700 Subject: [PATCH 4/4] format imports --- src/pipecat/services/deepgram.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py index 09474792..c18d9939 100644 --- a/src/pipecat/services/deepgram.py +++ b/src/pipecat/services/deepgram.py @@ -4,12 +4,15 @@ # SPDX-License-Identifier: BSD 2-Clause License # -import aiohttp - from typing import AsyncGenerator +import aiohttp +from loguru import logger + from pipecat.frames.frames import ( AudioRawFrame, + BotStartedSpeakingFrame, + BotStoppedSpeakingFrame, CancelFrame, EndFrame, ErrorFrame, @@ -17,25 +20,21 @@ InterimTranscriptionFrame, StartFrame, SystemFrame, - BotStartedSpeakingFrame, - BotStoppedSpeakingFrame, + 
TranscriptionFrame, TTSStartedFrame, TTSStoppedFrame, - TranscriptionFrame) +) from pipecat.processors.frame_processor import FrameDirection from pipecat.services.ai_services import AsyncAIService, TTSService from pipecat.utils.time import time_now_iso8601 -from loguru import logger - - # See .env.example for Deepgram configuration needed try: from deepgram import ( DeepgramClient, DeepgramClientOptions, - LiveTranscriptionEvents, LiveOptions, + LiveTranscriptionEvents, ) except ModuleNotFoundError as e: logger.error(f"Exception: {e}")