Skip to content

Commit c7018fc

Browse files
committed
fix(deepgram): sync missing STT/TTS API params, fix duplicate languages, add TTS model list
**models.py**
- Add `nova-2-atc`, `nova-3-multilingual` to `DeepgramModels`
  Ref: https://developers.deepgram.com/docs/models-languages-overview
- Promote `flux-general-en` into `DeepgramModels` (was only in `V2Models`)
- Remove duplicate entries in `DeepgramLanguages` (hi, pt, pt-BR, sv each appeared multiple times due to copy-paste artifact)
- Add `TTSModels` Literal with all Aura-2 and Aura-1 voice names
  Ref: https://developers.deepgram.com/docs/tts-models

**stt.py**
- Add `utterance_end_ms` (int | None): silence duration to emit UtteranceEnd; requires interim_results=True
  Ref: https://developers.deepgram.com/reference/speech-to-text/listen-streaming#query-utterance_end_ms
- Add `dictation` (bool): converts spoken punctuation commands into marks
  Ref: https://developers.deepgram.com/reference/speech-to-text/listen-streaming#query-dictation
- Add `redact` (list[str]): redact PCI/PII/SSN from transcripts
  Ref: https://developers.deepgram.com/reference/speech-to-text/listen-streaming#query-redact
- Add `replace` (dict[str, str]): term replacement in transcripts
  Ref: https://developers.deepgram.com/reference/speech-to-text/listen-streaming#query-replace
- Add `search` (list[str]): highlight search terms with confidence scores
  Ref: https://developers.deepgram.com/reference/speech-to-text/listen-streaming#query-search
- Handle `UtteranceEnd` event in `_process_stream_event` to emit END_OF_SPEECH

**tts.py**
- Add `bit_rate` (int | None): for compressed encodings (e.g. mp3)
  Ref: https://developers.deepgram.com/reference/text-to-speech-api#query-bit_rate
- Use `TTSModels | str` as model type
- Expand `update_options` to include `encoding`, `sample_rate`, `bit_rate`

**_utils.py**
- Handle `replace` dict in `_to_deepgram_url` by encoding as "term:replacement" pairs, consistent with how `keywords` are encoded
1 parent ed82dbd commit c7018fc

5 files changed

Lines changed: 209 additions & 31 deletions

File tree

livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/__init__.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,23 @@
1919
See https://docs.livekit.io/agents/integrations/stt/deepgram/ for more information.
2020
"""
2121

22+
from .models import DeepgramModels, DeepgramLanguages, TTSModels
2223
from .stt import STT, SpeechStream
2324
from .stt_v2 import SpeechStreamv2, STTv2
2425
from .tts import TTS
2526
from .version import __version__
2627

27-
__all__ = ["STT", "SpeechStream", "STTv2", "SpeechStreamv2", "__version__", "TTS"]
28+
__all__ = [
29+
"STT",
30+
"SpeechStream",
31+
"STTv2",
32+
"SpeechStreamv2",
33+
"TTS",
34+
"DeepgramModels",
35+
"DeepgramLanguages",
36+
"TTSModels",
37+
"__version__",
38+
]
2839

2940

3041
from livekit.agents import Plugin

livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/_utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ def _to_deepgram_url(opts: dict, base_url: str, *, websocket: bool) -> str:
4646
opts["keywords"] = [
4747
f"{keyword}:{intensifier}" for (keyword, intensifier) in opts["keywords"]
4848
]
49+
if opts.get("replace"):
50+
# convert replace dict to a list of "term:replacement"
51+
# https://developers.deepgram.com/reference/speech-to-text/listen-streaming#query-replace
52+
opts["replace"] = [f"{term}:{replacement}" for term, replacement in opts["replace"].items()]
4953

5054
# lowercase bools
5155
opts = {k: str(v).lower() if isinstance(v, bool) else v for k, v in opts.items()}

livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/models.py

Lines changed: 74 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,11 @@
1414
"nova-2-medical",
1515
"nova-2-drivethru",
1616
"nova-2-automotive",
17+
"nova-2-atc",
1718
"nova-3",
1819
"nova-3-general",
1920
"nova-3-medical",
21+
"nova-3-multilingual",
2022
"enhanced-general",
2123
"enhanced-meeting",
2224
"enhanced-phonecall",
@@ -33,10 +35,12 @@
3335
"whisper-small",
3436
"whisper-medium",
3537
"whisper-large",
38+
"flux-general-en",
3639
]
3740

3841
V2Models = Literal["flux-general-en"]
3942

43+
# https://developers.deepgram.com/docs/models-languages-overview
4044
DeepgramLanguages = Literal[
4145
"zh",
4246
"zh-CN",
@@ -54,30 +58,87 @@
5458
"de",
5559
"hi",
5660
"hi-Latn",
57-
"pt",
58-
"pt-BR",
59-
"es",
60-
"es-419",
61-
"hi",
62-
"hi-Latn",
61+
"id",
6362
"it",
6463
"ja",
6564
"ko",
6665
"no",
6766
"pl",
6867
"pt",
6968
"pt-BR",
69+
"ru",
70+
"es",
71+
"es-419",
7072
"es-LATAM",
7173
"sv",
7274
"ta",
7375
"taq",
74-
"uk",
75-
"tr",
76-
"sv",
77-
"id",
78-
"pt",
79-
"pt-BR",
80-
"ru",
8176
"th",
77+
"tr",
78+
"uk",
8279
"multi",
8380
]
81+
82+
# https://developers.deepgram.com/docs/tts-models
83+
TTSModels = Literal[
84+
# Aura-2 English
85+
"aura-2-andromeda-en",
86+
"aura-2-apollo-en",
87+
"aura-2-arcas-en",
88+
"aura-2-aries-en",
89+
"aura-2-artemis-en",
90+
"aura-2-asteria-en",
91+
"aura-2-atlas-en",
92+
"aura-2-aurora-en",
93+
"aura-2-callisto-en",
94+
"aura-2-cetus-en",
95+
"aura-2-chiron-en",
96+
"aura-2-columbia-en",
97+
"aura-2-cordelia-en",
98+
"aura-2-crina-en",
99+
"aura-2-draco-en",
100+
"aura-2-electra-en",
101+
"aura-2-eos-en",
102+
"aura-2-harmonia-en",
103+
"aura-2-helios-en",
104+
"aura-2-hera-en",
105+
"aura-2-hermes-en",
106+
"aura-2-hyperion-en",
107+
"aura-2-io-en",
108+
"aura-2-iris-en",
109+
"aura-2-janus-en",
110+
"aura-2-juno-en",
111+
"aura-2-jupiter-en",
112+
"aura-2-luna-en",
113+
"aura-2-mars-en",
114+
"aura-2-minerva-en",
115+
"aura-2-mira-en",
116+
"aura-2-neptune-en",
117+
"aura-2-odysseus-en",
118+
"aura-2-ophiuchus-en",
119+
"aura-2-orion-en",
120+
"aura-2-orpheus-en",
121+
"aura-2-phoebe-en",
122+
"aura-2-pluto-en",
123+
"aura-2-saturn-en",
124+
"aura-2-selene-en",
125+
"aura-2-theia-en",
126+
"aura-2-titan-en",
127+
"aura-2-triton-en",
128+
"aura-2-vega-en",
129+
"aura-2-venus-en",
130+
"aura-2-zeus-en",
131+
# Aura-1 English (legacy)
132+
"aura-asteria-en",
133+
"aura-luna-en",
134+
"aura-stella-en",
135+
"aura-athena-en",
136+
"aura-hera-en",
137+
"aura-orion-en",
138+
"aura-arcas-en",
139+
"aura-perseus-en",
140+
"aura-angus-en",
141+
"aura-orpheus-en",
142+
"aura-helios-en",
143+
"aura-zeus-en",
144+
]

livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,11 @@ class STTOptions:
7272
numerals: bool = False
7373
mip_opt_out: bool = False
7474
tags: NotGivenOr[list[str]] = NOT_GIVEN
75+
utterance_end_ms: int | None = None
76+
dictation: bool = False
77+
redact: list[str] | None = None
78+
replace: dict[str, str] | None = None
79+
search: list[str] | None = None
7580

7681

7782
class STT(stt.STT):
@@ -100,6 +105,11 @@ def __init__(
100105
numerals: bool = False,
101106
mip_opt_out: bool = False,
102107
vad_events: bool = True,
108+
utterance_end_ms: int | None = None,
109+
dictation: bool = False,
110+
redact: list[str] | None = None,
111+
replace: dict[str, str] | None = None,
112+
search: list[str] | None = None,
103113
# deprecated
104114
keyterms: NotGivenOr[list[str]] = NOT_GIVEN,
105115
) -> None:
@@ -130,6 +140,21 @@ def __init__(
130140
mip_opt_out: Whether to take part in the model improvement program
131141
vad_events: Whether to enable VAD (Voice Activity Detection) events.
132142
When enabled, SpeechStarted events are sent when speech is detected. Defaults to True.
143+
utterance_end_ms: Duration of silence in milliseconds to detect the end of an utterance
144+
and emit an UtteranceEnd event. Requires interim_results=True.
145+
See https://developers.deepgram.com/docs/understand-endpointing-interim-results
146+
dictation: Whether to enable dictation mode which converts spoken punctuation commands
147+
(e.g. "comma", "period") into punctuation marks. Defaults to False.
148+
See https://developers.deepgram.com/reference/speech-to-text/listen-streaming#query-dictation
149+
redact: List of sensitive information types to redact from the transcript
150+
(e.g. ["pci", "pii", "numbers", "ssn"]).
151+
See https://developers.deepgram.com/reference/speech-to-text/listen-streaming#query-redact
152+
replace: Dictionary of terms to replace in the transcript, where keys are the original
153+
terms and values are the replacements (e.g. {"hello": "hi"}).
154+
See https://developers.deepgram.com/reference/speech-to-text/listen-streaming#query-replace
155+
search: List of terms to search for in the transcript. Matched terms are returned with
156+
confidence scores in the response.
157+
See https://developers.deepgram.com/reference/speech-to-text/listen-streaming#query-search
133158
134159
Raises:
135160
ValueError: If no API key is provided or found in environment variables.
@@ -185,6 +210,11 @@ def __init__(
185210
vad_events=vad_events,
186211
tags=_validate_tags(tags) if is_given(tags) else [],
187212
endpoint_url=base_url,
213+
utterance_end_ms=utterance_end_ms,
214+
dictation=dictation,
215+
redact=redact,
216+
replace=replace,
217+
search=search,
188218
)
189219
self._session = http_session
190220
self._streams = weakref.WeakSet[SpeechStream]()
@@ -298,6 +328,11 @@ def update_options(
298328
vad_events: NotGivenOr[bool] = NOT_GIVEN,
299329
tags: NotGivenOr[list[str]] = NOT_GIVEN,
300330
endpoint_url: NotGivenOr[str] = NOT_GIVEN,
331+
utterance_end_ms: NotGivenOr[int | None] = NOT_GIVEN,
332+
dictation: NotGivenOr[bool] = NOT_GIVEN,
333+
redact: NotGivenOr[list[str] | None] = NOT_GIVEN,
334+
replace: NotGivenOr[dict[str, str] | None] = NOT_GIVEN,
335+
search: NotGivenOr[list[str] | None] = NOT_GIVEN,
301336
# deprecated
302337
keyterms: NotGivenOr[list[str]] = NOT_GIVEN,
303338
) -> None:
@@ -342,6 +377,16 @@ def update_options(
342377
self._opts.tags = _validate_tags(tags)
343378
if is_given(endpoint_url):
344379
self._opts.endpoint_url = endpoint_url
380+
if is_given(utterance_end_ms):
381+
self._opts.utterance_end_ms = utterance_end_ms
382+
if is_given(dictation):
383+
self._opts.dictation = dictation
384+
if is_given(redact):
385+
self._opts.redact = redact
386+
if is_given(replace):
387+
self._opts.replace = replace
388+
if is_given(search):
389+
self._opts.search = search
345390

346391
for stream in self._streams:
347392
stream.update_options(
@@ -361,6 +406,11 @@ def update_options(
361406
mip_opt_out=mip_opt_out,
362407
vad_events=vad_events,
363408
endpoint_url=endpoint_url,
409+
utterance_end_ms=utterance_end_ms,
410+
dictation=dictation,
411+
redact=redact,
412+
replace=replace,
413+
search=search,
364414
)
365415

366416
def _sanitize_options(
@@ -432,6 +482,11 @@ def update_options(
432482
vad_events: NotGivenOr[bool] = NOT_GIVEN,
433483
tags: NotGivenOr[list[str]] = NOT_GIVEN,
434484
endpoint_url: NotGivenOr[str] = NOT_GIVEN,
485+
utterance_end_ms: NotGivenOr[int | None] = NOT_GIVEN,
486+
dictation: NotGivenOr[bool] = NOT_GIVEN,
487+
redact: NotGivenOr[list[str] | None] = NOT_GIVEN,
488+
replace: NotGivenOr[dict[str, str] | None] = NOT_GIVEN,
489+
search: NotGivenOr[list[str] | None] = NOT_GIVEN,
435490
# deprecated
436491
keyterms: NotGivenOr[list[str]] = NOT_GIVEN,
437492
) -> None:
@@ -476,6 +531,16 @@ def update_options(
476531
self._opts.tags = _validate_tags(tags)
477532
if is_given(endpoint_url):
478533
self._opts.endpoint_url = endpoint_url
534+
if is_given(utterance_end_ms):
535+
self._opts.utterance_end_ms = utterance_end_ms
536+
if is_given(dictation):
537+
self._opts.dictation = dictation
538+
if is_given(redact):
539+
self._opts.redact = redact
540+
if is_given(replace):
541+
self._opts.replace = replace
542+
if is_given(search):
543+
self._opts.search = search
479544

480545
self._reconnect_event.set()
481546

@@ -617,6 +682,16 @@ async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
617682
live_config["keywords"] = self._opts.keywords
618683
if self._opts.keyterm:
619684
live_config["keyterm"] = self._opts.keyterm
685+
if self._opts.utterance_end_ms is not None:
686+
live_config["utterance_end_ms"] = self._opts.utterance_end_ms
687+
if self._opts.dictation:
688+
live_config["dictation"] = True
689+
if self._opts.redact:
690+
live_config["redact"] = self._opts.redact
691+
if self._opts.replace:
692+
live_config["replace"] = self._opts.replace
693+
if self._opts.search:
694+
live_config["search"] = self._opts.search
620695

621696
if self._opts.language:
622697
live_config["language"] = self._opts.language
@@ -716,6 +791,12 @@ def _process_stream_event(self, data: dict) -> None:
716791
self._speaking = False
717792
self._event_ch.send_nowait(stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH))
718793

794+
elif data["type"] == "UtteranceEnd":
795+
# Fired when utterance_end_ms is set and the configured silence duration has elapsed.
796+
# https://developers.deepgram.com/docs/understand-endpointing-interim-results
797+
if self._speaking:
798+
self._speaking = False
799+
self._event_ch.send_nowait(stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH))
719800
elif data["type"] == "Metadata":
720801
pass # metadata is too noisy
721802
else:

0 commit comments

Comments (0)