Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,23 @@
See https://docs.livekit.io/agents/integrations/stt/deepgram/ for more information.
"""

from .models import DeepgramLanguages, DeepgramModels, TTSModels
from .stt import STT, SpeechStream
from .stt_v2 import SpeechStreamv2, STTv2
from .tts import TTS
from .version import __version__

__all__ = ["STT", "SpeechStream", "STTv2", "SpeechStreamv2", "__version__", "TTS"]
__all__ = [
"STT",
"SpeechStream",
"STTv2",
"SpeechStreamv2",
"TTS",
"DeepgramModels",
"DeepgramLanguages",
"TTSModels",
"__version__",
]


from livekit.agents import Plugin
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ def _to_deepgram_url(opts: dict, base_url: str, *, websocket: bool) -> str:
opts["keywords"] = [
f"{keyword}:{intensifier}" for (keyword, intensifier) in opts["keywords"]
]
if opts.get("replace"):
# convert replace dict to a list of "term:replacement"
# https://developers.deepgram.com/reference/speech-to-text/listen-streaming#query-replace
opts["replace"] = [f"{term}:{replacement}" for term, replacement in opts["replace"].items()]

# lowercase bools
opts = {k: str(v).lower() if isinstance(v, bool) else v for k, v in opts.items()}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@
"nova-2-medical",
"nova-2-drivethru",
"nova-2-automotive",
"nova-2-atc",
"nova-3",
"nova-3-general",
"nova-3-medical",
"nova-3-multilingual",
"enhanced-general",
"enhanced-meeting",
"enhanced-phonecall",
Expand All @@ -33,10 +35,12 @@
"whisper-small",
"whisper-medium",
"whisper-large",
"flux-general-en",
]

V2Models = Literal["flux-general-en"]

# https://developers.deepgram.com/docs/models-languages-overview
DeepgramLanguages = Literal[
"zh",
"zh-CN",
Expand All @@ -54,30 +58,87 @@
"de",
"hi",
"hi-Latn",
"pt",
"pt-BR",
"es",
"es-419",
"hi",
"hi-Latn",
"id",
"it",
"ja",
"ko",
"no",
"pl",
"pt",
"pt-BR",
"ru",
"es",
"es-419",
"es-LATAM",
"sv",
"ta",
"taq",
"uk",
"tr",
"sv",
"id",
"pt",
"pt-BR",
"ru",
"th",
"tr",
"uk",
"multi",
]

# Voice identifiers accepted by the Deepgram text-to-speech API.
# Naming scheme is "<family>-<voice>-<language code>"; only English ("-en")
# voices are listed here.
# https://developers.deepgram.com/docs/tts-models
TTSModels = Literal[
    # Aura-2 English voices (current generation)
    "aura-2-andromeda-en",
    "aura-2-apollo-en",
    "aura-2-arcas-en",
    "aura-2-aries-en",
    "aura-2-artemis-en",
    "aura-2-asteria-en",
    "aura-2-atlas-en",
    "aura-2-aurora-en",
    "aura-2-callisto-en",
    "aura-2-cetus-en",
    "aura-2-chiron-en",
    "aura-2-columbia-en",
    "aura-2-cordelia-en",
    "aura-2-crina-en",
    "aura-2-draco-en",
    "aura-2-electra-en",
    "aura-2-eos-en",
    "aura-2-harmonia-en",
    "aura-2-helios-en",
    "aura-2-hera-en",
    "aura-2-hermes-en",
    "aura-2-hyperion-en",
    "aura-2-io-en",
    "aura-2-iris-en",
    "aura-2-janus-en",
    "aura-2-juno-en",
    "aura-2-jupiter-en",
    "aura-2-luna-en",
    "aura-2-mars-en",
    "aura-2-minerva-en",
    "aura-2-mira-en",
    "aura-2-neptune-en",
    "aura-2-odysseus-en",
    "aura-2-ophiuchus-en",
    "aura-2-orion-en",
    "aura-2-orpheus-en",
    "aura-2-phoebe-en",
    "aura-2-pluto-en",
    "aura-2-saturn-en",
    "aura-2-selene-en",
    "aura-2-theia-en",
    "aura-2-titan-en",
    "aura-2-triton-en",
    "aura-2-vega-en",
    "aura-2-venus-en",
    "aura-2-zeus-en",
    # Aura-1 English voices (legacy generation, kept for compatibility)
    "aura-asteria-en",
    "aura-luna-en",
    "aura-stella-en",
    "aura-athena-en",
    "aura-hera-en",
    "aura-orion-en",
    "aura-arcas-en",
    "aura-perseus-en",
    "aura-angus-en",
    "aura-orpheus-en",
    "aura-helios-en",
    "aura-zeus-en",
]
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ class STTOptions:
numerals: bool = False
mip_opt_out: bool = False
tags: NotGivenOr[list[str]] = NOT_GIVEN
utterance_end_ms: int | None = None
dictation: bool = False
redact: list[str] | None = None
replace: dict[str, str] | None = None
search: list[str] | None = None


class STT(stt.STT):
Expand Down Expand Up @@ -100,6 +105,11 @@ def __init__(
numerals: bool = False,
mip_opt_out: bool = False,
vad_events: bool = True,
utterance_end_ms: int | None = None,
dictation: bool = False,
redact: list[str] | None = None,
replace: dict[str, str] | None = None,
search: list[str] | None = None,
# deprecated
keyterms: NotGivenOr[list[str]] = NOT_GIVEN,
) -> None:
Expand Down Expand Up @@ -130,6 +140,21 @@ def __init__(
mip_opt_out: Whether to take part in the model improvement program
vad_events: Whether to enable VAD (Voice Activity Detection) events.
When enabled, SpeechStarted events are sent when speech is detected. Defaults to True.
utterance_end_ms: Duration of silence in milliseconds to detect the end of an utterance
and emit an UtteranceEnd event. Requires interim_results=True.
See https://developers.deepgram.com/docs/understand-endpointing-interim-results
dictation: Whether to enable dictation mode which converts spoken punctuation commands
(e.g. "comma", "period") into punctuation marks. Defaults to False.
See https://developers.deepgram.com/reference/speech-to-text/listen-streaming#query-dictation
redact: List of sensitive information types to redact from the transcript
(e.g. ["pci", "pii", "numbers", "ssn"]).
See https://developers.deepgram.com/reference/speech-to-text/listen-streaming#query-redact
replace: Dictionary of terms to replace in the transcript, where keys are the original
terms and values are the replacements (e.g. {"hello": "hi"}).
See https://developers.deepgram.com/reference/speech-to-text/listen-streaming#query-replace
search: List of terms to search for in the transcript. Matched terms are returned with
confidence scores in the response.
See https://developers.deepgram.com/reference/speech-to-text/listen-streaming#query-search

Raises:
ValueError: If no API key is provided or found in environment variables.
Expand Down Expand Up @@ -185,6 +210,11 @@ def __init__(
vad_events=vad_events,
tags=_validate_tags(tags) if is_given(tags) else [],
endpoint_url=base_url,
utterance_end_ms=utterance_end_ms,
dictation=dictation,
redact=redact,
replace=replace,
search=search,
)
self._session = http_session
self._streams = weakref.WeakSet[SpeechStream]()
Expand Down Expand Up @@ -298,6 +328,11 @@ def update_options(
vad_events: NotGivenOr[bool] = NOT_GIVEN,
tags: NotGivenOr[list[str]] = NOT_GIVEN,
endpoint_url: NotGivenOr[str] = NOT_GIVEN,
utterance_end_ms: NotGivenOr[int | None] = NOT_GIVEN,
dictation: NotGivenOr[bool] = NOT_GIVEN,
redact: NotGivenOr[list[str] | None] = NOT_GIVEN,
replace: NotGivenOr[dict[str, str] | None] = NOT_GIVEN,
search: NotGivenOr[list[str] | None] = NOT_GIVEN,
# deprecated
keyterms: NotGivenOr[list[str]] = NOT_GIVEN,
) -> None:
Expand Down Expand Up @@ -342,6 +377,16 @@ def update_options(
self._opts.tags = _validate_tags(tags)
if is_given(endpoint_url):
self._opts.endpoint_url = endpoint_url
if is_given(utterance_end_ms):
self._opts.utterance_end_ms = utterance_end_ms
if is_given(dictation):
self._opts.dictation = dictation
if is_given(redact):
self._opts.redact = redact
if is_given(replace):
self._opts.replace = replace
if is_given(search):
self._opts.search = search

for stream in self._streams:
stream.update_options(
Expand All @@ -361,6 +406,11 @@ def update_options(
mip_opt_out=mip_opt_out,
vad_events=vad_events,
endpoint_url=endpoint_url,
utterance_end_ms=utterance_end_ms,
dictation=dictation,
redact=redact,
replace=replace,
search=search,
)

def _sanitize_options(
Expand Down Expand Up @@ -432,6 +482,11 @@ def update_options(
vad_events: NotGivenOr[bool] = NOT_GIVEN,
tags: NotGivenOr[list[str]] = NOT_GIVEN,
endpoint_url: NotGivenOr[str] = NOT_GIVEN,
utterance_end_ms: NotGivenOr[int | None] = NOT_GIVEN,
dictation: NotGivenOr[bool] = NOT_GIVEN,
redact: NotGivenOr[list[str] | None] = NOT_GIVEN,
replace: NotGivenOr[dict[str, str] | None] = NOT_GIVEN,
search: NotGivenOr[list[str] | None] = NOT_GIVEN,
# deprecated
keyterms: NotGivenOr[list[str]] = NOT_GIVEN,
) -> None:
Expand Down Expand Up @@ -476,6 +531,16 @@ def update_options(
self._opts.tags = _validate_tags(tags)
if is_given(endpoint_url):
self._opts.endpoint_url = endpoint_url
if is_given(utterance_end_ms):
self._opts.utterance_end_ms = utterance_end_ms
if is_given(dictation):
self._opts.dictation = dictation
if is_given(redact):
self._opts.redact = redact
if is_given(replace):
self._opts.replace = replace
if is_given(search):
self._opts.search = search

self._reconnect_event.set()

Expand Down Expand Up @@ -617,6 +682,16 @@ async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
live_config["keywords"] = self._opts.keywords
if self._opts.keyterm:
live_config["keyterm"] = self._opts.keyterm
if self._opts.utterance_end_ms is not None:
live_config["utterance_end_ms"] = self._opts.utterance_end_ms
if self._opts.dictation:
live_config["dictation"] = True
if self._opts.redact:
live_config["redact"] = self._opts.redact
if self._opts.replace:
live_config["replace"] = self._opts.replace
if self._opts.search:
live_config["search"] = self._opts.search

if self._opts.language:
live_config["language"] = self._opts.language
Expand Down Expand Up @@ -716,6 +791,12 @@ def _process_stream_event(self, data: dict) -> None:
self._speaking = False
self._event_ch.send_nowait(stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH))

elif data["type"] == "UtteranceEnd":
# Fired when utterance_end_ms is set and the configured silence duration has elapsed.
# https://developers.deepgram.com/docs/understand-endpointing-interim-results
if self._speaking:
self._speaking = False
self._event_ch.send_nowait(stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH))
elif data["type"] == "Metadata":
pass # metadata is too noisy
else:
Expand Down
Loading
Loading