Merge pull request #67 from JigsawStack/feat/stt-language

winzamark123 · web-flow · commit e7cbf4d3194a · 2025-09-21T18:24:10.000-07:00
diff --git a/jigsawstack/audio.py b/jigsawstack/audio.py
@@ -10,13 +10,44 @@
 
 class SpeechToTextParams(TypedDict):
     url: NotRequired[str]
+    """
+    the url of the audio file to transcribe, optional if file_store_key is provided
+    """
+
     file_store_key: NotRequired[str]
+    """
+    the file store key of the audio file to transcribe, optional if url is provided
+    """
+
     language: NotRequired[Union[str, Literal["auto"]]]
+    """
+    The language to transcribe or translate the file into. Use "auto" for automatic language detection, or specify a language code. If not specified, defaults to automatic detection. All supported language codes can be found in the JigsawStack speech-to-text API documentation.
+    """
+
     translate: NotRequired[bool]
+    """
+    When set to true, translates the content into English (or into the specified language if the language parameter is provided).
+    """
+
     by_speaker: NotRequired[bool]
+    """
+    Identifies and separates different speakers in the audio file. When enabled, the response will include a speakers array with speaker-segmented transcripts.
+    """
+
     webhook_url: NotRequired[str]
+    """
+    Webhook URL to send result to. When provided, the API will process asynchronously and send results to this URL when completed.
+    """
+
     batch_size: NotRequired[int]
+    """
+    The batch size to return. Maximum value is 40. This controls how the audio is chunked for processing.
+    """
+
     chunk_duration: NotRequired[int]
+    """
+    the duration of each chunk in seconds, maximum value is 15, defaults to 3
+    """
 
 
 class ChunkParams(TypedDict):
@@ -32,8 +63,29 @@ class BySpeakerParams(ChunkParams):
 
 class SpeechToTextResponse(BaseResponse):
     text: str
+    """
+    the text of the transcription
+    """
+
     chunks: List[ChunkParams]
+    """
+    the chunks of the transcription
+    """
+
     speakers: Optional[List[BySpeakerParams]]
+    """
+    the speakers of the transcription, available if by_speaker is set to true
+    """
+
+    language_detected: Optional[str]
+    """
+    the language detected in the transcription, available if language is set to auto
+    """
+
+    confidence: Optional[float]
+    """
+    the confidence of the transcription language detection, available if language is set to auto
+    """
 
 
 class SpeechToTextWebhookResponse(BaseResponse):