Skip to content

Commit e7cbf4d

Browse files
authored
Merge pull request #67 from JigsawStack/feat/stt-language
2 parents b72cb12 + 0d71a8d commit e7cbf4d

File tree

1 file changed

+52
-0
lines changed

1 file changed

+52
-0
lines changed

jigsawstack/audio.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,44 @@
1010

1111
class SpeechToTextParams(TypedDict):
1212
url: NotRequired[str]
13+
"""
14+
The URL of the audio file to transcribe. Optional if file_store_key is provided.
15+
"""
16+
1317
file_store_key: NotRequired[str]
18+
"""
19+
The file store key of the audio file to transcribe. Optional if url is provided.
20+
"""
21+
1422
language: NotRequired[Union[str, Literal["auto"]]]
23+
"""
24+
The language to transcribe or translate the file into. Use "auto" for automatic language detection, or specify a language code. If not specified, defaults to automatic detection. All supported language codes can be found in the JigsawStack API documentation.
25+
"""
26+
1527
translate: NotRequired[bool]
28+
"""
29+
When set to true, translates the content into English (or into the specified language if the language parameter is provided).
30+
"""
31+
1632
by_speaker: NotRequired[bool]
33+
"""
34+
Identifies and separates different speakers in the audio file. When enabled, the response will include a speakers array with speaker-segmented transcripts.
35+
"""
36+
1737
webhook_url: NotRequired[str]
38+
"""
39+
Webhook URL to send result to. When provided, the API will process asynchronously and send results to this URL when completed.
40+
"""
41+
1842
batch_size: NotRequired[int]
43+
"""
44+
The batch size to return. Maximum value is 40. This controls how the audio is chunked for processing.
45+
"""
46+
1947
chunk_duration: NotRequired[int]
48+
"""
49+
The duration of each chunk in seconds. Maximum value is 15; defaults to 3.
50+
"""
2051

2152

2253
class ChunkParams(TypedDict):
@@ -32,8 +63,29 @@ class BySpeakerParams(ChunkParams):
3263

3364
class SpeechToTextResponse(BaseResponse):
3465
text: str
66+
"""
67+
the text of the transcription
68+
"""
69+
3570
chunks: List[ChunkParams]
71+
"""
72+
the chunks of the transcription
73+
"""
74+
3675
speakers: Optional[List[BySpeakerParams]]
76+
"""
77+
the speakers of the transcription, available if by_speaker is set to true
78+
"""
79+
80+
language_detected: Optional[str]
81+
"""
82+
the language detected in the transcription, available if language is set to auto
83+
"""
84+
85+
confidence: Optional[float]
86+
"""
87+
The confidence of the language detection. Available if language is set to auto.
88+
"""
3789

3890

3991
class SpeechToTextWebhookResponse(BaseResponse):

0 commit comments

Comments
 (0)