From a202f603a00389ec4330c4faeb58dd41c64153a8 Mon Sep 17 00:00:00 2001 From: Jiwon Kim Date: Mon, 5 Jan 2026 21:19:02 -0800 Subject: [PATCH 1/6] Add input.transcribe op and server.transcribe method --- chatkit/server.py | 17 +++++++++++++++ chatkit/types.py | 24 +++++++++++++++++++++ tests/test_chatkit_server.py | 42 ++++++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+) diff --git a/chatkit/server.py b/chatkit/server.py index 19e3563..8f8c004 100644 --- a/chatkit/server.py +++ b/chatkit/server.py @@ -1,4 +1,5 @@ import asyncio +import base64 from abc import ABC, abstractmethod from collections.abc import AsyncIterator from contextlib import contextmanager @@ -41,6 +42,7 @@ ErrorEvent, FeedbackKind, HiddenContextItem, + InputTranscribeReq, ItemsFeedbackReq, ItemsListReq, NonStreamingReq, @@ -69,6 +71,7 @@ ThreadStreamEvent, ThreadsUpdateReq, ThreadUpdatedEvent, + TranscriptionResult, UserMessageInput, UserMessageItem, WidgetComponentUpdated, @@ -319,6 +322,14 @@ async def add_feedback( # noqa: B027 """Persist user feedback for one or more thread items.""" pass + async def transcribe( # noqa: B027 + self, audio_bytes: bytes, mime_type: str, context: TContext + ) -> TranscriptionResult: + """Transcribe speech audio to text. Override this method to support dictation.""" + raise NotImplementedError( + "transcribe() must be overridden to support the input.transcribe request." + ) + def action( self, thread: ThreadMetadata, @@ -446,6 +457,12 @@ async def _process_non_streaming( request.params.attachment_id, context=context ) return b"{}" + case InputTranscribeReq(): + audio_bytes = base64.b64decode(request.params.audio_base64) + transcription_result = await self.transcribe( + audio_bytes, request.params.mime_type, context=context + ) + return self._serialize(transcription_result) case ItemsListReq(): items_list_params = request.params items = await self.store.load_thread_items( diff --git a/chatkit/types.py b/chatkit/types.py index 3280b13..9c9656c 100644 --- a/chatkit/types.py +++ b/chatkit/types.py @@ -174,6 +174,29 @@ class AttachmentCreateParams(BaseModel): mime_type: str +class InputTranscribeReq(BaseReq): + """Request to transcribe an audio payload into text.""" + + type: Literal["input.transcribe"] = "input.transcribe" + params: InputTranscribeParams + + +class InputTranscribeParams(BaseModel): + """Parameters for speech transcription.""" + + audio_base64: str + """Base64-encoded audio bytes.""" + + mime_type: str + """MIME type for the audio payload (e.g. 'audio/webm', 'audio/wav').""" + + +class TranscriptionResult(BaseModel): + """Input speech transcription result.""" + + text: str + + class ItemsListReq(BaseReq): """Request to list items inside a thread.""" @@ -236,6 +259,7 @@ class ThreadDeleteParams(BaseModel): | AttachmentsDeleteReq | ThreadsUpdateReq | ThreadsDeleteReq + | InputTranscribeReq ) """Union of request types that yield immediate responses.""" diff --git a/tests/test_chatkit_server.py b/tests/test_chatkit_server.py index 750f7c2..066ffae 100644 --- a/tests/test_chatkit_server.py +++ b/tests/test_chatkit_server.py @@ -1,4 +1,5 @@ import asyncio +import base64 import sqlite3 from contextlib import contextmanager from datetime import datetime @@ -38,6 +39,8 @@ FileAttachment, ImageAttachment, InferenceOptions, + InputTranscribeParams, + InputTranscribeReq, ItemFeedbackParams, ItemsFeedbackReq, ItemsListParams, @@ -75,6 +78,7 @@ ThreadUpdatedEvent, ThreadUpdateParams, ToolChoice, + TranscriptionResult, UserMessageInput, UserMessageItem, UserMessageTextContent, @@ -159,6 +163,7 @@ def make_server( ] | None = None, file_store: AttachmentStore | None = None, + transcribe_callback: Callable[[bytes, str, Any], TranscriptionResult] | None = None, ): global server_id db_path = f"file:{server_id}?mode=memory&cache=shared" @@ -206,6 +211,13 @@ async def add_feedback( return handle_feedback(thread_id, item_ids, feedback, context) + async def transcribe( + self, audio_bytes: bytes, mime_type: str, context: Any + ) -> TranscriptionResult: + if transcribe_callback is None: + return await super().transcribe(audio_bytes, mime_type, context) + return transcribe_callback(audio_bytes, mime_type, context) + async def process_streaming( self, request_obj, context: Any | None = None ) -> list[ThreadStreamEvent]: @@ -1887,6 +1899,36 @@ async def responder( assert any(e.type == "thread.item.done" for e in events) +async def test_input_transcribe_decodes_base64_and_passes_mime_type(): + audio_bytes = b"hello audio" + audio_b64 = base64.b64encode(audio_bytes).decode("ascii") + seen: dict[str, Any] = {} + + def transcribe_callback( + audio: bytes, mime: str, context: Any + ) -> TranscriptionResult: + seen["audio"] = audio + seen["mime"] = mime + seen["context"] = context + return TranscriptionResult(text="ok") + + with make_server(transcribe_callback=transcribe_callback) as server: + result = await server.process_non_streaming( + InputTranscribeReq( + params=InputTranscribeParams( + audio_base64=audio_b64, + mime_type="audio/wav", + ) + ) + ) + parsed = TypeAdapter(TranscriptionResult).validate_json(result.json) + assert parsed.text == "ok" + + assert seen["audio"] == audio_bytes + assert seen["mime"] == "audio/wav" + assert seen["context"] == DEFAULT_CONTEXT + + async def test_retry_after_item_passes_tools_to_responder(): pass From b79fb0495b3e0fffb0cc647d4be8bb031b7c584e Mon Sep 17 00:00:00 2001 From: Jiwon Kim Date: Thu, 15 Jan 2026 15:54:41 -0800 Subject: [PATCH 2/6] update docs --- docs/guides/accept-rich-user-input.md | 43 +++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/docs/guides/accept-rich-user-input.md b/docs/guides/accept-rich-user-input.md index 052e7eb..8bd8a4e 100644 --- a/docs/guides/accept-rich-user-input.md +++ b/docs/guides/accept-rich-user-input.md @@ -172,6 +172,49 @@ Set `ImageAttachment.preview_url` to allow the client to render thumbnails. - If your preview URLs are **permanent/public**, set `preview_url` once when creating the attachment and persist it. - If your storage uses **expiring URLs**, generate a fresh `preview_url` when returning attachment metadata (for example, in `Store.load_thread_items` and `Store.load_attachment`) rather than persisting a long-lived URL. In this case, returning a short-lived signed URL directly is the simplest approach. Alternatively, you may return a redirect that resolves to a temporary signed URL, as long as the final URL serves image bytes with appropriate CORS headers. +## Dictation: speech-to-text input + +Enable dictation so users can record audio and have it transcribed into text before sending. + +### Enable dictation in the client + +Turn on dictation in the composer: + +```ts +const chatkit = useChatKit({ + // ... + composer: { + dictation: { + enabled: true, + }, + }, +}); +``` + +This maps to `ChatKitOptions.composer.dictation`. + +### Implement `ChatKitServer.transcribe` + +When dictation is enabled, the client records audio and sends it to your backend for transcription. Implement `ChatKitServer.transcribe` to accept audio bytes and return a transcription result. + +Example transcription method using the OpenAI Audio API: + +```python +async def transcribe(self, audio_bytes: bytes, mime_type: str, context: RequestContext) -> str: + ext = "m4a" if mime_type.startswith("audio/mp4") else "webm" + audio_file = io.BytesIO(audio_bytes) + audio_file.name = f"audio.{ext}" + + client = OpenAI() + transcription = client.audio.transcriptions.create( + model="gpt-4o-transcribe", + file=audio_file + ) + return TranscriptionResult(text=transcription.text) +``` + +Return a `TranscriptionResult` that includes the final `text` that should appear in the composer. + ## @-mentions: tag entities in user messages Enable @-mentions so users can tag entities (like documents, tickets, or users) instead of pasting raw identifiers. Mentions travel through ChatKit as structured tags so the model can resolve entities instead of guessing from free text. From 9c6c0f90fa29f1c0aad97b99f86c8daeaa531829 Mon Sep 17 00:00:00 2001 From: Jiwon Kim Date: Tue, 20 Jan 2026 16:43:43 -0500 Subject: [PATCH 3/6] Use AudioInput for more ergonomic normalization & future-proofing --- chatkit/server.py | 17 ++++++++++++++--- chatkit/types.py | 37 ++++++++++++++++++++++++++++++++---- tests/test_chatkit_server.py | 23 ++++++++++++---------- 3 files changed, 60 insertions(+), 17 deletions(-) diff --git a/chatkit/server.py b/chatkit/server.py index 8f8c004..dbd7d82 100644 --- a/chatkit/server.py +++ b/chatkit/server.py @@ -36,6 +36,7 @@ AssistantMessageItem, AttachmentsCreateReq, AttachmentsDeleteReq, + AudioInput, ChatKitReq, ClientToolCallItem, ErrorCode, @@ -323,9 +324,18 @@ async def add_feedback( # noqa: B027 pass async def transcribe( # noqa: B027 - self, audio_bytes: bytes, mime_type: str, context: TContext + self, audio_input: AudioInput, context: TContext ) -> TranscriptionResult: - """Transcribe speech audio to text. Override this method to support dictation.""" + """Transcribe speech audio to text (dictation). + The client sends `audio/webm;codecs=opus`, `audio/ogg;codecs=opus`, or `audio/mp4`. + + Args: + audio_input: Audio bytes plus MIME type metadata for transcription. + context: Arbitrary per-request context provided by the caller. + + Returns: + A `TranscriptionResult` whose `text` is what should appear in the composer. + """ raise NotImplementedError( "transcribe() must be overridden to support the input.transcribe request." ) @@ -460,7 +470,8 @@ async def _process_non_streaming( case InputTranscribeReq(): audio_bytes = base64.b64decode(request.params.audio_base64) transcription_result = await self.transcribe( - audio_bytes, request.params.mime_type, context=context + AudioInput(data=audio_bytes, mime_type=request.params.mime_type), + context=context, ) return self._serialize(transcription_result) case ItemsListReq(): diff --git a/chatkit/types.py b/chatkit/types.py index 9c9656c..be8d0b8 100644 --- a/chatkit/types.py +++ b/chatkit/types.py @@ -1,9 +1,16 @@ from __future__ import annotations from datetime import datetime -from typing import Any, Generic, Literal - -from pydantic import AnyUrl, BaseModel, Field, SerializationInfo, model_serializer +from typing import Any, Generic, Literal, cast + +from pydantic import ( + AnyUrl, + BaseModel, + Field, + SerializationInfo, + field_validator, + model_serializer, +) from typing_extensions import Annotated, TypeIs, TypeVar from chatkit.errors import ErrorCode @@ -188,7 +195,29 @@ class InputTranscribeParams(BaseModel): """Base64-encoded audio bytes.""" mime_type: str - """MIME type for the audio payload (e.g. 'audio/webm', 'audio/wav').""" + """MIME type for the audio payload.""" + + @field_validator("mime_type", mode="before") + @classmethod + def _normalize_mime_type(cls, v: object) -> object: + if not isinstance(v, str): + return v + return v.strip().replace("; ", ";").lower() + + +class AudioInput(BaseModel): + """Audio input data for transcription.""" + + data: bytes + """Audio data bytes.""" + + mime_type: str + """Raw MIME type for the audio payload, e.g. "audio/webm;codecs=opus".""" + + @property + def media_type(self) -> str: + """Media type for the audio payload, e.g. "audio/webm".""" + return self.mime_type.split(";", 1)[0] class TranscriptionResult(BaseModel): diff --git a/tests/test_chatkit_server.py b/tests/test_chatkit_server.py index 066ffae..f3c8640 100644 --- a/tests/test_chatkit_server.py +++ b/tests/test_chatkit_server.py @@ -33,6 +33,7 @@ AttachmentsCreateReq, AttachmentsDeleteReq, AttachmentUploadDescriptor, + AudioInput, ClientToolCallItem, CustomTask, FeedbackKind, @@ -163,7 +164,7 @@ def make_server( ] | None = None, file_store: AttachmentStore | None = None, - transcribe_callback: Callable[[bytes, str, Any], TranscriptionResult] | None = None, + transcribe_callback: Callable[[AudioInput, Any], TranscriptionResult] | None = None, ): global server_id db_path = f"file:{server_id}?mode=memory&cache=shared" @@ -212,11 +213,11 @@ async def add_feedback( handle_feedback(thread_id, item_ids, feedback, context) async def transcribe( - self, audio_bytes: bytes, mime_type: str, context: Any + self, audio_input: AudioInput, context: Any ) -> TranscriptionResult: if transcribe_callback is None: - return await super().transcribe(audio_bytes, mime_type, context) - return transcribe_callback(audio_bytes, mime_type, context) + return await super().transcribe(audio_input, context) + return transcribe_callback(audio_input, context) async def process_streaming( self, request_obj, context: Any | None = None @@ -1899,16 +1900,17 @@ async def responder( assert any(e.type == "thread.item.done" for e in events) -async def test_input_transcribe_decodes_base64_and_passes_mime_type(): +async def test_input_transcribe_decodes_base64_and_normalizes_mime_type(): audio_bytes = b"hello audio" audio_b64 = base64.b64encode(audio_bytes).decode("ascii") seen: dict[str, Any] = {} def transcribe_callback( - audio: bytes, mime: str, context: Any + audio_input: AudioInput, context: Any ) -> TranscriptionResult: - seen["audio"] = audio - seen["mime"] = mime + seen["audio"] = audio_input.data + seen["mime"] = audio_input.mime_type + seen["media_type"] = audio_input.media_type seen["context"] = context return TranscriptionResult(text="ok") @@ -1917,7 +1919,7 @@ def transcribe_callback( InputTranscribeReq( params=InputTranscribeParams( audio_base64=audio_b64, - mime_type="audio/wav", + mime_type="audio/webm; codecs=opus", ) ) ) @@ -1925,7 +1927,8 @@ def transcribe_callback( assert parsed.text == "ok" assert seen["audio"] == audio_bytes - assert seen["mime"] == "audio/wav" + assert seen["mime"] == "audio/webm;codecs=opus" + assert seen["media_type"] == "audio/webm" assert seen["context"] == DEFAULT_CONTEXT From 55114842dbae75e17bcf1ac78213ed0c4038c03c Mon Sep 17 00:00:00 2001 From: Jiwon Kim Date: Tue, 20 Jan 2026 16:57:57 -0500 Subject: [PATCH 4/6] Updated docs code snippet --- chatkit/server.py | 5 +++- chatkit/types.py | 12 ++------- docs/guides/accept-rich-user-input.md | 37 ++++++++++++++++++--------- 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/chatkit/server.py b/chatkit/server.py index dbd7d82..048e0b7 100644 --- a/chatkit/server.py +++ b/chatkit/server.py @@ -327,7 +327,10 @@ async def transcribe( # noqa: B027 self, audio_input: AudioInput, context: TContext ) -> TranscriptionResult: """Transcribe speech audio to text (dictation). - The client sends `audio/webm;codecs=opus`, `audio/ogg;codecs=opus`, or `audio/mp4`. + + The client sends one of: `audio/webm;codecs=opus`, `audio/mp4`, `audio/ogg;codecs=opus`. + Audio bytes are in `audio_input.data`. The raw MIME type is `audio_input.mime_type`; use + `audio_input.media_type` for the base media type (`audio/webm`, `audio/ogg`, `audio/mp4`). Args: audio_input: Audio bytes plus MIME type metadata for transcription. diff --git a/chatkit/types.py b/chatkit/types.py index be8d0b8..ba56e58 100644 --- a/chatkit/types.py +++ b/chatkit/types.py @@ -1,14 +1,13 @@ from __future__ import annotations from datetime import datetime -from typing import Any, Generic, Literal, cast +from typing import Any, Generic, Literal from pydantic import ( AnyUrl, BaseModel, Field, SerializationInfo, - field_validator, model_serializer, ) from typing_extensions import Annotated, TypeIs, TypeVar @@ -195,14 +194,7 @@ class InputTranscribeParams(BaseModel): """Base64-encoded audio bytes.""" mime_type: str - """MIME type for the audio payload.""" - - @field_validator("mime_type", mode="before") - @classmethod - def _normalize_mime_type(cls, v: object) -> object: - if not isinstance(v, str): - return v - return v.strip().replace("; ", ";").lower() + """Raw MIME type for the audio payload, e.g. "audio/webm;codecs=opus".""" class AudioInput(BaseModel): diff --git a/docs/guides/accept-rich-user-input.md b/docs/guides/accept-rich-user-input.md index 8bd8a4e..5c28564 100644 --- a/docs/guides/accept-rich-user-input.md +++ b/docs/guides/accept-rich-user-input.md @@ -195,22 +195,35 @@ This maps to `ChatKitOptions.composer.dictation`. ### Implement `ChatKitServer.transcribe` -When dictation is enabled, the client records audio and sends it to your backend for transcription. Implement `ChatKitServer.transcribe` to accept audio bytes and return a transcription result. +When dictation is enabled, the client records audio and sends it to your backend for transcription. Implement `ChatKitServer.transcribe` to accept audio input and return a transcription result. + +The client sends one of: + +- `"audio/webm;codecs=opus"` (preferred for Chrome/Firefox/Safari 18.4+) +- `"audio/mp4"` (fallback for older Safari/iOS) +- `"audio/ogg;codecs=opus"` (alternative for some environments) + +The raw value is available as `audio_input.mime_type`. Use `audio_input.media_type` when you only need the base media type (`"audio/webm"`, `"audio/ogg"`, or `"audio/mp4"`). Example transcription method using the OpenAI Audio API: ```python -async def transcribe(self, audio_bytes: bytes, mime_type: str, context: RequestContext) -> str: - ext = "m4a" if mime_type.startswith("audio/mp4") else "webm" - audio_file = io.BytesIO(audio_bytes) - audio_file.name = f"audio.{ext}" - - client = OpenAI() - transcription = client.audio.transcriptions.create( - model="gpt-4o-transcribe", - file=audio_file - ) - return TranscriptionResult(text=transcription.text) +async def transcribe(self, audio_input: AudioInput, context: RequestContext) -> TranscriptionResult: + ext = { + "audio/webm": "webm", + "audio/mp4": "m4a", + "audio/ogg": "ogg", + }.get(audio_input.media_type) + if not ext: + raise HTTPException(status_code=400, detail="Unexpected audio format") + + audio_file = io.BytesIO(audio_input.data) + audio_file.name = f"audio.{ext}" + transcription = client.audio.transcriptions.create( + model="gpt-4o-transcribe", + file=audio_file + ) + return TranscriptionResult(text=transcription.text) ``` Return a `TranscriptionResult` that includes the final `text` that should appear in the composer. From 8585d1fb1625bf73121419cdc21a51e2e4059ccf Mon Sep 17 00:00:00 2001 From: Jiwon Kim Date: Wed, 21 Jan 2026 00:28:20 -0500 Subject: [PATCH 5/6] Remove deprecation warning for Action classes which are used by ChatKitServer --- chatkit/actions.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/chatkit/actions.py b/chatkit/actions.py index b5c04d0..e00b3ad 100644 --- a/chatkit/actions.py +++ b/chatkit/actions.py @@ -3,7 +3,6 @@ from typing import Any, Generic, Literal, TypeVar, get_args, get_origin from pydantic import BaseModel, Field -from typing_extensions import deprecated Handler = Literal["client", "server"] LoadingBehavior = Literal["auto", "none", "self", "container"] @@ -12,14 +11,6 @@ DEFAULT_LOADING_BEHAVIOR: LoadingBehavior = "auto" -_direct_usage_of_action_classes_deprecated = deprecated( - "Direct usage of named action classes is deprecated. " - "Use WidgetTemplate to build widgets from .widget files instead. " - "Visit https://widgets.chatkit.studio/ to author widget files." -) - - -@_direct_usage_of_action_classes_deprecated class ActionConfig(BaseModel): type: str payload: Any = None @@ -31,7 +22,6 @@ class ActionConfig(BaseModel): TPayload = TypeVar("TPayload") -@_direct_usage_of_action_classes_deprecated class Action(BaseModel, Generic[TType, TPayload]): type: TType = Field(default=TType, frozen=True) # pyright: ignore payload: TPayload = None # pyright: ignore - default to None to allow no-payload actions From 4e1156dd81f931ed2f455197737892dd59437ab2 Mon Sep 17 00:00:00 2001 From: Jiwon Kim Date: Wed, 21 Jan 2026 00:29:25 -0500 Subject: [PATCH 6/6] fix test and docstring --- chatkit/server.py | 3 +-- tests/test_chatkit_server.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/chatkit/server.py b/chatkit/server.py index 048e0b7..4465a88 100644 --- a/chatkit/server.py +++ b/chatkit/server.py @@ -328,9 +328,8 @@ async def transcribe( # noqa: B027 ) -> TranscriptionResult: """Transcribe speech audio to text (dictation). - The client sends one of: `audio/webm;codecs=opus`, `audio/mp4`, `audio/ogg;codecs=opus`. Audio bytes are in `audio_input.data`. The raw MIME type is `audio_input.mime_type`; use - `audio_input.media_type` for the base media type (`audio/webm`, `audio/ogg`, `audio/mp4`). + `audio_input.media_type` for the base media type (one of: `audio/webm`, `audio/ogg`, `audio/mp4`). Args: audio_input: Audio bytes plus MIME type metadata for transcription. diff --git a/tests/test_chatkit_server.py b/tests/test_chatkit_server.py index f3c8640..5a5b545 100644 --- a/tests/test_chatkit_server.py +++ b/tests/test_chatkit_server.py @@ -1919,7 +1919,7 @@ def transcribe_callback( InputTranscribeReq( params=InputTranscribeParams( audio_base64=audio_b64, - mime_type="audio/webm; codecs=opus", + mime_type="audio/webm;codecs=opus", ) ) )