diff --git a/packages/app/server/package.json b/packages/app/server/package.json index 6c691d65c..9cccaa450 100644 --- a/packages/app/server/package.json +++ b/packages/app/server/package.json @@ -51,9 +51,11 @@ "@opentelemetry/sdk-metrics": "2.0.1", "@opentelemetry/semantic-conventions": "^1.36.0", "@opentelemetry/winston-transport": "^0.14.1", + "@prisma/client": "6.16.0", "@types/compression": "^1.7.5", "@types/cors": "^2.8.19", "@types/express": "^4.17.21", + "@types/form-data": "^2.5.2", "@types/multer": "^2.0.0", "@types/node": "^20.11.24", "@types/node-fetch": "^2.6.11", @@ -61,11 +63,11 @@ "cors": "^2.8.5", "dotenv": "^16.5.0", "express": "^4.18.3", + "form-data": "^4.0.4", "jose": "^6.0.11", "multer": "^2.0.2", "node-fetch": "^2.7.0", "openai": "^4.97.0", - "@prisma/client": "6.16.0", "prisma": "6.16.0", "register": "link:@opentelemetry/auto-instrumentations-node/register", "ts-node": "^10.9.2", diff --git a/packages/app/server/src/clients/openai-audio-client.ts b/packages/app/server/src/clients/openai-audio-client.ts new file mode 100644 index 000000000..6891169af --- /dev/null +++ b/packages/app/server/src/clients/openai-audio-client.ts @@ -0,0 +1,159 @@ +import fetch from 'node-fetch'; +import { readFileSync } from 'fs'; +import { HttpError } from '../errors/http'; +import logger from '../logger'; +import FormData from 'form-data'; + +export interface TranscriptionOptions { + model: 'whisper-1' | 'whisper-large-v3'; + language?: string; + prompt?: string; + response_format?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt'; + temperature?: number; + timestamp_granularities?: ('word' | 'segment')[]; +} + +export interface TranscriptionResponse { + text: string; + [key: string]: any; +} + +export class OpenAIAudioClient { + private apiKey: string; + private baseUrl: string; + + constructor(apiKey: string, baseUrl = 'https://api.openai.com') { + this.apiKey = apiKey; + this.baseUrl = baseUrl; + } + + async transcribe( + audioBuffer: Buffer, + options: TranscriptionOptions + ): Promise { + const formData = new FormData(); + + // Add the audio file + formData.append('file', audioBuffer, { + filename: 'audio.mp3', + contentType: 'audio/mp3', + }); + + // Add other parameters + formData.append('model', options.model); + + if (options.language) { + formData.append('language', options.language); + } + + if (options.prompt) { + formData.append('prompt', options.prompt); + } + + if (options.response_format) { + formData.append('response_format', options.response_format); + } + + if (options.temperature !== undefined) { + formData.append('temperature', options.temperature.toString()); + } + + if (options.timestamp_granularities) { + options.timestamp_granularities.forEach(granularity => { + formData.append('timestamp_granularities[]', granularity); + }); + } + + try { + const response = await fetch(`${this.baseUrl}/v1/audio/transcriptions`, { + method: 'POST', + headers: { + 'Authorization': `Bearer ${this.apiKey}`, + // FormData sets its own content-type with boundary + }, + body: formData, + }); + + if (!response.ok) { + const errorData = await response.json(); + logger.error('OpenAI Audio API Error:', errorData); + throw new HttpError( + response.status, + `OpenAI API Error: ${errorData.error?.message || response.statusText}` + ); + } + + const data = await response.json(); + return data as TranscriptionResponse; + } catch (error) { + if (error instanceof HttpError) { + throw error; + } + logger.error('OpenAI Audio API Error:', error); + throw new HttpError( + 500, + `Failed to 
transcribe audio: ${(error as Error).message}` + ); + } + } + + async translate( + audioBuffer: Buffer, + options: Omit + ): Promise { + const formData = new FormData(); + + // Add the audio file + formData.append('file', audioBuffer, { + filename: 'audio.mp3', + contentType: 'audio/mp3', + }); + + // Add other parameters + formData.append('model', options.model); + + if (options.prompt) { + formData.append('prompt', options.prompt); + } + + if (options.response_format) { + formData.append('response_format', options.response_format); + } + + if (options.temperature !== undefined) { + formData.append('temperature', options.temperature.toString()); + } + + try { + const response = await fetch(`${this.baseUrl}/v1/audio/translations`, { + method: 'POST', + headers: { + 'Authorization': `Bearer ${this.apiKey}`, + // FormData sets its own content-type with boundary + }, + body: formData, + }); + + if (!response.ok) { + const errorData = await response.json(); + logger.error('OpenAI Audio API Error:', errorData); + throw new HttpError( + response.status, + `OpenAI API Error: ${errorData.error?.message || response.statusText}` + ); + } + + const data = await response.json(); + return data as TranscriptionResponse; + } catch (error) { + if (error instanceof HttpError) { + throw error; + } + logger.error('OpenAI Audio API Error:', error); + throw new HttpError( + 500, + `Failed to translate audio: ${(error as Error).message}` + ); + } + } +} \ No newline at end of file diff --git a/packages/app/server/src/providers/OpenAIAudioProvider.ts b/packages/app/server/src/providers/OpenAIAudioProvider.ts new file mode 100644 index 000000000..bba5943fb --- /dev/null +++ b/packages/app/server/src/providers/OpenAIAudioProvider.ts @@ -0,0 +1,138 @@ +import { BaseProvider } from './BaseProvider'; +import { ProviderType } from './ProviderType'; +import { LlmTransactionMetadata } from '../types'; +import logger from '../logger'; +import { Decimal } from '@prisma/client/runtime/library'; +import { OpenAIAudioClient, TranscriptionOptions, TranscriptionResponse } from '../clients/openai-audio-client'; +import { HttpError } from '../errors/http'; +import { EchoControlService } from '../services/EchoControlService'; + +export class OpenAIAudioProvider extends BaseProvider { + private audioClient: OpenAIAudioClient; + + constructor( + echoControlService: EchoControlService, + stream: boolean, + model: string + ) { + super(echoControlService, stream, model); + const apiKey = this.getApiKey(); + if (!apiKey) { + throw new Error('OpenAI API key is required for audio transcription'); + } + this.audioClient = new OpenAIAudioClient(apiKey, this.OPENAI_BASE_URL); + } + + getType(): ProviderType { + return ProviderType.OPENAI_AUDIO; + } + + getBaseUrl(reqPath?: string): string { + return this.OPENAI_BASE_URL; + } + + getApiKey(): string | undefined { + return process.env.OPENAI_API_KEY; + } + + override formatAuthHeaders(headers: Record): Record { + return { + ...headers, + Authorization: `Bearer ${this.getApiKey()}`, + }; + } + + override ensureStreamUsage( + reqBody: Record, + reqPath: string + ): Record { + // Audio transcription doesn't use streaming + return reqBody; + } + + /** + * Transcribe audio using the OpenAI Whisper API + * + * @param audioBuffer - The audio buffer to transcribe + * @param options - Transcription options + * @returns The transcription result + */ + async transcribeAudio(audioBuffer: Buffer, options: TranscriptionOptions): Promise { + try { + logger.info(`Transcribing audio with model: 
${options.model}`); + return await this.audioClient.transcribe(audioBuffer, options); + } catch (error) { + logger.error('OpenAI Audio transcription error:', error); + if (error instanceof HttpError) { + throw error; + } + throw new HttpError(500, `Failed to transcribe audio: ${(error as Error).message}`); + } + } + + /** + * Translate audio directly to English using the OpenAI Whisper API + * + * @param audioBuffer - The audio buffer to translate + * @param options - Translation options + * @returns The translation result + */ + async translateAudio(audioBuffer: Buffer, options: Omit): Promise { + try { + logger.info(`Translating audio with model: ${options.model}`); + return await this.audioClient.translate(audioBuffer, options); + } catch (error) { + logger.error('OpenAI Audio translation error:', error); + if (error instanceof HttpError) { + throw error; + } + throw new HttpError(500, `Failed to translate audio: ${(error as Error).message}`); + } + } + + async handleBody(data: string): Promise<{ + metadata: LlmTransactionMetadata; + rawTransactionCost: Decimal; + status: string; + }> { + try { + const parsed = JSON.parse(data); + + // Calculate cost based on duration (Whisper charges per minute) + // Default to 1 second if duration is not available + const durationSeconds = parsed.duration || 1; + const durationMinutes = durationSeconds / 60; + + // Apply the Whisper cost of $0.006 per minute + const cost = new Decimal(0.006).mul(durationMinutes); + + // Generate a unique provider ID + const providerId = `openai-audio-${Date.now()}`; + + // Use seconds as a proxy for tokens since audio doesn't use tokens + const outputTokens = Math.ceil(durationSeconds); + + return { + metadata: { + providerId, + provider: this.getType(), + model: 'whisper-1', + inputTokens: 0, // Audio doesn't use input tokens + outputTokens, + totalTokens: outputTokens, + // Include additional metadata as custom properties + audioData: { + durationSeconds, + responseFormat: parsed.format || 'json', + characterCount: parsed.text?.length || 0, + }, + }, + rawTransactionCost: cost, + status: 'success', + }; + } catch (error) { + logger.error('Error processing audio response data:', error); + throw error; + } + } +} \ No newline at end of file diff --git a/packages/app/server/src/providers/ProviderFactory.ts b/packages/app/server/src/providers/ProviderFactory.ts index 8c88c6580..d14cc9aa7 100644 --- a/packages/app/server/src/providers/ProviderFactory.ts +++ b/packages/app/server/src/providers/ProviderFactory.ts @@ -10,7 +10,9 @@ import { GeminiGPTProvider } from './GeminiGPTProvider'; import { OpenAIResponsesProvider } from './OpenAIResponsesProvider'; import { OpenRouterProvider } from './OpenRouterProvider'; import { OpenAIImageProvider } from './OpenAIImageProvider'; +import { OpenAIAudioProvider } from './OpenAIAudioProvider'; import { + ALL_SUPPORTED_AUDIO_MODELS, ALL_SUPPORTED_IMAGE_MODELS, ALL_SUPPORTED_MODELS, } from '../services/AccountingService'; @@ -60,6 +62,16 @@ const createImageModelToProviderMapping = (): Record => { return mapping; }; +// Create mapping for audio models +const createAudioModelToProviderMapping = (): Record => { + const mapping: Record = {}; + + // Hard-code whisper-1 for now until AccountingService is updated to include audio models + mapping['whisper-1'] = ProviderType.OPENAI_AUDIO; + + return mapping; +}; + /** * Model-to-provider mapping loaded from model_prices_and_context_window.json * This replaces the previous hardcoded mapping and automatically includes all @@ -70,6 +82,9 @@ 
export const MODEL_TO_PROVIDER: Record = export const IMAGE_MODEL_TO_PROVIDER: Record = createImageModelToProviderMapping(); + +export const AUDIO_MODEL_TO_PROVIDER: Record = + createAudioModelToProviderMapping(); export const getProvider = ( model: string, @@ -84,6 +99,11 @@ export const getProvider = ( if (imageType) { type = imageType; } + + const audioType = AUDIO_MODEL_TO_PROVIDER[model]; + if (audioType) { + type = audioType; + } // If the model is not in the model to provider mapping, throw an error if (type === undefined) { @@ -98,6 +118,11 @@ export const getProvider = ( if (completionPath.includes('images/generations')) { type = ProviderType.OPENAI_IMAGES; } + + // Check if this is an audio transcription or translation endpoint + if (completionPath.includes('audio/transcriptions') || completionPath.includes('audio/translations')) { + type = ProviderType.OPENAI_AUDIO; + } // We select for Anthropic Native if the completionPath includes "messages" // The OpenAI Format does not hit /v1/messages, it hits /v1/chat/completions @@ -130,6 +155,8 @@ export const getProvider = ( return new OpenRouterProvider(echoControlService, stream, model); case ProviderType.OPENAI_IMAGES: return new OpenAIImageProvider(echoControlService, stream, model); + case ProviderType.OPENAI_AUDIO: + return new OpenAIAudioProvider(echoControlService, stream, model); default: throw new Error(`Unknown provider type: ${type}`); } diff --git a/packages/app/server/src/providers/ProviderType.ts b/packages/app/server/src/providers/ProviderType.ts index 9f635336d..59bc139fa 100644 --- a/packages/app/server/src/providers/ProviderType.ts +++ b/packages/app/server/src/providers/ProviderType.ts @@ -7,4 +7,5 @@ export enum ProviderType { OPENAI_RESPONSES = 'OPENAI_RESPONSES', OPENROUTER = 'OPENROUTER', OPENAI_IMAGES = 'OPENAI_IMAGES', + OPENAI_AUDIO = 'OPENAI_AUDIO', } diff --git a/packages/app/server/src/services/AccountingService.ts b/packages/app/server/src/services/AccountingService.ts index 401d77412..fff02be0d 100644 --- a/packages/app/server/src/services/AccountingService.ts +++ b/packages/app/server/src/services/AccountingService.ts @@ -4,9 +4,11 @@ import { GeminiModels, OpenRouterModels, OpenAIImageModels, + // OpenAIAudioModels, // TODO: Uncomment when audio models are published in SDK SupportedOpenAIResponseToolPricing, SupportedModel, SupportedImageModel, + // SupportedAudioModel, // TODO: Uncomment when audio models are published in SDK } from '@merit-systems/echo-typescript-sdk'; import { Decimal } from '@prisma/client/runtime/library'; @@ -25,6 +27,12 @@ export const ALL_SUPPORTED_MODELS: SupportedModel[] = [ // Handle image models separately since they have different pricing structure export const ALL_SUPPORTED_IMAGE_MODELS: SupportedImageModel[] = OpenAIImageModels; + +// Handle audio models separately since they use per-minute pricing +// TODO: Uncomment when audio models are published in SDK +// export const ALL_SUPPORTED_AUDIO_MODELS: SupportedAudioModel[] = +// OpenAIAudioModels; +export const ALL_SUPPORTED_AUDIO_MODELS: any[] = []; // Placeholder // Create a lookup map for O(1) model price retrieval const MODEL_PRICE_MAP = new Map(); @@ -38,6 +46,12 @@ ALL_SUPPORTED_IMAGE_MODELS.forEach(model => { IMAGE_MODEL_MAP.set(model.model_id, model); }); +// Create a separate map for audio models +const AUDIO_MODEL_MAP = new Map(); +ALL_SUPPORTED_AUDIO_MODELS.forEach(model => { + AUDIO_MODEL_MAP.set(model.model_id, model); +}); + const getModelPrice = (model: string) => { const supportedModel = 
MODEL_PRICE_MAP.get(model); @@ -77,6 +91,10 @@ export const isValidImageModel = (model: string) => { return IMAGE_MODEL_MAP.has(model); }; +export const isValidAudioModel = (model: string) => { + return AUDIO_MODEL_MAP.has(model); +}; + export const getCostPerToken = ( model: string, inputTokens: number, @@ -125,6 +143,22 @@ export const getImageModelCost = ( return textCost.plus(imageInputCost).plus(imageOutputCost); }; +export const getAudioModelCost = ( + model: string, + durationMinutes: number +) => { + if (!isValidAudioModel(model)) { + throw new Error(`Invalid audio model: ${model}`); + } + + const audioModel = AUDIO_MODEL_MAP.get(model); + if (!audioModel) { + throw new Error(`Pricing information not found for audio model: ${model}`); + } + + return new Decimal(audioModel.cost_per_minute).mul(durationMinutes); +}; + export const calculateToolCost = (tool: Tool): Decimal => { const toolPricing = SupportedOpenAIResponseToolPricing; diff --git a/packages/app/server/src/services/ModelRequestService.ts b/packages/app/server/src/services/ModelRequestService.ts index 20068aa65..be5279ab8 100644 --- a/packages/app/server/src/services/ModelRequestService.ts +++ b/packages/app/server/src/services/ModelRequestService.ts @@ -3,7 +3,7 @@ import { HttpError, UnknownModelError } from '../errors/http'; import logger from '../logger'; import { getProvider } from '../providers/ProviderFactory'; import { Transaction } from '../types'; -import { isValidImageModel, isValidModel } from './AccountingService'; +import { isValidAudioModel, isValidImageModel, isValidModel } from './AccountingService'; import { EchoControlService } from './EchoControlService'; import { handleNonStreamingService } from './HandleNonStreamingService'; import { handleStreamService } from './HandleStreamService'; @@ -27,7 +27,11 @@ export class ModelRequestService { ): Promise<{ transaction: Transaction; isStream: boolean; data: unknown }> { const model = extractModelName(req); - if (!model || (!isValidModel(model) && !isValidImageModel(model))) { + if (!model || ( + !isValidModel(model) && + !isValidImageModel(model) && + !isValidAudioModel(model) + )) { logger.error(`Invalid model: ${model}`); res.status(422).json({ error: `Invalid model: ${model} Echo does not yet support this model.`, diff --git a/packages/app/server/src/types.ts b/packages/app/server/src/types.ts index 6fd093a09..f903751fd 100644 --- a/packages/app/server/src/types.ts +++ b/packages/app/server/src/types.ts @@ -55,6 +55,11 @@ export interface LlmTransactionMetadata { prompt?: string; response?: string; toolCost?: Decimal; + audioData?: { + durationSeconds: number; + responseFormat: string; + characterCount: number; + }; } export interface Transaction { diff --git a/packages/sdk/examples/next/src/app/components/audio.tsx b/packages/sdk/examples/next/src/app/components/audio.tsx new file mode 100644 index 000000000..1a111a5a5 --- /dev/null +++ b/packages/sdk/examples/next/src/app/components/audio.tsx @@ -0,0 +1,153 @@ +'use client'; + +import { useState, useRef } from 'react'; +import { useEchoOpenAI } from '@merit-systems/echo-react-sdk'; + +export default function AudioTranscription() { + const { openai, isReady } = useEchoOpenAI(); + const [result, setResult] = useState<{ text: string; duration?: number; language?: string } | null>(null); + const [file, setFile] = useState(null); + const [isTranscribing, setIsTranscribing] = useState(false); + const [isTranslating, setIsTranslating] = useState(false); + const [error, setError] = useState(null); + const 
fileInputRef = useRef<HTMLInputElement>(null);
+
+  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
+    if (e.target.files && e.target.files[0]) {
+      setFile(e.target.files[0]);
+      setError(null);
+    }
+  };
+
+  const handleTranscribe = async () => {
+    if (!file || !openai || !isReady) return;
+
+    try {
+      setIsTranscribing(true);
+      setResult(null);
+      setError(null);
+
+      const response = await openai.audio.transcriptions.create({
+        file: file,
+        model: 'whisper-large-v3', // Use the latest model
+        // verbose_json is required when requesting timestamp granularities,
+        // and it is also what returns the duration and language fields below.
+        response_format: 'verbose_json',
+        timestamp_granularities: ['word'], // Get word-level timestamps
+      });
+
+      setResult(response);
+    } catch (err) {
+      console.error('Transcription failed:', err);
+      setError(err instanceof Error ? err.message : 'Transcription failed');
+    } finally {
+      setIsTranscribing(false);
+    }
+  };
+
+  const handleTranslate = async () => {
+    if (!file || !openai || !isReady) return;
+
+    try {
+      setIsTranslating(true);
+      setResult(null);
+      setError(null);
+
+      const response = await openai.audio.translations.create({
+        file: file,
+        model: 'whisper-large-v3',
+        response_format: 'json',
+      });
+
+      setResult(response);
+    } catch (err) {
+      console.error('Translation failed:', err);
+      setError(err instanceof Error ? err.message : 'Translation failed');
+    } finally {
+      setIsTranslating(false);
+    }
+  };
+
+  const resetForm = () => {
+    setFile(null);
+    setResult(null);
+    setError(null);
+    if (fileInputRef.current) {
+      fileInputRef.current.value = '';
+    }
+  };
+
+  return (
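+    /* These controls drive the Echo-aware OpenAI client from useEchoOpenAI,
+       so the requests hit the new /v1/audio routes on the Echo router. */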
+    <div>
+      <div>
+        <input
+          ref={fileInputRef}
+          type="file"
+          accept="audio/*"
+          onChange={handleFileChange}
+        />
+      </div>
+
+      <div>
+        <button
+          onClick={handleTranscribe}
+          disabled={!file || isTranscribing || isTranslating}
+        >
+          {isTranscribing ? 'Transcribing...' : 'Transcribe'}
+        </button>
+        <button
+          onClick={handleTranslate}
+          disabled={!file || isTranscribing || isTranslating}
+        >
+          {isTranslating ? 'Translating...' : 'Translate to English'}
+        </button>
+        <button onClick={resetForm}>Reset</button>
+      </div>
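+
+      {/* The language and duration fields shown below are only populated on
+          verbose_json responses. */}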
+      {error && (
+        <div>{error}</div>
+      )}
+
+      {result && (
+        <div>
+          <h3>Transcription Result:</h3>
+          <p>{result.text}</p>
+
+          {/* Display additional information if available */}
+          {result.language && (
+            <div>Language: {result.language}</div>
+          )}
+
+          {result.duration && (
+            <div>Duration: {Math.round(result.duration)} seconds</div>
+          )}
+        </div>
+      )}
+    </div>
+  );
+}
\ No newline at end of file
diff --git a/packages/sdk/examples/next/src/app/components/tabs-container.tsx b/packages/sdk/examples/next/src/app/components/tabs-container.tsx
index 61f9a855a..887889eb1 100644
--- a/packages/sdk/examples/next/src/app/components/tabs-container.tsx
+++ b/packages/sdk/examples/next/src/app/components/tabs-container.tsx
@@ -4,9 +4,10 @@
 import { useState } from 'react';
 import Chat from './chat';
 import ImageGenerator from './image';
+import AudioTranscription from './audio';
 
 export default function TabsContainer() {
-  const [activeTab, setActiveTab] = useState<'chat' | 'image'>('chat');
+  const [activeTab, setActiveTab] = useState<'chat' | 'image' | 'audio'>('chat');
 
   return (
@@ -32,6 +33,16 @@ export default function TabsContainer() {
         >
           Image Generation
         </button>
+        <button onClick={() => setActiveTab('audio')}>
+          Audio Transcription
+        </button>
       </div>
 
       {/* Tab Content (kept mounted to preserve state) */}
@@ -48,6 +59,12 @@ export default function TabsContainer() {
         </div>
+
+        <div className={activeTab === 'audio' ? 'block' : 'hidden'}>
+          <h2>AI Audio Transcription</h2>
+          <AudioTranscription />
+        </div>
       </div>
   );
 
diff --git a/packages/sdk/examples/vite/src/App.tsx b/packages/sdk/examples/vite/src/App.tsx
index 32a250020..23c1ec4fa 100644
--- a/packages/sdk/examples/vite/src/App.tsx
+++ b/packages/sdk/examples/vite/src/App.tsx
@@ -8,8 +8,9 @@
 import { useState } from 'react';
 import { ChatInterface } from './components/ChatInterface';
 import { ImageGeneration } from './components/ImageGeneration';
 import UseChatInterface from './components/UseChatInterface';
+import { AudioTranscription } from './components/AudioTranscription';
 
-type Tab = 'chat' | 'images' | 'use-chat';
+type Tab = 'chat' | 'images' | 'use-chat' | 'audio';
 
 function Dashboard() {
   const { user, balance, error, isLoading } = useEcho();
@@ -138,6 +139,16 @@ function Dashboard() {
         >
           📤 useChat()
         </button>
+        <button onClick={() => setActiveTab('audio')}>
+          Audio
+        </button>
       </div>
@@ -146,6 +157,7 @@ function Dashboard() {
       {activeTab === 'chat' && <ChatInterface />}
       {activeTab === 'images' && <ImageGeneration />}
       {activeTab === 'use-chat' && <UseChatInterface />}
+      {activeTab === 'audio' && <AudioTranscription />}
 
       {/* Low balance warning */}
diff --git a/packages/sdk/examples/vite/src/components/AudioTranscription.tsx b/packages/sdk/examples/vite/src/components/AudioTranscription.tsx
new file mode 100644
index 000000000..139bbdd30
--- /dev/null
+++ b/packages/sdk/examples/vite/src/components/AudioTranscription.tsx
@@ -0,0 +1,141 @@
+import { useEcho, useEchoOpenAI } from '@merit-systems/echo-react-sdk';
+import { useState, useRef } from 'react';
+
+export function AudioTranscription() {
+  const [file, setFile] = useState<File | null>(null);
+  const [result, setResult] = useState<{ text: string; duration?: number } | null>(null);
+  const [isTranscribing, setIsTranscribing] = useState(false);
+  const [error, setError] = useState<string | null>(null);
+  const fileInputRef = useRef<HTMLInputElement>(null);
+
+  const { openai, isReady } = useEchoOpenAI();
+  const { user, isLoading } = useEcho();
+
+  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
+    if (e.target.files && e.target.files[0]) {
+      setFile(e.target.files[0]);
+      setError(null);
+      setResult(null);
+    }
+  };
+
+  const handleTranscribe = async () => {
+    if (!file || isTranscribing || !user || !openai || !isReady) return;
+
+    setIsTranscribing(true);
+    setError(null);
+
+    try {
+      const response = await openai.audio.transcriptions.create({
+        file: file,
+        model: 'whisper-large-v3', // Use the new model
+        // verbose_json is required when requesting timestamp granularities,
+        // and it is what carries the duration field read below.
+        response_format: 'verbose_json',
+        timestamp_granularities: ['word'], // Get word-level timestamps
+      });
+
+      setResult({
+        text: response.text,
+        duration: (response as any).duration, // Present on verbose_json responses
+      });
+    } catch (err) {
+      const errorMessage = err instanceof Error ? err.message : 'Unknown error';
+      setError(errorMessage);
+    } finally {
+      setIsTranscribing(false);
+    }
+  };
+
+  const resetForm = () => {
+    setFile(null);
+    setResult(null);
+    setError(null);
+    if (fileInputRef.current) {
+      fileInputRef.current.value = '';
+    }
+  };
+
+  if (!user && isLoading) {
+    return (
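+      /* Auth state is still resolving: render a placeholder rather than the
+         sign-in prompt. */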
+      <div>Loading Echo providers...</div>
+    );
+  }
+
+  if (!user) {
+    return (
+      <div>Please sign in to use audio transcription</div>
+    );
+  }
+
+  return (
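+    /* Signed in and ready: file picker, transcribe action, then error and
+       result panels. */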
+    <div>
+      <h2>Audio Transcription</h2>
+
+      <div>
+        <input
+          ref={fileInputRef}
+          type="file"
+          accept="audio/*"
+          onChange={handleFileChange}
+        />
+      </div>
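+
+      {/* Transcription stays disabled until a file is selected and the
+          Echo-backed OpenAI client is ready. */}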
+      <div>
+        <button
+          onClick={handleTranscribe}
+          disabled={!file || isTranscribing || !isReady}
+        >
+          {isTranscribing ? 'Transcribing...' : 'Transcribe'}
+        </button>
+        <button onClick={resetForm}>Reset</button>
+      </div>
+
+      {error && (
+        <div>
+          <strong>Error</strong>
+          <p>{error}</p>
+        </div>
+      )}
+
+      {result && (
+        <div>
+          <h3>Transcription Result</h3>
+          <p>{result.text}</p>
+
+          {result.duration && (
+            <p>Audio duration: {Math.round(result.duration)} seconds</p>
+          )}
+        </div>
+      )}
+    </div>
+ ); +} \ No newline at end of file diff --git a/packages/sdk/ts/src/client.ts b/packages/sdk/ts/src/client.ts index c4f45f991..b4a494f0e 100644 --- a/packages/sdk/ts/src/client.ts +++ b/packages/sdk/ts/src/client.ts @@ -3,6 +3,7 @@ import { EchoClientConfig, getConfig } from './config'; import { HttpClient } from './http-client'; import { AppsResource, + AudioResource, BalanceResource, ModelsResource, PaymentsResource, @@ -19,6 +20,7 @@ export class EchoClient { private tokenProvider: TokenProvider; // Resource instances + public readonly audio: AudioResource; public readonly balance: BalanceResource; public readonly payments: PaymentsResource; public readonly apps: AppsResource; @@ -44,6 +46,7 @@ export class EchoClient { this.http = new HttpClient(this.config.baseUrl, this.tokenProvider); // Initialize resource instances + this.audio = new AudioResource(this.http); this.balance = new BalanceResource(this.http); this.payments = new PaymentsResource(this.http); this.apps = new AppsResource(this.http, this.config.baseUrl); diff --git a/packages/sdk/ts/src/index.ts b/packages/sdk/ts/src/index.ts index f71a34dd0..c36bca8c3 100644 --- a/packages/sdk/ts/src/index.ts +++ b/packages/sdk/ts/src/index.ts @@ -15,6 +15,7 @@ export type { SupportedToolType, SupportedModel, SupportedImageModel, + SupportedAudioModel, ImageGenerationQuality, ImageDimensions, WebSearchModel, @@ -43,3 +44,5 @@ export { OpenRouterModels } from './supported-models/chat/openrouter'; export type { OpenRouterModel } from './supported-models/chat/openrouter'; export { OpenAIImageModels } from './supported-models/image/openai'; export type { OpenAIImageModel } from './supported-models/image/openai'; +export { OpenAIAudioModels } from './supported-models/audio/openai'; +export type { OpenAIAudioModel } from './supported-models/audio/openai'; diff --git a/packages/sdk/ts/src/resources/audio.ts b/packages/sdk/ts/src/resources/audio.ts new file mode 100644 index 000000000..04cc879a5 --- /dev/null +++ b/packages/sdk/ts/src/resources/audio.ts @@ -0,0 +1,115 @@ +import { HttpClient } from '../http-client'; +import { BaseResource } from '../utils/error-handling'; +import { SupportedAudioModel } from '../supported-models'; + +export interface TranscriptionOptions { + model: SupportedAudioModel; + language?: string; + prompt?: string; + response_format?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt'; + temperature?: number; + timestamp_granularities?: ('word' | 'segment')[]; +} + +export interface TranslationOptions { + model: SupportedAudioModel; + prompt?: string; + response_format?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt'; + temperature?: number; +} + +export interface TranscriptionResponse { + text: string; + task?: string; + language?: string; + duration?: number; + segments?: Array<{ + id: number; + start: number; + end: number; + text: string; + tokens?: number[]; + temperature?: number; + avg_logprob?: number; + compression_ratio?: number; + no_speech_prob?: number; + }>; +} + +export class AudioResource extends BaseResource { + constructor(http: HttpClient) { + super(http); + } + + /** + * Transcribes audio to text using Whisper API + * + * @param file Audio file to transcribe + * @param options Transcription options + * @returns Transcription result + */ + async transcribe( + file: File | Blob, + options: TranscriptionOptions + ): Promise { + const formData = new FormData(); + formData.append('file', file); + formData.append('model', options.model.toString()); + + if (options.language) formData.append('language', 
options.language); + if (options.prompt) formData.append('prompt', options.prompt); + if (options.response_format) formData.append('response_format', options.response_format); + if (options.temperature !== undefined) formData.append('temperature', options.temperature.toString()); + if (options.timestamp_granularities) { + options.timestamp_granularities.forEach(granularity => { + formData.append('timestamp_granularities[]', granularity); + }); + } + + // Use request directly to avoid JSON.stringify on FormData + const response = await this.http.request('/v1/audio/transcriptions', { + method: 'POST', + body: formData, + headers: { + // Remove Content-Type to let browser set it with boundary parameter + 'Content-Type': undefined as any, + } + }); + + const data = await response.json(); + return data as TranscriptionResponse; + } + + /** + * Translates audio directly to English text using Whisper API + * + * @param file Audio file to translate + * @param options Translation options + * @returns Translation result + */ + async translate( + file: File | Blob, + options: TranslationOptions + ): Promise { + const formData = new FormData(); + formData.append('file', file); + formData.append('model', options.model.toString()); + + if (options.prompt) formData.append('prompt', options.prompt); + if (options.response_format) formData.append('response_format', options.response_format); + if (options.temperature !== undefined) formData.append('temperature', options.temperature.toString()); + + // Use request directly to avoid JSON.stringify on FormData + const response = await this.http.request('/v1/audio/translations', { + method: 'POST', + body: formData, + headers: { + // Remove Content-Type to let browser set it with boundary parameter + 'Content-Type': undefined as any, + } + }); + + const data = await response.json(); + return data as TranscriptionResponse; + } +} \ No newline at end of file diff --git a/packages/sdk/ts/src/resources/index.ts b/packages/sdk/ts/src/resources/index.ts index 518461e3e..d65df0148 100644 --- a/packages/sdk/ts/src/resources/index.ts +++ b/packages/sdk/ts/src/resources/index.ts @@ -1,4 +1,5 @@ export { AppsResource } from './apps'; +export { AudioResource } from './audio'; export { BalanceResource } from './balance'; export { ModelsResource } from './models'; export { PaymentsResource } from './payments'; diff --git a/packages/sdk/ts/src/resources/models.ts b/packages/sdk/ts/src/resources/models.ts index c75edbada..16816ca98 100644 --- a/packages/sdk/ts/src/resources/models.ts +++ b/packages/sdk/ts/src/resources/models.ts @@ -6,8 +6,10 @@ import { GeminiModels, OpenRouterModels, OpenAIImageModels, + OpenAIAudioModels, SupportedModel, SupportedImageModel, + SupportedAudioModel, } from '../supported-models'; export class ModelsResource extends BaseResource { @@ -32,4 +34,8 @@ export class ModelsResource extends BaseResource { async listSupportedImageModels(): Promise { return OpenAIImageModels; } + + async listSupportedAudioModels(): Promise { + return OpenAIAudioModels; + } } diff --git a/packages/sdk/ts/src/supported-models/audio/openai.ts b/packages/sdk/ts/src/supported-models/audio/openai.ts new file mode 100644 index 000000000..ca0fcf3e0 --- /dev/null +++ b/packages/sdk/ts/src/supported-models/audio/openai.ts @@ -0,0 +1,16 @@ +import { SupportedAudioModel } from '../types'; + +export type OpenAIAudioModel = 'whisper-1' | 'whisper-large-v3'; + +export const OpenAIAudioModels: SupportedAudioModel[] = [ + { + model_id: 'whisper-1', + cost_per_minute: 0.006, // $0.006 
per minute as per OpenAI pricing + provider: 'OpenAI', + }, + { + model_id: 'whisper-large-v3', + cost_per_minute: 0.006, // $0.006 per minute as per OpenAI pricing + provider: 'OpenAI', + }, +]; \ No newline at end of file diff --git a/packages/sdk/ts/src/supported-models/index.ts b/packages/sdk/ts/src/supported-models/index.ts index d67548322..f6ea89f6c 100644 --- a/packages/sdk/ts/src/supported-models/index.ts +++ b/packages/sdk/ts/src/supported-models/index.ts @@ -6,3 +6,4 @@ export * from './chat/openai'; export * from './chat/openrouter'; export * from './image/openai'; export * from './responses/openai'; +export * from './audio/openai'; diff --git a/packages/sdk/ts/src/supported-models/types.ts b/packages/sdk/ts/src/supported-models/types.ts index ffa3ce2f3..75c495c18 100644 --- a/packages/sdk/ts/src/supported-models/types.ts +++ b/packages/sdk/ts/src/supported-models/types.ts @@ -77,3 +77,11 @@ export interface SupportedTool { | 'per_call' | 'per_gb_per_day'; } + +export interface SupportedAudioModel { + model_id: string; + cost_per_minute: number; + provider: string; +} + +export type SupportedAudioModelType = 'transcription' | 'translation'; diff --git a/packages/tests/provider-smoke/openai-audio-transcription.test.ts b/packages/tests/provider-smoke/openai-audio-transcription.test.ts new file mode 100644 index 000000000..ec2f6a199 --- /dev/null +++ b/packages/tests/provider-smoke/openai-audio-transcription.test.ts @@ -0,0 +1,113 @@ +import { + OpenAIAudioModels, + createEchoOpenAI, +} from '@merit-systems/echo-typescript-sdk'; +import OpenAI, { toFile } from 'openai'; +import { beforeAll, describe, expect, it } from 'vitest'; +import fs from 'fs'; +import path from 'path'; +import { + ECHO_APP_ID, + assertEnv, + baseRouterUrl, + getApiErrorDetails, + getToken, +} from './test-helpers'; + +beforeAll(assertEnv); + +describe.concurrent('OpenAI audio transcription per model', () => { + const testAudioPath = path.join(__dirname, 'test-audio', 'sample.wav'); + + for (const { model_id } of OpenAIAudioModels) { + it(`OpenAI audio transcription ${model_id}`, async () => { + try { + // Verify test audio exists + if (!fs.existsSync(testAudioPath)) { + throw new Error(`Test audio not found at: ${testAudioPath}`); + } + + // Initialize OpenAI client pointing to Echo + const client = new OpenAI({ + apiKey: process.env.ECHO_API_KEY || '', + baseURL: baseRouterUrl, + }); + + // Create file object for OpenAI + const audioFile = await toFile( + fs.createReadStream(testAudioPath), + 'sample.wav', + { + type: 'audio/wav', + } + ); + + // Make transcription request using raw OpenAI API + const response = await client.audio.transcriptions.create({ + model: model_id, + file: audioFile, + response_format: 'json', + }); + + // Verify response + expect(response).toBeDefined(); + expect(response.text).toBeDefined(); + expect(typeof response.text).toBe('string'); + expect(response.text.length).toBeGreaterThan(0); + } catch (err) { + const details = getApiErrorDetails(err); + throw new Error( + `[audioTranscription] OpenAI ${model_id} failed: ${details}` + ); + } + }); + } +}); + +describe.concurrent('OpenAI audio translation per model', () => { + const testAudioPath = path.join(__dirname, 'test-audio', 'sample.wav'); + + for (const { model_id } of OpenAIAudioModels) { + it(`OpenAI audio translation ${model_id}`, async () => { + try { + // Verify test audio exists + if (!fs.existsSync(testAudioPath)) { + throw new Error(`Test audio not found at: ${testAudioPath}`); + } + + // Initialize OpenAI client 
pointing to Echo + const client = new OpenAI({ + apiKey: process.env.ECHO_API_KEY || '', + baseURL: baseRouterUrl, + }); + + // Create file object for OpenAI + const audioFile = await toFile( + fs.createReadStream(testAudioPath), + 'sample.wav', + { + type: 'audio/wav', + } + ); + + // Make translation request using raw OpenAI API + const response = await client.audio.translations.create({ + model: model_id, + file: audioFile, + response_format: 'json', + }); + + // Verify response + expect(response).toBeDefined(); + expect(response.text).toBeDefined(); + expect(typeof response.text).toBe('string'); + expect(response.text.length).toBeGreaterThan(0); + } catch (err) { + const details = getApiErrorDetails(err); + throw new Error( + `[audioTranslation] OpenAI ${model_id} failed: ${details}` + ); + } + }); + } +}); diff --git a/packages/tests/provider-smoke/test-audio/sample.wav b/packages/tests/provider-smoke/test-audio/sample.wav new file mode 100644 index 000000000..abf4dc7e4 Binary files /dev/null and b/packages/tests/provider-smoke/test-audio/sample.wav differ diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index e383c6800..bbdb51a84 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -416,6 +416,9 @@ importers: '@types/express': specifier: ^4.17.21 version: 4.17.23 + '@types/form-data': + specifier: ^2.5.2 + version: 2.5.2 '@types/multer': specifier: ^2.0.0 version: 2.0.0 @@ -437,6 +440,9 @@ importers: express: specifier: ^4.18.3 version: 4.21.2 + form-data: + specifier: ^4.0.4 + version: 4.0.4 jose: specifier: ^6.0.11 version: 6.0.11 @@ -448,7 +454,7 @@ importers: version: 2.7.0 openai: specifier: ^4.97.0 - version: 4.104.0(ws@7.5.10(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.76) + version: 4.104.0(ws@8.18.2(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.76) prisma: specifier: 6.16.0 version: 6.16.0(magicast@0.3.5)(typescript@5.8.3) @@ -4121,6 +4127,10 @@ packages: '@types/express@4.17.23': resolution: {integrity: sha512-Crp6WY9aTYP3qPi2wGDo9iUe/rceX01UMhnF1jmwDcKCFM6cx7YhGP/Mpr3y9AASpfHixIG0E6azCcL5OcDHsQ==} + '@types/form-data@2.5.2': + resolution: {integrity: sha512-tfmcyHn1Pp9YHAO5r40+UuZUPAZbUEgqTel3EuEKpmF9hPkXgR4l41853raliXnb4gwyPNoQOfvgGGlHN5WSog==} + deprecated: This is a stub types definition. form-data provides its own type definitions, so you do not need this installed. 
+ '@types/hast@3.0.4': resolution: {integrity: sha512-WPs+bbQw5aCj+x6laNGWLH3wviHtoCv/P3+otBhbOhJgG8qtpdAMlTCxLtsTWA7LH1Oh/bFCHsBn0TPS5m30EQ==} @@ -6163,8 +6173,8 @@ packages: form-data-encoder@1.7.2: resolution: {integrity: sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==} - form-data@4.0.3: - resolution: {integrity: sha512-qsITQPfmvMOSAdeyZ+12I1c+CKSstAFAwu+97zrnWAbIr5u8wfsExUzCesVLC8NgHuRUqNN4Zy6UPWUTRGslcA==} + form-data@4.0.4: + resolution: {integrity: sha512-KrGhL9Q4zjj0kiUt5OO4Mr/A/jlI2jDYs5eHBpYHPcBEVSiipAvn2Ko2HnPe20rmcuuvMHNdZFp+4IlGTMF0Ow==} engines: {node: '>= 6'} formdata-node@4.4.1: @@ -14035,6 +14045,10 @@ snapshots: '@types/qs': 6.14.0 '@types/serve-static': 1.15.8 + '@types/form-data@2.5.2': + dependencies: + form-data: 4.0.4 + '@types/hast@3.0.4': dependencies: '@types/unist': 3.0.3 @@ -14083,7 +14097,7 @@ snapshots: '@types/node-fetch@2.6.12': dependencies: '@types/node': 20.19.1 - form-data: 4.0.3 + form-data: 4.0.4 '@types/node@12.20.55': {} @@ -14158,7 +14172,7 @@ snapshots: '@types/cookiejar': 2.1.5 '@types/methods': 1.1.4 '@types/node': 20.19.1 - form-data: 4.0.3 + form-data: 4.0.4 '@types/supertest@6.0.3': dependencies: @@ -15391,7 +15405,7 @@ snapshots: axios@1.10.0: dependencies: follow-redirects: 1.15.9 - form-data: 4.0.3 + form-data: 4.0.4 proxy-from-env: 1.1.0 transitivePeerDependencies: - debug @@ -16798,7 +16812,7 @@ snapshots: form-data-encoder@1.7.2: {} - form-data@4.0.3: + form-data@4.0.4: dependencies: asynckit: 0.4.0 combined-stream: 1.0.8 @@ -18730,7 +18744,7 @@ snapshots: is-docker: 2.2.1 is-wsl: 2.2.0 - openai@4.104.0(ws@7.5.10(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.76): + openai@4.104.0(ws@8.18.2(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.76): dependencies: '@types/node': 18.19.112 '@types/node-fetch': 2.6.12 @@ -18740,7 +18754,7 @@ snapshots: formdata-node: 4.4.1 node-fetch: 2.7.0 optionalDependencies: - ws: 7.5.10(bufferutil@4.0.9)(utf-8-validate@5.0.10) + ws: 8.18.2(bufferutil@4.0.9)(utf-8-validate@5.0.10) zod: 3.25.76 transitivePeerDependencies: - encoding @@ -20145,7 +20159,7 @@ snapshots: cookiejar: 2.1.4 debug: 4.4.1 fast-safe-stringify: 2.1.1 - form-data: 4.0.3 + form-data: 4.0.4 formidable: 2.1.5 methods: 1.1.2 mime: 2.6.0
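
For consumers who already use the OpenAI SDK, the smoke tests above double as usage documentation. A condensed sketch in TypeScript, assuming the same ECHO_API_KEY and baseRouterUrl the tests read from test-helpers; the cost line mirrors the $0.006-per-minute proration applied in OpenAIAudioProvider.handleBody:

import OpenAI, { toFile } from 'openai';
import fs from 'fs';

const client = new OpenAI({
  apiKey: process.env.ECHO_API_KEY || '',
  baseURL: baseRouterUrl, // Echo router URL, as in the smoke tests
});

async function transcribeSample(): Promise<void> {
  const audioFile = await toFile(
    fs.createReadStream('test-audio/sample.wav'),
    'sample.wav',
    { type: 'audio/wav' }
  );

  const transcript = await client.audio.transcriptions.create({
    model: 'whisper-1',
    file: audioFile,
    response_format: 'verbose_json', // includes duration and language
  });

  // Mirror the router's billing: $0.006 per minute, prorated by seconds
  // (handleBody defaults to 1 second when duration is missing).
  const seconds = (transcript as any).duration ?? 1;
  const expectedUsd = 0.006 * (seconds / 60);

  console.log(transcript.text, `expected charge ~ $${expectedUsd.toFixed(6)}`);
}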