diff --git a/packages/usdk/packages/upstreet-agent/packages/react-agents/classes/discord-manager.ts b/packages/usdk/packages/upstreet-agent/packages/react-agents/classes/discord-manager.ts index 65b52c24b..d5f1d468b 100644 --- a/packages/usdk/packages/upstreet-agent/packages/react-agents/classes/discord-manager.ts +++ b/packages/usdk/packages/upstreet-agent/packages/react-agents/classes/discord-manager.ts @@ -169,6 +169,10 @@ export class DiscordBot extends EventTarget { const status = await discordBotClient.status(); if (signal.aborted) return; + if (status.error) { + throw new Error(`Discord connection error: ${status.error}`); + } + console.log('discord connect 2'); let connectableChannels = status.channels .filter((channel: any) => [0, 2].includes(channel.type)); diff --git a/packages/usdk/packages/upstreet-agent/packages/react-agents/devices/audio-transcriber.mjs b/packages/usdk/packages/upstreet-agent/packages/react-agents/devices/audio-transcriber.mjs index cdfc304f6..bac0687aa 100644 --- a/packages/usdk/packages/upstreet-agent/packages/react-agents/devices/audio-transcriber.mjs +++ b/packages/usdk/packages/upstreet-agent/packages/react-agents/devices/audio-transcriber.mjs @@ -1,4 +1,4 @@ -import { transcribeRealtime } from '../util/audio-perception.mjs'; +import { transcribeRealtime, transcribeRealtimeSTT } from '../util/audio-perception.mjs'; import { resample } from 'codecs/resample.mjs'; import { AudioChunker } from '../util/audio-chunker.mjs'; @@ -112,4 +112,102 @@ export class TranscribedVoiceInput extends EventTarget { // data: null, // })); } +} + +export class RealtimeTranscribedVoiceInput extends EventTarget { + static transcribeSampleRate = 16000; + abortController; + + constructor({ + audioInput, // EventEmitter + sampleRate, + codecs, + }) { + if (!audioInput) { + throw new Error('no audio input'); + } + if (!sampleRate) { + throw new Error('no sample rate'); + } + if (!codecs) { + throw new Error('no codecs'); + } + + super(); + + this.abortController = new AbortController(); + const { + signal, + } = this.abortController; + + (async () => { + const transcription = await transcribeRealtimeSTT({ + sampleRate: RealtimeTranscribedVoiceInput.transcribeSampleRate, + }); + + // Forward all relevant events + transcription.addEventListener('partial', e => { + console.log('got partial transcription', e); + }); + + transcription.addEventListener('transcription', e => { + console.log('got full transcription', e); + this.dispatchEvent(new MessageEvent('transcription', { + data: { + transcript: e.data.transcript, + }, + })); + }); + + signal.addEventListener('abort', () => { + transcription.close(); + }); + + const openPromise = new Promise((accept, reject) => { + transcription.addEventListener('open', e => { + accept(null); + }); + transcription.addEventListener('error', e => { + reject(e); + }); + }); + + const audioChunker = new AudioChunker({ + sampleRate: RealtimeTranscribedVoiceInput.transcribeSampleRate, + chunkSize: 1536, + }); + + const ondata = async (f32) => { + await openPromise; + + // resample if needed + if (sampleRate !== RealtimeTranscribedVoiceInput.transcribeSampleRate) { + f32 = resample(f32, sampleRate, RealtimeTranscribedVoiceInput.transcribeSampleRate); + } + + const frames = audioChunker.write(f32); + for (const frame of frames) { + transcription.write(frame); + } + }; + audioInput.on('data', ondata); + + const onend = () => { + this.close(); + }; + audioInput.on('end', onend); + + const cleanup = () => { + audioInput.removeListener('data', ondata); + audioInput.removeListener('end', onend); + }; + signal.addEventListener('abort', () => { + cleanup(); + }); + })(); + } + + close() { + this.abortController.abort(); + } } \ No newline at end of file diff --git a/packages/usdk/packages/upstreet-agent/packages/react-agents/lib/discord/discord-client.js b/packages/usdk/packages/upstreet-agent/packages/react-agents/lib/discord/discord-client.js index bd467de2b..63841309d 100644 --- a/packages/usdk/packages/upstreet-agent/packages/react-agents/lib/discord/discord-client.js +++ b/packages/usdk/packages/upstreet-agent/packages/react-agents/lib/discord/discord-client.js @@ -20,6 +20,7 @@ import { discordBotEndpointUrl, } from '../../util/endpoints.mjs'; import { + RealtimeTranscribedVoiceInput, TranscribedVoiceInput, } from '../../devices/audio-transcriber.mjs'; @@ -243,7 +244,14 @@ export class DiscordOutput extends EventTarget { this.userStreams.delete(userId); }); - const transcribedVoiceInput = new TranscribedVoiceInput({ + // const transcribedVoiceInput = new TranscribedVoiceInput({ + // audioInput: userStream, + // sampleRate, + // codecs, + // jwt, + // }); + + const transcribedVoiceInput = new RealtimeTranscribedVoiceInput({ audioInput: userStream, sampleRate, codecs, diff --git a/packages/usdk/packages/upstreet-agent/packages/react-agents/util/audio-perception.mjs b/packages/usdk/packages/upstreet-agent/packages/react-agents/util/audio-perception.mjs index be879f390..7e66d0c9d 100644 --- a/packages/usdk/packages/upstreet-agent/packages/react-agents/util/audio-perception.mjs +++ b/packages/usdk/packages/upstreet-agent/packages/react-agents/util/audio-perception.mjs @@ -7,6 +7,9 @@ import { AudioEncodeStream } from 'codecs/audio-encode.mjs'; import { QueueManager } from 'queue-manager'; import { aiHost, + realtimeSTTHost, + realtimeSTTControlPort, + realtimeSTTDataPort, } from './endpoints.mjs'; const defaultTranscriptionModel = 'whisper-1'; @@ -388,4 +391,115 @@ export const transcribeRealtime = ({ console.warn('error creating transcription', error); throw error; } +}; + +export const transcribeRealtimeSTT = async ({ sampleRate }) => { + if (!sampleRate) { + throw new Error('no sample rate'); + } + + try { + // const controlUrl = `ws://${serverConfig.host}:${serverConfig.controlPort}`; + // const dataUrl = `ws://${serverConfig.host}:${serverConfig.dataPort}`; + + const controlUrl = `wss://${realtimeSTTHost}-${realtimeSTTControlPort}.proxy.runpod.net`; + const dataUrl = `wss://${realtimeSTTHost}-${realtimeSTTDataPort}.proxy.runpod.net`; + + + const controlSocket = new WebSocket(controlUrl); + const dataSocket = new WebSocket(dataUrl); + + const transcription = new EventTarget(); + + controlSocket.addEventListener('open', () => { + console.log('Control socket connected'); + const configs = [ + { + command: "set_parameter", + parameter: "language", + value: "en" + }, + ]; + configs.forEach(config => { + controlSocket.send(JSON.stringify(config)); + console.log(`Configured ${config.parameter}: ${config.value}`); + }); + transcription.dispatchEvent(new MessageEvent('open', { data: null })); + }); + + controlSocket.addEventListener('message', (data) => { + console.log('Control message:', data); + }); + + dataSocket.addEventListener('open', () => { + console.log('Data socket connected'); + }); + + dataSocket.addEventListener('message', (data) => { + try { + const result = JSON.parse(data.data); + if (result.type === 'realtime') { + transcription.dispatchEvent(new MessageEvent('partial', { + data: { + transcript: result.text, + }, + })); + } else if (result.type === 'fullSentence') { + transcription.dispatchEvent(new MessageEvent('transcription', { + data: { + transcript: result.text, + }, + })); + } + } catch (err) { + console.log('Raw message:', data.data); + } + }); + + controlSocket.addEventListener('error', (e) => { + console.warn('Control socket error:', e); + transcription.dispatchEvent(new MessageEvent('error', { data: e })); + }); + + dataSocket.addEventListener('error', (e) => { + console.warn('Data socket error:', e); + transcription.dispatchEvent(new MessageEvent('error', { data: e })); + }); + + transcription.write = (f32) => { + const i16 = floatTo16Bit(f32); + const metadata = { + sampleRate: sampleRate, + channels: 1, + encoding: 'PCM16' + }; + const metadataStr = JSON.stringify(metadata); + const metadataLength = Buffer.alloc(4); + metadataLength.writeUInt32LE(metadataStr.length); + + const message = Buffer.concat([ + metadataLength, + Buffer.from(metadataStr), + Buffer.from(i16.buffer) + ]); + + if (dataSocket.readyState === WebSocket.OPEN) { + dataSocket.send(message); + } + }; + + transcription.close = () => { + if (controlSocket.readyState === WebSocket.OPEN) { + controlSocket.close(); + } + if (dataSocket.readyState === WebSocket.OPEN) { + dataSocket.close(); + } + }; + + return transcription; + } catch (error) { + console.warn('error creating STT transcription:', error); + throw error; + } }; \ No newline at end of file diff --git a/packages/usdk/packages/upstreet-agent/packages/react-agents/util/endpoints.mjs b/packages/usdk/packages/upstreet-agent/packages/react-agents/util/endpoints.mjs index e602d0b91..7d2aa8c51 100644 --- a/packages/usdk/packages/upstreet-agent/packages/react-agents/util/endpoints.mjs +++ b/packages/usdk/packages/upstreet-agent/packages/react-agents/util/endpoints.mjs @@ -8,4 +8,8 @@ export const chatEndpointUrl = `https://chat.upstreet.ai`; export const discordBotEndpointUrl = `https://discord-bot-upstreet.fly.dev`; export const telnyxEndpointUrl = 'https://telnyx.isekaichat.workers.dev'; export const workersHost = `isekaichat.workers.dev`; -export const usdkDiscordUrl = `https://upstreet.ai/usdk-discord`; \ No newline at end of file +export const usdkDiscordUrl = `https://upstreet.ai/usdk-discord`; + +export const realtimeSTTHost = `g8cgs1qznz533i`; +export const realtimeSTTControlPort = 8011; +export const realtimeSTTDataPort = 8012;