Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,10 @@ export class DiscordBot extends EventTarget {
const status = await discordBotClient.status();
if (signal.aborted) return;

if (status.error) {
throw new Error(`Discord connection error: ${status.error}`);
}

console.log('discord connect 2');
let connectableChannels = status.channels
.filter((channel: any) => [0, 2].includes(channel.type));
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { transcribeRealtime } from '../util/audio-perception.mjs';
import { transcribeRealtime, transcribeRealtimeSTT } from '../util/audio-perception.mjs';
import { resample } from 'codecs/resample.mjs';
import { AudioChunker } from '../util/audio-chunker.mjs';

Expand Down Expand Up @@ -112,4 +112,102 @@ export class TranscribedVoiceInput extends EventTarget {
// data: null,
// }));
}
}

export class RealtimeTranscribedVoiceInput extends EventTarget {
static transcribeSampleRate = 16000;
abortController;

constructor({
audioInput, // EventEmitter
sampleRate,
codecs,
}) {
if (!audioInput) {
throw new Error('no audio input');
}
if (!sampleRate) {
throw new Error('no sample rate');
}
if (!codecs) {
throw new Error('no codecs');
}

super();

this.abortController = new AbortController();
const {
signal,
} = this.abortController;

(async () => {
const transcription = await transcribeRealtimeSTT({
sampleRate: RealtimeTranscribedVoiceInput.transcribeSampleRate,
});

// Forward all relevant events
transcription.addEventListener('partial', e => {
console.log('got partial transcription', e);
});

transcription.addEventListener('transcription', e => {
console.log('got full transcription', e);
this.dispatchEvent(new MessageEvent('transcription', {
data: {
transcript: e.data.transcript,
},
}));
});

signal.addEventListener('abort', () => {
transcription.close();
});

const openPromise = new Promise((accept, reject) => {
transcription.addEventListener('open', e => {
accept(null);
});
transcription.addEventListener('error', e => {
reject(e);
});
});

const audioChunker = new AudioChunker({
sampleRate: RealtimeTranscribedVoiceInput.transcribeSampleRate,
chunkSize: 1536,
});

const ondata = async (f32) => {
await openPromise;

// resample if needed
if (sampleRate !== RealtimeTranscribedVoiceInput.transcribeSampleRate) {
f32 = resample(f32, sampleRate, RealtimeTranscribedVoiceInput.transcribeSampleRate);
}

const frames = audioChunker.write(f32);
for (const frame of frames) {
transcription.write(frame);
}
};
audioInput.on('data', ondata);

const onend = () => {
this.close();
};
audioInput.on('end', onend);

const cleanup = () => {
audioInput.removeListener('data', ondata);
audioInput.removeListener('end', onend);
};
signal.addEventListener('abort', () => {
cleanup();
});
})();
}

close() {
this.abortController.abort();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import {
discordBotEndpointUrl,
} from '../../util/endpoints.mjs';
import {
RealtimeTranscribedVoiceInput,
TranscribedVoiceInput,
} from '../../devices/audio-transcriber.mjs';

Expand Down Expand Up @@ -243,7 +244,14 @@ export class DiscordOutput extends EventTarget {
this.userStreams.delete(userId);
});

const transcribedVoiceInput = new TranscribedVoiceInput({
// const transcribedVoiceInput = new TranscribedVoiceInput({
// audioInput: userStream,
// sampleRate,
// codecs,
// jwt,
// });

const transcribedVoiceInput = new RealtimeTranscribedVoiceInput({
audioInput: userStream,
sampleRate,
codecs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ import { AudioEncodeStream } from 'codecs/audio-encode.mjs';
import { QueueManager } from 'queue-manager';
import {
aiHost,
realtimeSTTHost,
realtimeSTTControlPort,
realtimeSTTDataPort,
} from './endpoints.mjs';

const defaultTranscriptionModel = 'whisper-1';
Expand Down Expand Up @@ -388,4 +391,115 @@ export const transcribeRealtime = ({
console.warn('error creating transcription', error);
throw error;
}
};

export const transcribeRealtimeSTT = async ({ sampleRate }) => {
if (!sampleRate) {
throw new Error('no sample rate');
}

try {
// const controlUrl = `ws://${serverConfig.host}:${serverConfig.controlPort}`;
// const dataUrl = `ws://${serverConfig.host}:${serverConfig.dataPort}`;

const controlUrl = `wss://${realtimeSTTHost}-${realtimeSTTControlPort}.proxy.runpod.net`;
const dataUrl = `wss://${realtimeSTTHost}-${realtimeSTTDataPort}.proxy.runpod.net`;


const controlSocket = new WebSocket(controlUrl);
const dataSocket = new WebSocket(dataUrl);

const transcription = new EventTarget();

controlSocket.addEventListener('open', () => {
console.log('Control socket connected');
const configs = [
{
command: "set_parameter",
parameter: "language",
value: "en"
},
];
configs.forEach(config => {
controlSocket.send(JSON.stringify(config));
console.log(`Configured ${config.parameter}: ${config.value}`);
});
transcription.dispatchEvent(new MessageEvent('open', { data: null }));
});

controlSocket.addEventListener('message', (data) => {
console.log('Control message:', data);
});

dataSocket.addEventListener('open', () => {
console.log('Data socket connected');
});

dataSocket.addEventListener('message', (data) => {
try {
const result = JSON.parse(data.data);
if (result.type === 'realtime') {
transcription.dispatchEvent(new MessageEvent('partial', {
data: {
transcript: result.text,
},
}));
} else if (result.type === 'fullSentence') {
transcription.dispatchEvent(new MessageEvent('transcription', {
data: {
transcript: result.text,
},
}));
}
} catch (err) {
console.log('Raw message:', data.data);
}
});

controlSocket.addEventListener('error', (e) => {
console.warn('Control socket error:', e);
transcription.dispatchEvent(new MessageEvent('error', { data: e }));
});

dataSocket.addEventListener('error', (e) => {
console.warn('Data socket error:', e);
transcription.dispatchEvent(new MessageEvent('error', { data: e }));
});

transcription.write = (f32) => {
const i16 = floatTo16Bit(f32);
const metadata = {
sampleRate: sampleRate,
channels: 1,
encoding: 'PCM16'
};
const metadataStr = JSON.stringify(metadata);
const metadataLength = Buffer.alloc(4);
metadataLength.writeUInt32LE(metadataStr.length);

const message = Buffer.concat([
metadataLength,
Buffer.from(metadataStr),
Buffer.from(i16.buffer)
]);

if (dataSocket.readyState === WebSocket.OPEN) {
dataSocket.send(message);
}
};

transcription.close = () => {
if (controlSocket.readyState === WebSocket.OPEN) {
controlSocket.close();
}
if (dataSocket.readyState === WebSocket.OPEN) {
dataSocket.close();
}
};

return transcription;
} catch (error) {
console.warn('error creating STT transcription:', error);
throw error;
}
};
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,8 @@ export const chatEndpointUrl = `https://chat.upstreet.ai`;
export const discordBotEndpointUrl = `https://discord-bot-upstreet.fly.dev`;
export const telnyxEndpointUrl = 'https://telnyx.isekaichat.workers.dev';
export const workersHost = `isekaichat.workers.dev`;
export const usdkDiscordUrl = `https://upstreet.ai/usdk-discord`;
export const usdkDiscordUrl = `https://upstreet.ai/usdk-discord`;

export const realtimeSTTHost = `g8cgs1qznz533i`;
export const realtimeSTTControlPort = 8011;
export const realtimeSTTDataPort = 8012;