Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions lib/AppInfo/Application.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
use OCA\OpenAi\Notification\Notifier;
use OCA\OpenAi\OldProcessing\Translation\TranslationProvider as OldTranslationProvider;
use OCA\OpenAi\TaskProcessing\AudioToAudioChatProvider;
use OCA\OpenAi\TaskProcessing\AudioToAudioTranslateProvider;
use OCA\OpenAi\TaskProcessing\AudioToAudioTranslateTaskType;
use OCA\OpenAi\TaskProcessing\AudioToTextProvider;
use OCA\OpenAi\TaskProcessing\ChangeToneProvider;
use OCA\OpenAi\TaskProcessing\ChangeToneTaskType;
Expand Down Expand Up @@ -104,6 +106,8 @@ public function register(IRegistrationContext $context): void {
// Task processing
if ($this->appConfig->getValueString(Application::APP_ID, 'translation_provider_enabled', '1') === '1') {
$context->registerTaskProcessingProvider(TranslateProvider::class);
$context->registerTaskProcessingTaskType(AudioToAudioTranslateTaskType::class);
$context->registerTaskProcessingProvider(AudioToAudioTranslateProvider::class);
Comment on lines +109 to +110
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

checks for STT and TTS providers would be nice too

}
if ($this->appConfig->getValueString(Application::APP_ID, 'stt_provider_enabled', '1') === '1') {
$context->registerTaskProcessingProvider(AudioToTextProvider::class);
Expand Down
127 changes: 127 additions & 0 deletions lib/Service/TranslationService.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
<?php

/**
* SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors
* SPDX-License-Identifier: AGPL-3.0-or-later
*/

namespace OCA\OpenAi\Service;

use OCP\ICacheFactory;
use OCP\L10N\IFactory;
use Psr\Log\LoggerInterface;

/**
 * Translates text through the configured completion backend, one chunk at a time.
 *
 * Each chunk's translation is cached in the distributed cache under a key made of
 * the source language code, the target language code and the MD5 of the chunk.
 */
class TranslationService {
	public const SYSTEM_PROMPT = 'You are a translations expert that ONLY outputs a valid JSON with the translated text in the following format: { "translation": "<translated text>" } .';
	public const JSON_RESPONSE_FORMAT = [
		'response_format' => [
			'type' => 'json_schema',
			'json_schema' => [
				'name' => 'TranslationResponse',
				'description' => 'A JSON object containing the translated text',
				'strict' => true,
				'schema' => [
					'type' => 'object',
					'properties' => [
						'translation' => [
							'type' => 'string',
							'description' => 'The translated text',
						],
					],
					'required' => [ 'translation' ],
					'additionalProperties' => false,
				],
			],
		],
	];

	public function __construct(
		private OpenAiSettingsService $openAiSettingsService,
		private LoggerInterface $logger,
		private OpenAiAPIService $openAiAPIService,
		private ChunkService $chunkService,
		private ICacheFactory $cacheFactory,
		private IFactory $l10nFactory,
	) {
	}

	/**
	 * Build a lookup of the languages known to the l10n factory.
	 *
	 * @return array<string, string> language name indexed by language code
	 */
	private function getCoreLanguagesByCode(): array {
		$coreL = $this->l10nFactory->getLanguages();
		// array_column() always returns an array, even when both lists are empty
		// (array_reduce() without an initial value would return null in that case)
		return array_column(
			array_merge($coreL['commonLanguages'], $coreL['otherLanguages']),
			'name',
			'code',
		);
	}

	/**
	 * Translate a text chunk by chunk and return the concatenated translations.
	 *
	 * Chunks for which the backend returns nothing, or something that is not the
	 * expected {"translation": "..."} JSON object, are logged and skipped, so the
	 * returned text may be partial.
	 *
	 * @param string $inputText text to translate
	 * @param string $sourceLanguageCode language code of the input, or 'detect_language' to leave it out of the prompt
	 * @param string $targetLanguageCode language code to translate to
	 * @param string $model model passed to the completion request
	 * @param int|null $maxTokens passed to the chunk splitter and to the completion request
	 * @param string|null $userId user the completion requests are made for
	 * @param callable|null $reportProgress called after each chunk with the fraction of chunks handled so far
	 * @return string the translated text, empty when there was nothing to translate
	 */
	public function translate(
		string $inputText, string $sourceLanguageCode, string $targetLanguageCode, string $model, ?int $maxTokens,
		?string $userId, ?callable $reportProgress = null,
	): string {
		$chunks = $this->chunkService->chunkSplitPrompt($inputText, true, $maxTokens);
		$chunkCount = count($chunks);
		if ($chunkCount === 0) {
			// nothing to translate; this also keeps the progress computation from dividing by zero
			return '';
		}

		$coreLanguages = $this->getCoreLanguagesByCode();
		// fall back to the raw code when the language is unknown to the l10n factory
		$toLanguage = $coreLanguages[$targetLanguageCode] ?? $targetLanguageCode;

		if ($sourceLanguageCode !== 'detect_language') {
			$fromLanguage = $coreLanguages[$sourceLanguageCode] ?? $sourceLanguageCode;
			$promptStart = 'Translate the following text from ' . $fromLanguage . ' to ' . $toLanguage . ': ';
		} else {
			$promptStart = 'Translate the following text to ' . $toLanguage . ': ';
		}

		// the cache handle does not depend on the chunk, so it is obtained once
		$cache = $this->cacheFactory->createDistributed('integration_openai');

		$translation = '';
		$processedChunks = 0;
		foreach ($chunks as $chunk) {
			$processedChunks++;
			// computed from the counter so the last chunk reports exactly 1.0
			$progress = $processedChunks / (float)$chunkCount;
			$cacheKey = $sourceLanguageCode . '/' . $targetLanguageCode . '/' . md5($chunk);

			$cached = $cache->get($cacheKey);
			// explicit check instead of truthiness so that a cached "0" is still a hit
			if (is_string($cached) && $cached !== '') {
				$this->logger->debug('Using cached translation', ['cached' => $cached, 'cacheKey' => $cacheKey]);
				$translation .= $cached;
				if ($reportProgress !== null) {
					$reportProgress($progress);
				}
				continue;
			}
			$prompt = $promptStart . PHP_EOL . PHP_EOL . $chunk;

			if ($this->openAiAPIService->isUsingOpenAi() || $this->openAiSettingsService->getChatEndpointEnabled()) {
				$completionsObj = $this->openAiAPIService->createChatCompletion(
					$userId, $model, $prompt, self::SYSTEM_PROMPT, null, 1, $maxTokens, self::JSON_RESPONSE_FORMAT
				);
				$completions = $completionsObj['messages'];
			} else {
				// no chat endpoint: the system prompt is appended to the prompt itself
				$completions = $this->openAiAPIService->createCompletion(
					$userId, $prompt . PHP_EOL . self::SYSTEM_PROMPT . PHP_EOL . PHP_EOL, 1, $model, $maxTokens
				);
			}

			if ($reportProgress !== null) {
				$reportProgress($progress);
			}

			if (count($completions) === 0) {
				$this->logger->error('Empty translation response received for chunk');
				continue;
			}

			$completion = array_pop($completions);
			$decodedCompletion = json_decode($completion, true);
			$translatedChunk = is_array($decodedCompletion) ? ($decodedCompletion['translation'] ?? null) : null;
			// compare against '' rather than using empty(), which would reject a legitimate "0"
			if (!is_string($translatedChunk) || $translatedChunk === '') {
				$this->logger->error('Invalid translation response received for chunk', ['response' => $completion]);
				continue;
			}
			$translation .= $translatedChunk;
			$cache->set($cacheKey, $translatedChunk);
		}
		return $translation;
	}
}
195 changes: 195 additions & 0 deletions lib/TaskProcessing/AudioToAudioTranslateProvider.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
<?php

declare(strict_types=1);

/**
* SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors
* SPDX-License-Identifier: AGPL-3.0-or-later
*/

namespace OCA\OpenAi\TaskProcessing;

use Exception;
use OCA\OpenAi\AppInfo\Application;
use OCA\OpenAi\Service\OpenAiAPIService;
use OCA\OpenAi\Service\OpenAiSettingsService;
use OCA\OpenAi\Service\TranslationService;
use OCA\OpenAi\Service\WatermarkingService;
use OCP\Files\File;
use OCP\IAppConfig;
use OCP\IL10N;
use OCP\IUserManager;
use OCP\L10N\IFactory;
use OCP\TaskProcessing\Exception\ProcessingException;
use OCP\TaskProcessing\ISynchronousWatermarkingProvider;
use OCP\TaskProcessing\ShapeEnumValue;
use Psr\Log\LoggerInterface;

class AudioToAudioTranslateProvider implements ISynchronousWatermarkingProvider {

/**
 * All collaborators are injected and stored as private promoted properties.
 *
 * $l10nFactory is used to list the selectable languages and to resolve a
 * user's language, while $l is the translator used for this provider's own
 * labels and for the AI notice when no user is given.
 */
public function __construct(
private OpenAiAPIService $openAiAPIService,
private TranslationService $translationService,
private OpenAiSettingsService $openAiSettingsService,
private WatermarkingService $watermarkingService,
private LoggerInterface $logger,
private IFactory $l10nFactory,
private IL10N $l,
private IAppConfig $appConfig,
private IUserManager $userManager,
) {
}

/**
 * Identifier of this provider: the app id followed by a fixed suffix.
 */
public function getId(): string {
	$providerSuffix = '-audio2audio:translate';
	return Application::APP_ID . $providerSuffix;
}

/**
 * Display name of this provider, taken from the API service's name for the speech-to-text service type.
 */
public function getName(): string {
	$serviceName = $this->openAiAPIService->getServiceName(Application::SERVICE_TYPE_STT);
	return $serviceName;
}

/**
 * Id of the task type this provider handles.
 */
public function getTaskTypeId(): string {
	$taskTypeId = AudioToAudioTranslateTaskType::ID;
	return $taskTypeId;
}

/**
 * Expected runtime of a task, as a fixed estimate.
 *
 * NOTE(review): the unit is presumably seconds; confirm against the provider interface.
 */
public function getExpectedRuntime(): int {
	$expectedRuntime = 60;
	return $expectedRuntime;
}

/**
 * Selectable values for the two language inputs.
 *
 * Both lists are built from the languages reported by the l10n factory;
 * 'origin_language' additionally starts with a "Detect language" entry.
 *
 * @return array with the keys 'origin_language' and 'target_language', each a list of ShapeEnumValue
 */
public function getInputShapeEnumValues(): array {
	$available = $this->l10nFactory->getLanguages();
	$toEnumValue = static fn (array $language) => new ShapeEnumValue($language['name'], $language['code']);
	$targetChoices = array_map(
		$toEnumValue,
		array_merge($available['commonLanguages'], $available['otherLanguages']),
	);

	// the origin language may additionally be left for the backend to detect
	$autoDetect = new ShapeEnumValue($this->l->t('Detect language'), 'detect_language');
	$originChoices = array_merge([$autoDetect], $targetChoices);

	return [
		'origin_language' => $originChoices,
		'target_language' => $targetChoices,
	];
}

/**
 * Default input values: the origin language defaults to 'detect_language'.
 * No default is given for the target language.
 */
public function getInputShapeDefaults(): array {
	$defaults = [];
	$defaults['origin_language'] = 'detect_language';
	return $defaults;
}


/**
 * Optional inputs accepted by this provider: none.
 */
public function getOptionalInputShape(): array {
	$noOptionalInputs = [];
	return $noOptionalInputs;
}
Comment on lines +79 to +81
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wdyt of adding the voice and speed optional params here too, same as the text-to-speech task?

/**
 * Reviewer-proposed variant of getOptionalInputShape(), quoted from the review comment.
 *
 * It declares three optional inputs: 'voice' and 'model' as enums and 'speed'
 * as a number. The 'speed' description mentions the valid range only when the
 * API service reports that OpenAI is used for the text-to-speech service type.
 *
 * NOTE(review): ShapeDescriptor and EShapeType are not among the provider's
 * `use` statements shown in this diff; they would have to be imported for this
 * variant to work.
 */
public function getOptionalInputShape(): array {
return [
'voice' => new ShapeDescriptor(
$this->l->t('Voice'),
$this->l->t('The voice to use'),
EShapeType::Enum
),
'model' => new ShapeDescriptor(
$this->l->t('Model'),
$this->l->t('The model used to generate the speech'),
EShapeType::Enum
),
'speed' => new ShapeDescriptor(
$this->l->t('Speed'),
$this->openAiAPIService->isUsingOpenAi(Application::SERVICE_TYPE_TTS)
? $this->l->t('Speech speed modifier (Valid values: 0.25-4)')
: $this->l->t('Speech speed modifier'),
EShapeType::Number
)
];
}


/**
 * Enum values for optional inputs: none.
 */
public function getOptionalInputShapeEnumValues(): array {
	$noEnumValues = [];
	return $noEnumValues;
}

/**
 * Default values for optional inputs: none.
 */
public function getOptionalInputShapeDefaults(): array {
	$noDefaults = [];
	return $noDefaults;
}

/**
 * Enum values for the outputs: none.
 */
public function getOutputShapeEnumValues(): array {
	$noEnumValues = [];
	return $noEnumValues;
}

/**
 * Optional outputs produced by this provider: none.
 */
public function getOptionalOutputShape(): array {
	$noOptionalOutputs = [];
	return $noOptionalOutputs;
}

/**
 * Enum values for optional outputs: none.
 */
public function getOptionalOutputShapeEnumValues(): array {
	$noEnumValues = [];
	return $noEnumValues;
}

/**
 * Run the audio-to-audio translation: speech-to-text, text translation, then text-to-speech.
 *
 * Progress is reported as 0.3 after the transcription, 0.6 after the translation
 * and 1.0 once the speech has been generated.
 *
 * @param string|null $userId user the requests are made for, if any
 * @param array $input must contain 'input' (a readable File) plus 'origin_language' and 'target_language' (strings)
 * @param callable $reportProgress invoked with the fraction of the work completed
 * @param bool $includeWatermark whether to append the AI notice to the spoken text and mark the generated audio
 * @return array 'audio_output' holds the generated speech, 'text_output' the translated text
 * @throws ProcessingException on invalid input or when one of the three stages fails
 */
public function process(?string $userId, array $input, callable $reportProgress, bool $includeWatermark = true): array {
	if (!isset($input['input']) || !$input['input'] instanceof File || !$input['input']->isReadable()) {
		throw new ProcessingException('Invalid input file');
	}
	$inputFile = $input['input'];

	if (!isset($input['origin_language']) || !is_string($input['origin_language'])) {
		throw new ProcessingException('Invalid origin_language input');
	}
	if (!isset($input['target_language']) || !is_string($input['target_language'])) {
		throw new ProcessingException('Invalid target_language input');
	}

	// STT
	$sttModel = $this->appConfig->getValueString(Application::APP_ID, 'default_stt_model_id', Application::DEFAULT_MODEL_ID, lazy: true) ?: Application::DEFAULT_MODEL_ID;
	try {
		$transcription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel, $input['origin_language']);
	} catch (Exception $e) {
		$this->logger->warning('Transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
		throw new ProcessingException(
			'Transcription failed with: ' . $e->getMessage(),
			// getCode() is not guaranteed to be an int for every exception subclass
			(int)$e->getCode(),
			$e,
		);
	}

	$reportProgress(0.3);

	// translate
	// a default model id is only enforced when OpenAI itself is used
	$completionModel = $this->openAiAPIService->isUsingOpenAi()
		? ($this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', Application::DEFAULT_MODEL_ID, lazy: true) ?: Application::DEFAULT_MODEL_ID)
		: $this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', lazy: true);
	$maxTokens = $this->openAiSettingsService->getMaxTokens();

	try {
		$translatedText = $this->translationService->translate(
			$transcription, $input['origin_language'], $input['target_language'], $completionModel, $maxTokens, $userId,
		);
	} catch (Exception $e) {
		throw new ProcessingException(
			"Failed to translate from {$input['origin_language']} to {$input['target_language']}: {$e->getMessage()}",
			(int)$e->getCode(),
			$e,
		);
	}

	// checked outside the try block so this exception is not caught and wrapped into itself;
	// compared against '' because empty() would also reject a legitimate "0"
	if ($translatedText === '') {
		throw new ProcessingException("Empty translation result from {$input['origin_language']} to {$input['target_language']}");
	}

	$reportProgress(0.6);

	// TTS
	$ttsPrompt = $translatedText;
	if ($includeWatermark) {
		// NOTE(review): the notice is appended in the user's language (or this provider's
		// translator when there is no user), which may or may not be the target language.
		// It is only part of the spoken text: 'text_output' is returned without it.
		// A reviewer suggested watermarking the transcript before translating instead, so
		// that the translated text and audio both carry the notice in the target language.
		if ($userId !== null) {
			$user = $this->userManager->getExistingUser($userId);
			$lang = $this->l10nFactory->getUserLanguage($user);
			$l = $this->l10nFactory->get(Application::APP_ID, $lang);
			$ttsPrompt .= "\n\n" . $l->t('This was generated using Artificial Intelligence.');
		} else {
			$ttsPrompt .= "\n\n" . $this->l->t('This was generated using Artificial Intelligence.');
		}
	}

	$ttsModel = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_model_id', Application::DEFAULT_SPEECH_MODEL_ID, lazy: true) ?: Application::DEFAULT_SPEECH_MODEL_ID;
	$voice = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_voice', Application::DEFAULT_SPEECH_VOICE, lazy: true) ?: Application::DEFAULT_SPEECH_VOICE;
	$speed = 1;
	try {
		$apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $ttsPrompt, $ttsModel, $voice, $speed);
		$speech = $apiResponse['body'] ?? null;
		$translatedAudio = ($speech !== null && $includeWatermark)
			? $this->watermarkingService->markAudio($speech)
			: $speech;
	} catch (Exception $e) {
		$this->logger->warning('Text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]);
		throw new ProcessingException(
			'Text to speech generation failed with: ' . $e->getMessage(),
			(int)$e->getCode(),
			$e,
		);
	}

	// reported outside the try block so it is logged once and not re-wrapped by the catch above
	if ($speech === null) {
		$this->logger->warning('Text to speech generation failed: no speech returned');
		throw new ProcessingException('Text to speech generation failed: no speech returned');
	}

	$reportProgress(1.0);

	// Translation
	return [
		'audio_output' => $translatedAudio,
		'text_output' => $translatedText,
	];
}
}
Loading
Loading