diff --git a/src/backend/src/data/hardcoded-permissions.js b/src/backend/src/data/hardcoded-permissions.js
index e9bc3e67..d615ad5f 100644
--- a/src/backend/src/data/hardcoded-permissions.js
+++ b/src/backend/src/data/hardcoded-permissions.js
@@ -25,6 +25,7 @@ const default_implicit_user_app_permissions = {
     'driver:puter-image-generation': {},
     'driver:puter-video-generation': {},
     'driver:puter-tts': {},
+    'driver:puter-speech2speech': {},
     'driver:puter-speech2txt': {},
     'driver:puter-apps': {},
     'driver:puter-subdomains': {},
@@ -61,6 +62,7 @@ const implicit_user_app_permissions = [
             'driver:puter-chat-completion:complete': {},
             'driver:puter-image-generation:generate': {},
             'driver:puter-video-generation:generate': {},
+            'driver:puter-speech2speech:convert': {},
             'driver:puter-speech2txt:transcribe': {},
             'driver:puter-speech2txt:translate': {},
             'driver:puter-analytics:create_trace': {},
diff --git a/src/backend/src/modules/puterai/AIInterfaceService.js b/src/backend/src/modules/puterai/AIInterfaceService.js
index eed23a7e..0efb09be 100644
--- a/src/backend/src/modules/puterai/AIInterfaceService.js
+++ b/src/backend/src/modules/puterai/AIInterfaceService.js
@@ -260,6 +260,37 @@ class AIInterfaceService extends BaseService {
             },
         });
 
+        col_interfaces.set('puter-speech2speech', {
+            description: 'Speech to speech voice conversion (voice changer).',
+            methods: {
+                convert: {
+                    description: 'Convert input audio to a target voice.',
+                    parameters: {
+                        audio: { type: 'file' },
+                        voice: { type: 'string', optional: true },
+                        voice_id: { type: 'string', optional: true },
+                        model: { type: 'string', optional: true },
+                        output_format: { type: 'string', optional: true },
+                        voice_settings: { type: 'json', optional: true },
+                        seed: { type: 'number', optional: true },
+                        remove_background_noise: { type: 'flag', optional: true },
+                        file_format: { type: 'string', optional: true },
+                        optimize_streaming_latency: { type: 'number', optional: true },
+                        enable_logging: { type: 'flag', optional: true },
+                    },
+                    result_choices: [
+                        {
+                            names: ['audio'],
+                            type: {
+                                $: 'stream',
+                                content_type: 'audio',
+                            },
+                        },
+                    ],
+                },
+            },
+        });
+
         col_interfaces.set('puter-speech2txt', {
             description: 'Speech to text transcription and translation.',
             methods: {
diff --git a/src/backend/src/modules/puterai/ElevenLabsVoiceChangerService.js b/src/backend/src/modules/puterai/ElevenLabsVoiceChangerService.js
new file mode 100644
index 00000000..31978c65
--- /dev/null
+++ b/src/backend/src/modules/puterai/ElevenLabsVoiceChangerService.js
@@ -0,0 +1,345 @@
+/*
+ * Copyright (C) 2024-present Puter Technologies Inc.
+ *
+ * This file is part of Puter.
+ *
+ * Puter is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+const { Readable } = require('stream');
+const APIError = require('../../api/APIError');
+const BaseService = require('../../services/BaseService');
+const { TypedValue } = require('../../services/drivers/meta/Runtime');
+const { FileFacade } = require('../../services/drivers/FileFacade');
+const { Context } = require('../../util/context');
+
+const DEFAULT_MODEL = 'eleven_multilingual_sts_v2';
+const DEFAULT_VOICE_ID = '21m00Tcm4TlvDq8ikWAM';
+const SAMPLE_AUDIO_URL = 'https://puter-sample-data.puter.site/tts_example.mp3';
+const MAX_AUDIO_FILE_SIZE = 25 * 1024 * 1024;
+const DEFAULT_OUTPUT_FORMAT = 'mp3_44100_128';
+
+/**
+ * ElevenLabs voice changer (speech-to-speech).
+ *
+ * Implements the `puter-speech2speech` driver interface by posting the
+ * caller's audio to the ElevenLabs speech-to-speech endpoint and returning
+ * the converted audio as a stream. Usage is metered per estimated second
+ * of input audio.
+ */
+class ElevenLabsVoiceChangerService extends BaseService {
+    /** @type {import('../../services/MeteringService/MeteringService').MeteringService} */
+    get meteringService () {
+        return this.services.get('meteringService').meteringService;
+    }
+
+    static MODULES = {
+        mime: require('mime-types'),
+        musicMetadata: require('music-metadata'),
+        path: require('path'),
+    };
+
+    static IMPLEMENTS = {
+        ['driver-capabilities']: {
+            supports_test_mode (iface, method_name) {
+                return iface === 'puter-speech2speech' && method_name === 'convert';
+            },
+        },
+        ['puter-speech2speech']: {
+            async convert (params) {
+                return this.convert(params);
+            },
+        },
+    };
+
+    /**
+     * Reads the ElevenLabs configuration and stores the API key, base URL,
+     * default voice and default model on the instance.
+     *
+     * @throws {Error} When no API key is configured.
+     */
+    async _init () {
+        const svcConfig = this.global_config?.services?.elevenlabs ??
+            this.config?.services?.elevenlabs ??
+            this.config?.elevenlabs;
+
+        this.apiKey = svcConfig?.apiKey ?? svcConfig?.api_key ?? svcConfig?.key;
+        this.baseUrl = svcConfig?.baseUrl ?? 'https://api.elevenlabs.io';
+        this.defaultVoiceId = svcConfig?.defaultVoiceId ?? svcConfig?.voiceId ?? DEFAULT_VOICE_ID;
+        this.defaultModelId = svcConfig?.speechToSpeechModelId ?? svcConfig?.stsModelId ?? DEFAULT_MODEL;
+
+        if ( !this.apiKey ) {
+            throw new Error('ElevenLabs API key not configured');
+        }
+    }
+
+    /**
+     * Converts the supplied audio to the selected voice.
+     *
+     * @param {object} params Driver arguments. `audio` is required and must
+     *   be a FileFacade. The voice falls back to the configured default and
+     *   the model to the configured speech-to-speech model.
+     * @returns {Promise<TypedValue>} A web URL to sample audio when
+     *   `test_mode` is set, otherwise a stream of the converted audio.
+     * @throws {APIError} `field_required` or `field_invalid` for bad input,
+     *   `file_too_large` for oversized audio, `insufficient_funds` when the
+     *   metering check fails, and `internal_server_error` when ElevenLabs
+     *   responds with a non-OK status.
+     */
+    async convert (params) {
+        const {
+            audio,
+            voice,
+            voice_id,
+            voiceId,
+            model,
+            model_id,
+            voice_settings,
+            voiceSettings,
+            seed,
+            remove_background_noise,
+            output_format,
+            file_format,
+            optimize_streaming_latency,
+            enable_logging,
+            test_mode,
+        } = params ?? {};
+
+        if ( test_mode ) {
+            return new TypedValue({
+                $: 'string:url:web',
+                content_type: 'audio',
+            }, SAMPLE_AUDIO_URL);
+        }
+
+        if ( !audio ) {
+            throw APIError.create('field_required', null, { key: 'audio' });
+        }
+
+        if ( !(audio instanceof FileFacade) ) {
+            throw APIError.create('field_invalid', null, {
+                key: 'audio',
+                expected: 'file reference',
+            });
+        }
+
+        const {
+            buffer,
+            filename,
+            mimeType,
+            estimatedSeconds,
+        } = await this._prepareAudioBuffer(audio);
+
+        const modelId = model_id || model || this.defaultModelId || DEFAULT_MODEL;
+        const selectedVoiceId = voice_id || voiceId || voice || this.defaultVoiceId;
+
+        if ( !selectedVoiceId ) {
+            throw APIError.create('field_required', null, { key: 'voice' });
+        }
+
+        // The voice ID becomes a URL path segment. Require a string and reject
+        // dot segments so that caller input cannot steer the authenticated
+        // request to a different ElevenLabs endpoint.
+        if ( typeof selectedVoiceId !== 'string' || /^\.{1,2}$/.test(selectedVoiceId) ) {
+            throw APIError.create('field_invalid', null, {
+                key: 'voice',
+                expected: 'voice ID',
+            });
+        }
+
+        const actor = Context.get('actor');
+        // NOTE(review): the usage key embeds the caller-selected model ID;
+        // confirm how metering treats models missing from the cost map.
+        const usageKey = `elevenlabs:${modelId}:second`;
+        const usageAllowed = await this.meteringService.hasEnoughCreditsFor(actor, usageKey, estimatedSeconds);
+        if ( !usageAllowed ) {
+            throw APIError.create('insufficient_funds');
+        }
+
+        const formData = new FormData();
+        const blob = new Blob([buffer], { type: mimeType || 'application/octet-stream' });
+        formData.append('audio', blob, filename);
+        formData.append('model_id', modelId);
+
+        const mergedVoiceSettings = voice_settings ?? voiceSettings;
+        if ( mergedVoiceSettings !== undefined && mergedVoiceSettings !== null ) {
+            const serializedSettings = typeof mergedVoiceSettings === 'string'
+                ? mergedVoiceSettings
+                : JSON.stringify(mergedVoiceSettings);
+            formData.append('voice_settings', serializedSettings);
+        }
+
+        if ( seed !== undefined && seed !== null ) {
+            formData.append('seed', seed);
+        }
+
+        if ( typeof remove_background_noise === 'boolean' ) {
+            formData.append('remove_background_noise', String(remove_background_noise));
+        }
+
+        if ( file_format ) {
+            formData.append('file_format', file_format);
+        }
+
+        const searchParams = new URLSearchParams();
+        const desiredOutputFormat = output_format || DEFAULT_OUTPUT_FORMAT;
+        if ( desiredOutputFormat ) {
+            searchParams.set('output_format', desiredOutputFormat);
+        }
+        if ( optimize_streaming_latency !== undefined && optimize_streaming_latency !== null ) {
+            searchParams.set('optimize_streaming_latency', optimize_streaming_latency);
+        }
+        if ( enable_logging !== undefined && enable_logging !== null ) {
+            searchParams.set('enable_logging', enable_logging);
+        }
+
+        // Percent-encode the ID so that '/', '?' and '#' stay inside the segment.
+        const url = new URL(`/v1/speech-to-speech/${encodeURIComponent(selectedVoiceId)}`, this.baseUrl);
+        const search = searchParams.toString();
+        if ( search ) {
+            url.search = search;
+        }
+
+        const response = await fetch(url, {
+            method: 'POST',
+            headers: {
+                'xi-api-key': this.apiKey,
+            },
+            body: formData,
+        });
+
+        if ( !response.ok ) {
+            let detail = null;
+            try {
+                detail = await response.json();
+            } catch ( e ) {
+                // The error body is not JSON; log the status alone.
+            }
+            this.log.error('ElevenLabs voice changer request failed', {
+                status: response.status,
+                detail,
+            });
+            throw APIError.create('internal_server_error', null, {
+                provider: 'elevenlabs',
+                status: response.status,
+            });
+        }
+
+        const arrayBuffer = await response.arrayBuffer();
+        const responseBuffer = Buffer.from(arrayBuffer);
+        const stream = Readable.from(responseBuffer);
+
+        this.meteringService.incrementUsage(actor, usageKey, estimatedSeconds);
+
+        return new TypedValue({
+            $: 'stream',
+            content_type: response.headers.get('content-type') || 'audio/mpeg',
+        }, stream);
+    }
+
+    /**
+     * Loads the audio into memory and derives the upload metadata.
+     *
+     * @param {FileFacade} file Audio input.
+     * @returns {Promise} Resolves to `buffer`, `filename` (an extension is
+     *   appended when it has none), `mimeType` (undefined when it cannot be
+     *   determined) and `estimatedSeconds` (whole seconds, at least 1).
+     * @throws {APIError} `field_invalid` for empty audio and `file_too_large`
+     *   when the buffer exceeds MAX_AUDIO_FILE_SIZE.
+     */
+    async _prepareAudioBuffer (file) {
+        const buffer = await file.get('buffer');
+        if ( !buffer || !buffer.length ) {
+            throw APIError.create('field_invalid', null, {
+                key: 'audio',
+                expected: 'non-empty audio file',
+            });
+        }
+
+        if ( buffer.length > MAX_AUDIO_FILE_SIZE ) {
+            throw APIError.create('file_too_large', null, {
+                max_size: MAX_AUDIO_FILE_SIZE,
+            });
+        }
+
+        let filename = 'audio';
+        let mimeType;
+
+        const pathValue = await file.get('path');
+        if ( pathValue ) {
+            filename = this.modules.path.basename(pathValue);
+        } else {
+            const url = await file.get('web_url');
+            if ( url ) {
+                try {
+                    const parsed = new URL(url);
+                    const candidate = this.modules.path.basename(parsed.pathname);
+                    if ( candidate ) filename = candidate;
+                } catch (_) {
+                    // Ignore URL parsing errors; we'll fall back to defaults.
+                }
+            }
+        }
+
+        const dataUrl = await file.get('data_url');
+        if ( dataUrl ) {
+            const match = /^data:([^;,]+)[;,]/.exec(dataUrl);
+            if ( match ) {
+                mimeType = match[1];
+            }
+        }
+
+        if ( ! mimeType ) {
+            const guessedMime = this.modules.mime.lookup(filename);
+            if ( guessedMime ) {
+                mimeType = guessedMime;
+            }
+        }
+
+        if ( ! filename.includes('.') ) {
+            const extension = mimeType ? this.modules.mime.extension(mimeType) : 'mp3';
+            filename = `${filename}.${extension || 'mp3'}`;
+        }
+
+        // Fallback when metadata parsing yields no duration: assume roughly
+        // 16000 bytes of audio per second.
+        let estimatedSeconds = Math.ceil(buffer.length / 16000);
+        try {
+            const metadata = await this.modules.musicMetadata.parseBuffer(buffer, {
+                mimeType,
+                size: buffer.length,
+            });
+            if ( metadata?.format?.duration ) {
+                estimatedSeconds = Math.ceil(metadata.format.duration);
+            }
+        } catch (e) {
+            if ( process.env.DEBUG_AUDIO_METADATA === '1' ) {
+                console.warn('Failed to parse audio metadata for duration estimation:', e.message);
+            }
+        }
+
+        estimatedSeconds = Math.max(1, estimatedSeconds);
+
+        return {
+            buffer,
+            filename,
+            mimeType,
+            estimatedSeconds,
+        };
+    }
+}
+
+module.exports = {
+    ElevenLabsVoiceChangerService,
+};
diff --git a/src/backend/src/modules/puterai/PuterAIModule.js b/src/backend/src/modules/puterai/PuterAIModule.js
index d2dc35a1..9ac03472 100644
--- a/src/backend/src/modules/puterai/PuterAIModule.js
+++ b/src/backend/src/modules/puterai/PuterAIModule.js
@@ -58,6 +58,9 @@ class PuterAIModule extends AdvancedBase {
         if ( config?.services?.['elevenlabs'] || config?.elevenlabs ) {
             const { ElevenLabsTTSService } = require('./ElevenLabsTTSService');
             services.registerService('elevenlabs-tts', ElevenLabsTTSService);
+
+            const { ElevenLabsVoiceChangerService } = require('./ElevenLabsVoiceChangerService');
+            services.registerService('elevenlabs-voice-changer', ElevenLabsVoiceChangerService);
         }
 
         if ( config?.services?.openai || config?.openai ) {
diff --git a/src/backend/src/services/MeteringService/costMaps/elevenlabsCostMap.ts b/src/backend/src/services/MeteringService/costMaps/elevenlabsCostMap.ts
index f3be1f4e..0562b0ce 100644
--- a/src/backend/src/services/MeteringService/costMaps/elevenlabsCostMap.ts
+++ b/src/backend/src/services/MeteringService/costMaps/elevenlabsCostMap.ts
@@ -10,4 +10,6 @@ export const ELEVENLABS_COST_MAP = {
     'elevenlabs:eleven_turbo_v2_5:character': 11,
     'elevenlabs:eleven_flash_v2_5:character': 5.5,
     'elevenlabs:eleven_v3:character': 11,
+    'elevenlabs:eleven_multilingual_sts_v2:second': 11,
+    'elevenlabs:eleven_english_sts_v2:second': 11,
 };
diff --git a/src/backend/src/services/drivers/DriverService.js b/src/backend/src/services/drivers/DriverService.js
index 81d78274..7f7adeba 100644
--- a/src/backend/src/services/drivers/DriverService.js
+++ b/src/backend/src/services/drivers/DriverService.js
@@ -287,6 +287,7 @@ class DriverService extends BaseService {
             const iface_to_driver = {
                 ['puter-ocr']: 'aws-textract',
                 ['puter-tts']: 'aws-polly',
+                ['puter-speech2speech']: 'elevenlabs-voice-changer',
                 ['puter-speech2txt']: 'openai-speech2txt',
                 ['puter-chat-completion']: 'openai-completion',
                 ['puter-image-generation']: 'openai-image-generation',
diff --git a/src/puter-js/src/modules/AI.js b/src/puter-js/src/modules/AI.js
index de5ff201..13f58db7 100644
--- a/src/puter-js/src/modules/AI.js
+++ b/src/puter-js/src/modules/AI.js
@@ -386,6 +386,132 @@ class AI {
         }).call(this, options);
     };
 
+    /**
+     * Converts the speech in an audio input to a different voice.
+     *
+     * Call forms: `(audio, [options], [testMode])`, `(audio, testMode)` or
+     * `(options, [testMode])`, where `options.audio` (or `options.file`)
+     * holds the input. Blob inputs are converted to data URIs first.
+     *
+     * @param {...*} args See the call forms above.
+     * @returns {Promise<*>} Result of the driver call; the `transform`
+     *   callback maps the response to an Audio element whose
+     *   toString()/valueOf() return its URL.
+     * @throws {{message: string, code: string}} When arguments or audio are
+     *   missing, or a data-URI input decodes to more than 25 MB.
+     */
+    speech2speech = async (...args) => {
+        const MAX_INPUT_SIZE = 25 * 1024 * 1024;
+        if ( !args || !args.length ) {
+            throw ({ message: 'Arguments are required', code: 'arguments_required' });
+        }
+
+        const normalizeSource = async (value) => {
+            if ( value instanceof Blob ) {
+                return await utils.blobToDataUri(value);
+            }
+            return value;
+        };
+
+        const normalizeOptions = (opts = {}) => {
+            const normalized = { ...opts };
+            if ( normalized.voiceId && !normalized.voice && !normalized.voice_id ) normalized.voice = normalized.voiceId;
+            if ( normalized.modelId && !normalized.model && !normalized.model_id ) normalized.model = normalized.modelId;
+            if ( normalized.outputFormat && !normalized.output_format ) normalized.output_format = normalized.outputFormat;
+            if ( normalized.voiceSettings && !normalized.voice_settings ) normalized.voice_settings = normalized.voiceSettings;
+            if ( normalized.fileFormat && !normalized.file_format ) normalized.file_format = normalized.fileFormat;
+            if ( normalized.removeBackgroundNoise !== undefined && normalized.remove_background_noise === undefined ) {
+                normalized.remove_background_noise = normalized.removeBackgroundNoise;
+            }
+            if ( normalized.optimizeStreamingLatency !== undefined && normalized.optimize_streaming_latency === undefined ) {
+                normalized.optimize_streaming_latency = normalized.optimizeStreamingLatency;
+            }
+            if ( normalized.enableLogging !== undefined && normalized.enable_logging === undefined ) {
+                normalized.enable_logging = normalized.enableLogging;
+            }
+            delete normalized.voiceId;
+            delete normalized.modelId;
+            delete normalized.outputFormat;
+            delete normalized.voiceSettings;
+            delete normalized.fileFormat;
+            delete normalized.removeBackgroundNoise;
+            delete normalized.optimizeStreamingLatency;
+            delete normalized.enableLogging;
+            return normalized;
+        };
+
+        let options = {};
+        let testMode = false;
+
+        const primary = args[0];
+        if ( primary && typeof primary === 'object' && !Array.isArray(primary) && !(primary instanceof Blob) ) {
+            options = { ...primary };
+        } else {
+            options.audio = await normalizeSource(primary);
+        }
+
+        if ( args[1] && typeof args[1] === 'object' && !Array.isArray(args[1]) && !(args[1] instanceof Blob) ) {
+            options = { ...options, ...args[1] };
+        } else if ( typeof args[1] === 'boolean' ) {
+            testMode = args[1];
+        }
+
+        if ( typeof args[2] === 'boolean' ) {
+            testMode = args[2];
+        }
+
+        if ( options.file ) {
+            options.audio = await normalizeSource(options.file);
+            delete options.file;
+        }
+
+        if ( options.audio instanceof Blob ) {
+            options.audio = await normalizeSource(options.audio);
+        }
+
+        if ( ! options.audio ) {
+            throw { message: 'Audio input is required', code: 'audio_required' };
+        }
+
+        if ( typeof options.audio === 'string' && options.audio.startsWith('data:') ) {
+            const base64 = options.audio.split(',')[1] || '';
+            const padding = base64.endsWith('==') ? 2 : (base64.endsWith('=') ? 1 : 0);
+            const byteLength = Math.floor((base64.length * 3) / 4) - padding;
+            if ( byteLength > MAX_INPUT_SIZE ) {
+                throw { message: 'Input size cannot be larger than 25 MB', code: 'input_too_large' };
+            }
+        }
+
+        const driverArgs = normalizeOptions({ ...options });
+        delete driverArgs.provider;
+
+        return await utils.make_driver_method(['audio'], 'puter-speech2speech', 'elevenlabs-voice-changer', 'convert', {
+            responseType: 'blob',
+            test_mode: testMode,
+            transform: async (result) => {
+                let url;
+                if ( typeof result === 'string' ) {
+                    url = result;
+                } else if ( result instanceof Blob ) {
+                    url = await utils.blob_to_url(result);
+                } else if ( result instanceof ArrayBuffer ) {
+                    const blob = new Blob([result]);
+                    url = await utils.blob_to_url(blob);
+                } else if ( result && typeof result === 'object' && typeof result.arrayBuffer === 'function' ) {
+                    const arrayBuffer = await result.arrayBuffer();
+                    const blob = new Blob([arrayBuffer], { type: result.type || undefined });
+                    url = await utils.blob_to_url(blob);
+                } else {
+                    throw { message: 'Unexpected audio response format', code: 'invalid_audio_response' };
+                }
+                const audio = new Audio(url);
+                audio.toString = () => url;
+                audio.valueOf = () => url;
+                return audio;
+            },
+        }).call(this, driverArgs);
+    };
+
     speech2txt = async (...args) => {
         const MAX_INPUT_SIZE = 25 * 1024 * 1024;
         if ( !args || !args.length ) {