Add 11labs speech-to-speech voice changer integration (#2006)

2026-01-06 21:20:27 -06:00 · 2025-11-21 23:24:43 -08:00
parent 262b359633
commit 133e3dc77c
7 changed files with 447 additions and 0 deletions
--- a/src/backend/src/data/hardcoded-permissions.js
+++ b/src/backend/src/data/hardcoded-permissions.js
@@ -25,6 +25,7 @@ const default_implicit_user_app_permissions = {
    'driver:puter-image-generation': {},
    'driver:puter-video-generation': {},
    'driver:puter-tts': {},
+    'driver:puter-speech2speech': {},
    'driver:puter-speech2txt': {},
    'driver:puter-apps': {},
    'driver:puter-subdomains': {},
@@ -61,6 +62,7 @@ const implicit_user_app_permissions = [
            'driver:puter-chat-completion:complete': {},
            'driver:puter-image-generation:generate': {},
            'driver:puter-video-generation:generate': {},
+            'driver:puter-speech2speech:convert': {},
            'driver:puter-speech2txt:transcribe': {},
            'driver:puter-speech2txt:translate': {},
            'driver:puter-analytics:create_trace': {},
--- a/src/backend/src/modules/puterai/AIInterfaceService.js
+++ b/src/backend/src/modules/puterai/AIInterfaceService.js
@@ -260,6 +260,37 @@ class AIInterfaceService extends BaseService {
            },
        });

+        // Speech-to-speech ("voice changer") interface. It exposes a single
+        // `convert` method: `audio` is the only required parameter, every
+        // other parameter is declared optional, and the declared result is
+        // an audio stream.
+        col_interfaces.set('puter-speech2speech', {
+            description: 'Speech to speech voice conversion (voice changer).',
+            methods: {
+                convert: {
+                    description: 'Convert input audio to a target voice.',
+                    parameters: {
+                        // Source recording to convert.
+                        audio: { type: 'file' },
+                        // `voice` and `voice_id` are both declared so callers
+                        // may use either spelling for the target voice.
+                        voice: { type: 'string', optional: true },
+                        voice_id: { type: 'string', optional: true },
+                        model: { type: 'string', optional: true },
+                        output_format: { type: 'string', optional: true },
+                        voice_settings: { type: 'json', optional: true },
+                        seed: { type: 'number', optional: true },
+                        remove_background_noise: { type: 'flag', optional: true },
+                        file_format: { type: 'string', optional: true },
+                        optimize_streaming_latency: { type: 'number', optional: true },
+                        enable_logging: { type: 'flag', optional: true },
+                    },
+                    result_choices: [
+                        {
+                            names: ['audio'],
+                            type: {
+                                $: 'stream',
+                                content_type: 'audio',
+                            },
+                        },
+                    ],
+                },
+            },
+        });
+
        col_interfaces.set('puter-speech2txt', {
            description: 'Speech to text transcription and translation.',
            methods: {
--- a/src/backend/src/modules/puterai/ElevenLabsVoiceChangerService.js
+++ b/src/backend/src/modules/puterai/ElevenLabsVoiceChangerService.js
@@ -0,0 +1,296 @@
+/*
+ * Copyright (C) 2024-present Puter Technologies Inc.
+ *
+ * This file is part of Puter.
+ *
+ * Puter is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+const { Readable } = require('stream');
+const APIError = require('../../api/APIError');
+const BaseService = require('../../services/BaseService');
+const { TypedValue } = require('../../services/drivers/meta/Runtime');
+const { FileFacade } = require('../../services/drivers/FileFacade');
+const { Context } = require('../../util/context');
+
+// Model id used when neither the caller nor the service config names one.
+const DEFAULT_MODEL = 'eleven_multilingual_sts_v2';
+// Target voice id used when neither the caller nor the service config names one.
+const DEFAULT_VOICE_ID = '21m00Tcm4TlvDq8ikWAM';
+// URL handed back instead of a real conversion when `test_mode` is set.
+const SAMPLE_AUDIO_URL = 'https://puter-sample-data.puter.site/tts_example.mp3';
+// Largest accepted input audio buffer, in bytes (25 MiB).
+const MAX_AUDIO_FILE_SIZE = 25 * 1024 * 1024;
+// Value sent as the `output_format` query parameter when the caller gives none.
+const DEFAULT_OUTPUT_FORMAT = 'mp3_44100_128';
+
+/**
+ * ElevenLabs voice changer (speech-to-speech).
+ */
+class ElevenLabsVoiceChangerService extends BaseService {
+    /** @type {import('../../services/MeteringService/MeteringService').MeteringService} */
+    get meteringService () {
+        return this.services.get('meteringService').meteringService;
+    }
+
+    static MODULES = {
+        mime: require('mime-types'),
+        musicMetadata: require('music-metadata'),
+        path: require('path'),
+    };
+
+    static IMPLEMENTS = {
+        ['driver-capabilities']: {
+            supports_test_mode (iface, method_name) {
+                return iface === 'puter-speech2speech' && method_name === 'convert';
+            },
+        },
+        ['puter-speech2speech']: {
+            async convert (params) {
+                return this.convert(params);
+            },
+        },
+    };
+
+    async _init () {
+        const svcConfig = this.global_config?.services?.elevenlabs ??
+            this.config?.services?.elevenlabs ??
+            this.config?.elevenlabs;
+
+        this.apiKey = svcConfig?.apiKey ?? svcConfig?.api_key ?? svcConfig?.key;
+        this.baseUrl = svcConfig?.baseUrl ?? 'https://api.elevenlabs.io';
+        this.defaultVoiceId = svcConfig?.defaultVoiceId ?? svcConfig?.voiceId ?? DEFAULT_VOICE_ID;
+        this.defaultModelId = svcConfig?.speechToSpeechModelId ?? svcConfig?.stsModelId ?? DEFAULT_MODEL;
+
+        if ( !this.apiKey ) {
+            throw new Error('ElevenLabs API key not configured');
+        }
+    }
+
+    async convert (params) {
+        const {
+            audio,
+            voice,
+            voice_id,
+            voiceId,
+            model,
+            model_id,
+            voice_settings,
+            voiceSettings,
+            seed,
+            remove_background_noise,
+            output_format,
+            file_format,
+            optimize_streaming_latency,
+            enable_logging,
+            test_mode,
+        } = params ?? {};
+
+        if ( test_mode ) {
+            return new TypedValue({
+                $: 'string:url:web',
+                content_type: 'audio',
+            }, SAMPLE_AUDIO_URL);
+        }
+
+        if ( !audio ) {
+            throw APIError.create('field_required', null, { key: 'audio' });
+        }
+
+        if ( !(audio instanceof FileFacade) ) {
+            throw APIError.create('field_invalid', null, {
+                key: 'audio',
+                expected: 'file reference',
+            });
+        }
+
+        const {
+            buffer,
+            filename,
+            mimeType,
+            estimatedSeconds,
+        } = await this._prepareAudioBuffer(audio);
+
+        const modelId = model_id || model || this.defaultModelId || DEFAULT_MODEL;
+        const selectedVoiceId = voice_id || voiceId || voice || this.defaultVoiceId;
+
+        if ( !selectedVoiceId ) {
+            throw APIError.create('field_required', null, { key: 'voice' });
+        }
+
+        const actor = Context.get('actor');
+        const usageKey = `elevenlabs:${modelId}:second`;
+        const usageAllowed = await this.meteringService.hasEnoughCreditsFor(actor, usageKey, estimatedSeconds);
+        if ( !usageAllowed ) {
+            throw APIError.create('insufficient_funds');
+        }
+
+        const formData = new FormData();
+        const blob = new Blob([buffer], { type: mimeType || 'application/octet-stream' });
+        formData.append('audio', blob, filename);
+        formData.append('model_id', modelId);
+
+        const mergedVoiceSettings = voice_settings ?? voiceSettings;
+        if ( mergedVoiceSettings !== undefined && mergedVoiceSettings !== null ) {
+            const serializedSettings = typeof mergedVoiceSettings === 'string'
+                ? mergedVoiceSettings
+                : JSON.stringify(mergedVoiceSettings);
+            formData.append('voice_settings', serializedSettings);
+        }
+
+        if ( seed !== undefined && seed !== null ) {
+            formData.append('seed', seed);
+        }
+
+        if ( typeof remove_background_noise === 'boolean' ) {
+            formData.append('remove_background_noise', String(remove_background_noise));
+        }
+
+        if ( file_format ) {
+            formData.append('file_format', file_format);
+        }
+
+        const searchParams = new URLSearchParams();
+        const desiredOutputFormat = output_format || DEFAULT_OUTPUT_FORMAT;
+        if ( desiredOutputFormat ) {
+            searchParams.set('output_format', desiredOutputFormat);
+        }
+        if ( optimize_streaming_latency !== undefined && optimize_streaming_latency !== null ) {
+            searchParams.set('optimize_streaming_latency', optimize_streaming_latency);
+        }
+        if ( enable_logging !== undefined && enable_logging !== null ) {
+            searchParams.set('enable_logging', enable_logging);
+        }
+
+        const url = new URL(`/v1/speech-to-speech/${selectedVoiceId}`, this.baseUrl);
+        const search = searchParams.toString();
+        if ( search ) {
+            url.search = search;
+        }
+
+        const response = await fetch(url, {
+            method: 'POST',
+            headers: {
+                'xi-api-key': this.apiKey,
+            },
+            body: formData,
+        });
+
+        if ( !response.ok ) {
+            let detail = null;
+            try {
+                detail = await response.json();
+            } catch ( e ) {
+                // ignore
+            }
+            this.log.error('ElevenLabs voice changer request failed', {
+                status: response.status,
+                detail,
+            });
+            throw APIError.create('internal_server_error', null, {
+                provider: 'elevenlabs',
+                status: response.status,
+            });
+        }
+
+        const arrayBuffer = await response.arrayBuffer();
+        const responseBuffer = Buffer.from(arrayBuffer);
+        const stream = Readable.from(responseBuffer);
+
+        this.meteringService.incrementUsage(actor, usageKey, estimatedSeconds);
+
+        return new TypedValue({
+            $: 'stream',
+            content_type: response.headers.get('content-type') || 'audio/mpeg',
+        }, stream);
+    }
+
+    async _prepareAudioBuffer (file) {
+        const buffer = await file.get('buffer');
+        if ( !buffer || !buffer.length ) {
+            throw APIError.create('field_invalid', null, {
+                key: 'audio',
+                expected: 'non-empty audio file',
+            });
+        }
+
+        if ( buffer.length > MAX_AUDIO_FILE_SIZE ) {
+            throw APIError.create('file_too_large', null, {
+                max_size: MAX_AUDIO_FILE_SIZE,
+            });
+        }
+
+        let filename = 'audio';
+        let mimeType;
+
+        const pathValue = await file.get('path');
+        if ( pathValue ) {
+            filename = this.modules.path.basename(pathValue);
+        } else {
+            const url = await file.get('web_url');
+            if ( url ) {
+                try {
+                    const parsed = new URL(url);
+                    const candidate = this.modules.path.basename(parsed.pathname);
+                    if ( candidate ) filename = candidate;
+                } catch (_) {
+                    // Ignore URL parsing errors; we'll fall back to defaults.
+                }
+            }
+        }
+
+        const dataUrl = await file.get('data_url');
+        if ( dataUrl ) {
+            const match = /^data:([^;,]+)[;,]/.exec(dataUrl);
+            if ( match ) {
+                mimeType = match[1];
+            }
+        }
+
+        if ( ! mimeType ) {
+            const guessedMime = this.modules.mime.lookup(filename);
+            if ( guessedMime ) {
+                mimeType = guessedMime;
+            }
+        }
+
+        if ( ! filename.includes('.') ) {
+            const extension = mimeType ? this.modules.mime.extension(mimeType) : 'mp3';
+            filename = `${filename}.${extension || 'mp3'}`;
+        }
+
+        let estimatedSeconds = Math.ceil(buffer.length / 16000);
+        try {
+            const metadata = await this.modules.musicMetadata.parseBuffer(buffer, {
+                mimeType,
+                size: buffer.length,
+            });
+            if ( metadata?.format?.duration ) {
+                estimatedSeconds = Math.ceil(metadata.format.duration);
+            }
+        } catch (e) {
+            if ( process.env.DEBUG_AUDIO_METADATA === '1' ) {
+                console.warn('Failed to parse audio metadata for duration estimation:', e.message);
+            }
+        }
+
+        estimatedSeconds = Math.max(1, estimatedSeconds);
+
+        return {
+            buffer,
+            filename,
+            mimeType,
+            estimatedSeconds,
+        };
+    }
+}
+
+module.exports = {
+    ElevenLabsVoiceChangerService,
+};
--- a/src/backend/src/modules/puterai/PuterAIModule.js
+++ b/src/backend/src/modules/puterai/PuterAIModule.js
@@ -58,6 +58,9 @@ class PuterAIModule extends AdvancedBase {
        if ( config?.services?.['elevenlabs'] || config?.elevenlabs ) {
            const { ElevenLabsTTSService } = require('./ElevenLabsTTSService');
            services.registerService('elevenlabs-tts', ElevenLabsTTSService);
+
+            const { ElevenLabsVoiceChangerService } = require('./ElevenLabsVoiceChangerService');
+            services.registerService('elevenlabs-voice-changer', ElevenLabsVoiceChangerService);
        }

        if ( config?.services?.openai || config?.openai ) {
--- a/src/backend/src/services/MeteringService/costMaps/elevenlabsCostMap.ts
+++ b/src/backend/src/services/MeteringService/costMaps/elevenlabsCostMap.ts
@@ -10,4 +10,6 @@ export const ELEVENLABS_COST_MAP = {
    'elevenlabs:eleven_turbo_v2_5:character': 11,
    'elevenlabs:eleven_flash_v2_5:character': 5.5,
    'elevenlabs:eleven_v3:character': 11,
+    'elevenlabs:eleven_multilingual_sts_v2:second': 11,
+    'elevenlabs:eleven_english_sts_v2:second': 11,
 };
--- a/src/backend/src/services/drivers/DriverService.js
+++ b/src/backend/src/services/drivers/DriverService.js
@@ -287,6 +287,7 @@ class DriverService extends BaseService {
        const iface_to_driver = {
            ['puter-ocr']: 'aws-textract',
            ['puter-tts']: 'aws-polly',
+            ['puter-speech2speech']: 'elevenlabs-voice-changer',
            ['puter-speech2txt']: 'openai-speech2txt',
            ['puter-chat-completion']: 'openai-completion',
            ['puter-image-generation']: 'openai-image-generation',
--- a/src/puter-js/src/modules/AI.js
+++ b/src/puter-js/src/modules/AI.js
@@ -386,6 +386,118 @@ class AI {
        }).call(this, options);
    };

+    speech2speech = async (...args) => {
+        const MAX_INPUT_SIZE = 25 * 1024 * 1024;
+        if ( !args || !args.length ) {
+            throw ({ message: 'Arguments are required', code: 'arguments_required' });
+        }
+
+        const normalizeSource = async (value) => {
+            if ( value instanceof Blob ) {
+                return await utils.blobToDataUri(value);
+            }
+            return value;
+        };
+
+        const normalizeOptions = (opts = {}) => {
+            const normalized = { ...opts };
+            if ( normalized.voiceId && !normalized.voice && !normalized.voice_id ) normalized.voice = normalized.voiceId;
+            if ( normalized.modelId && !normalized.model && !normalized.model_id ) normalized.model = normalized.modelId;
+            if ( normalized.outputFormat && !normalized.output_format ) normalized.output_format = normalized.outputFormat;
+            if ( normalized.voiceSettings && !normalized.voice_settings ) normalized.voice_settings = normalized.voiceSettings;
+            if ( normalized.fileFormat && !normalized.file_format ) normalized.file_format = normalized.fileFormat;
+            if ( normalized.removeBackgroundNoise !== undefined && normalized.remove_background_noise === undefined ) {
+                normalized.remove_background_noise = normalized.removeBackgroundNoise;
+            }
+            if ( normalized.optimizeStreamingLatency !== undefined && normalized.optimize_streaming_latency === undefined ) {
+                normalized.optimize_streaming_latency = normalized.optimizeStreamingLatency;
+            }
+            if ( normalized.enableLogging !== undefined && normalized.enable_logging === undefined ) {
+                normalized.enable_logging = normalized.enableLogging;
+            }
+            delete normalized.voiceId;
+            delete normalized.modelId;
+            delete normalized.outputFormat;
+            delete normalized.voiceSettings;
+            delete normalized.fileFormat;
+            delete normalized.removeBackgroundNoise;
+            delete normalized.optimizeStreamingLatency;
+            delete normalized.enableLogging;
+            return normalized;
+        };
+
+        let options = {};
+        let testMode = false;
+
+        const primary = args[0];
+        if ( primary && typeof primary === 'object' && !Array.isArray(primary) && !(primary instanceof Blob) ) {
+            options = { ...primary };
+        } else {
+            options.audio = await normalizeSource(primary);
+        }
+
+        if ( args[1] && typeof args[1] === 'object' && !Array.isArray(args[1]) && !(args[1] instanceof Blob) ) {
+            options = { ...options, ...args[1] };
+        } else if ( typeof args[1] === 'boolean' ) {
+            testMode = args[1];
+        }
+
+        if ( typeof args[2] === 'boolean' ) {
+            testMode = args[2];
+        }
+
+        if ( options.file ) {
+            options.audio = await normalizeSource(options.file);
+            delete options.file;
+        }
+
+        if ( options.audio instanceof Blob ) {
+            options.audio = await normalizeSource(options.audio);
+        }
+
+        if ( ! options.audio ) {
+            throw { message: 'Audio input is required', code: 'audio_required' };
+        }
+
+        if ( typeof options.audio === 'string' && options.audio.startsWith('data:') ) {
+            const base64 = options.audio.split(',')[1] || '';
+            const padding = base64.endsWith('==') ? 2 : (base64.endsWith('=') ? 1 : 0);
+            const byteLength = Math.floor((base64.length * 3) / 4) - padding;
+            if ( byteLength > MAX_INPUT_SIZE ) {
+                throw { message: 'Input size cannot be larger than 25 MB', code: 'input_too_large' };
+            }
+        }
+
+        const driverArgs = normalizeOptions({ ...options });
+        delete driverArgs.provider;
+
+        return await utils.make_driver_method(['audio'], 'puter-speech2speech', 'elevenlabs-voice-changer', 'convert', {
+            responseType: 'blob',
+            test_mode: testMode,
+            transform: async (result) => {
+                let url;
+                if ( typeof result === 'string' ) {
+                    url = result;
+                } else if ( result instanceof Blob ) {
+                    url = await utils.blob_to_url(result);
+                } else if ( result instanceof ArrayBuffer ) {
+                    const blob = new Blob([result]);
+                    url = await utils.blob_to_url(blob);
+                } else if ( result && typeof result === 'object' && typeof result.arrayBuffer === 'function' ) {
+                    const arrayBuffer = await result.arrayBuffer();
+                    const blob = new Blob([arrayBuffer], { type: result.type || undefined });
+                    url = await utils.blob_to_url(blob);
+                } else {
+                    throw { message: 'Unexpected audio response format', code: 'invalid_audio_response' };
+                }
+                const audio = new Audio(url);
+                audio.toString = () => url;
+                audio.valueOf = () => url;
+                return audio;
+            },
+        }).call(this, driverArgs);
+    };
+
    speech2txt = async (...args) => {
        const MAX_INPUT_SIZE = 25 * 1024 * 1024;
        if ( !args || !args.length ) {