mirror of
https://github.com/HeyPuter/puter.git
synced 2026-01-10 07:01:51 -06:00
Add ElevenLabs TTS provider integration (#2004)
* Add ElevenLabs TTS provider integration Adds ElevenLabs text-to-speech support in the backend and frontend. Adds `ElevenLabsTTSService`, updates `PuterAIModule` to register the service, documents configuration, and integrates cost tracking with zero cost in `MeteringService` for now (todo). updates `AI.js` to support 11labs as a provider and adds related tests for `txt2speech` functionality. * Update 11labs cost map values
This commit is contained in:
@@ -1,10 +1,10 @@
|
||||
'use strict';
|
||||
"use strict";
|
||||
// Code generated by protoc-gen-ts_proto. DO NOT EDIT.
|
||||
// versions:
|
||||
// protoc-gen-ts_proto v2.8.0
|
||||
// protoc v3.21.12
|
||||
// source: fsentry.proto
|
||||
Object.defineProperty(exports, '__esModule', { value: true });
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.FSEntry = exports.protobufPackage = void 0;
|
||||
/* eslint-disable */
|
||||
const wire_1 = require("@bufbuild/protobuf/wire");
|
||||
|
||||
196
src/backend/src/modules/puterai/ElevenLabsTTSService.js
Normal file
196
src/backend/src/modules/puterai/ElevenLabsTTSService.js
Normal file
@@ -0,0 +1,196 @@
|
||||
/*
|
||||
* Copyright (C) 2024-present Puter Technologies Inc.
|
||||
*
|
||||
* This file is part of Puter.
|
||||
*
|
||||
* Puter is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published
|
||||
* by the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
const { Readable } = require('stream');
|
||||
const APIError = require('../../api/APIError');
|
||||
const BaseService = require('../../services/BaseService');
|
||||
const { TypedValue } = require('../../services/drivers/meta/Runtime');
|
||||
const { Context } = require('../../util/context');
|
||||
|
||||
// Model used when the caller does not name one.
const DEFAULT_MODEL = 'eleven_multilingual_v2';
// Voice used when neither the request nor the service config names one.
const DEFAULT_VOICE_ID = '21m00Tcm4TlvDq8ikWAM'; // Common public "Rachel" sample voice
// Audio format requested when the caller does not specify one.
const DEFAULT_OUTPUT_FORMAT = 'mp3_44100_128';
// Static clip returned by `synthesize` in test mode instead of calling the API.
const SAMPLE_AUDIO_URL = 'https://puter-sample-data.puter.site/tts_example.mp3';

// Models advertised by `list_engines` and attached to every listed voice.
// Frozen because the list is shared module state that is only ever read.
const ELEVENLABS_TTS_MODELS = Object.freeze([
    Object.freeze({ id: DEFAULT_MODEL, name: 'Eleven Multilingual v2' }),
    Object.freeze({ id: 'eleven_flash_v2_5', name: 'Eleven Flash v2.5' }),
    Object.freeze({ id: 'eleven_turbo_v2_5', name: 'Eleven Turbo v2.5' }),
    Object.freeze({ id: 'eleven_v3', name: 'Eleven v3 Alpha' }),
]);
|
||||
|
||||
/**
|
||||
* ElevenLabs text-to-speech provider.
|
||||
* Implements the `puter-tts` interface so the AI module can synthesize speech
|
||||
* using ElevenLabs voices.
|
||||
*/
|
||||
class ElevenLabsTTSService extends BaseService {
|
||||
/** @type {import('../../services/MeteringService/MeteringService').MeteringService} */
|
||||
get meteringService () {
|
||||
return this.services.get('meteringService').meteringService;
|
||||
}
|
||||
|
||||
static IMPLEMENTS = {
|
||||
['driver-capabilities']: {
|
||||
supports_test_mode (iface, method_name) {
|
||||
return iface === 'puter-tts' && method_name === 'synthesize';
|
||||
},
|
||||
},
|
||||
['puter-tts']: {
|
||||
async list_voices () {
|
||||
return this.listVoices();
|
||||
},
|
||||
async list_engines () {
|
||||
return this.listEngines();
|
||||
},
|
||||
async synthesize (params) {
|
||||
return this.synthesize(params);
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
async _init () {
|
||||
const svcThere = this.global_config?.services?.elevenlabs ?? this.config?.services?.elevenlabs ?? this.config?.elevenlabs;
|
||||
|
||||
this.apiKey = svcThere?.apiKey ?? svcThere?.api_key ?? svcThere?.key;
|
||||
this.baseUrl = svcThere?.baseUrl ?? 'https://api.elevenlabs.io';
|
||||
this.defaultVoiceId = svcThere?.defaultVoiceId ?? svcThere?.voiceId ?? DEFAULT_VOICE_ID;
|
||||
|
||||
if ( !this.apiKey ) {
|
||||
throw new Error('ElevenLabs API key not configured');
|
||||
}
|
||||
}
|
||||
|
||||
async request (path, { method = 'GET', body, headers = {} } = {}) {
|
||||
const response = await fetch(`${this.baseUrl}${path}`, {
|
||||
method,
|
||||
headers: {
|
||||
'xi-api-key': this.apiKey,
|
||||
...(body ? { 'Content-Type': 'application/json' } : {}),
|
||||
...headers,
|
||||
},
|
||||
body: body ? JSON.stringify(body) : undefined,
|
||||
});
|
||||
|
||||
if ( response.ok ) {
|
||||
return response;
|
||||
}
|
||||
|
||||
let detail = null;
|
||||
try {
|
||||
detail = await response.json();
|
||||
} catch ( e ) {
|
||||
// ignore
|
||||
}
|
||||
this.log.error('ElevenLabs request failed', { path, status: response.status, detail });
|
||||
throw APIError.create('internal_server_error', null, { provider: 'elevenlabs', status: response.status });
|
||||
}
|
||||
|
||||
async listVoices () {
|
||||
const res = await this.request('/v1/voices');
|
||||
const data = await res.json();
|
||||
const voices = Array.isArray(data?.voices) ? data.voices : Array.isArray(data) ? data : [];
|
||||
|
||||
return voices
|
||||
.map(voice => ({
|
||||
id: voice.voice_id || voice.voiceId || voice.id,
|
||||
name: voice.name,
|
||||
description: voice.description,
|
||||
category: voice.category,
|
||||
provider: 'elevenlabs',
|
||||
labels: voice.labels,
|
||||
supported_models: ELEVENLABS_TTS_MODELS.map(model => model.id),
|
||||
}))
|
||||
.filter(v => v.id && v.name);
|
||||
}
|
||||
|
||||
async listEngines () {
|
||||
return ELEVENLABS_TTS_MODELS.map(model => ({
|
||||
id: model.id,
|
||||
name: model.name,
|
||||
provider: 'elevenlabs',
|
||||
pricing_per_million_chars: 0,
|
||||
}));
|
||||
}
|
||||
|
||||
async synthesize (params) {
|
||||
const {
|
||||
text,
|
||||
voice,
|
||||
model,
|
||||
response_format,
|
||||
output_format,
|
||||
voice_settings,
|
||||
voiceSettings,
|
||||
test_mode,
|
||||
} = params;
|
||||
if ( test_mode ) {
|
||||
return new TypedValue({
|
||||
$: 'string:url:web',
|
||||
content_type: 'audio',
|
||||
}, SAMPLE_AUDIO_URL);
|
||||
}
|
||||
|
||||
if ( typeof text !== 'string' || !text.trim() ) {
|
||||
throw APIError.create('field_required', null, { key: 'text' });
|
||||
}
|
||||
|
||||
const voiceId = voice || this.defaultVoiceId;
|
||||
const modelId = model || DEFAULT_MODEL;
|
||||
const desiredFormat = output_format || response_format || DEFAULT_OUTPUT_FORMAT;
|
||||
|
||||
const actor = Context.get('actor');
|
||||
const usageKey = `elevenlabs:${modelId}:character`;
|
||||
const usageAllowed = await this.meteringService.hasEnoughCreditsFor(actor, usageKey, text.length);
|
||||
if ( !usageAllowed ) {
|
||||
throw APIError.create('insufficient_funds');
|
||||
}
|
||||
|
||||
const payload = {
|
||||
text,
|
||||
model_id: modelId,
|
||||
output_format: desiredFormat,
|
||||
};
|
||||
|
||||
const finalVoiceSettings = voice_settings ?? voiceSettings;
|
||||
if ( finalVoiceSettings ) {
|
||||
payload.voice_settings = finalVoiceSettings;
|
||||
}
|
||||
|
||||
const response = await this.request(`/v1/text-to-speech/${voiceId}`, {
|
||||
method: 'POST',
|
||||
body: payload,
|
||||
});
|
||||
|
||||
const arrayBuffer = await response.arrayBuffer();
|
||||
const buffer = Buffer.from(arrayBuffer);
|
||||
const stream = Readable.from(buffer);
|
||||
|
||||
this.meteringService.incrementUsage(actor, usageKey, text.length);
|
||||
|
||||
return new TypedValue({
|
||||
$: 'stream',
|
||||
content_type: response.headers.get('content-type') || 'audio/mpeg',
|
||||
}, stream);
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
ElevenLabsTTSService,
|
||||
};
|
||||
@@ -55,6 +55,11 @@ class PuterAIModule extends AdvancedBase {
|
||||
services.registerService('aws-polly', AWSPollyService);
|
||||
}
|
||||
|
||||
if ( config?.services?.['elevenlabs'] || config?.elevenlabs ) {
|
||||
const { ElevenLabsTTSService } = require('./ElevenLabsTTSService');
|
||||
services.registerService('elevenlabs-tts', ElevenLabsTTSService);
|
||||
}
|
||||
|
||||
if ( config?.services?.openai || config?.openai ) {
|
||||
const { OpenAICompletionServiceWrapper } = require('./OpenAiCompletionService/index.mjs');
|
||||
services.registerService('openai-completion', OpenAICompletionServiceWrapper);
|
||||
|
||||
@@ -9,6 +9,10 @@ AI services are configured under the `services` block in the configuration file.
|
||||
"openai": {
|
||||
"apiKey": "sk-abcdefg..."
|
||||
},
|
||||
"elevenlabs": {
|
||||
"apiKey": "eleven-api-key",
|
||||
"defaultVoiceId": "optional-voice-id"
|
||||
},
|
||||
"deepseek": {
|
||||
"apiKey": "sk-xyz123..."
|
||||
},
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
// ElevenLabs Text-to-Speech Cost Map
//
// Pricing for ElevenLabs voices varies by model and plan tier. Each entry maps
// a usage key of the form `elevenlabs:<model_id>:character` to the cost
// recorded per character of synthesized text for that model.
//
// NOTE(review): the values are presumably expressed in the same unit as the
// other cost maps (an earlier revision of this header spoke of micro-cents) —
// confirm the unit and the figures against current ElevenLabs pricing.

export const ELEVENLABS_COST_MAP = {
    'elevenlabs:eleven_multilingual_v2:character': 11,
    'elevenlabs:eleven_turbo_v2_5:character': 11,
    'elevenlabs:eleven_flash_v2_5:character': 5.5,
    'elevenlabs:eleven_v3:character': 11,
};
|
||||
@@ -13,12 +13,14 @@ import { OPENROUTER_COST_MAP } from './openrouterCostMap';
|
||||
import { OPENAI_VIDEO_COST_MAP } from './openaiVideoCostMap';
|
||||
import { TOGETHER_COST_MAP } from './togetherCostMap';
|
||||
import { XAI_COST_MAP } from './xaiCostMap';
|
||||
import { ELEVENLABS_COST_MAP } from './elevenlabsCostMap';
|
||||
|
||||
export const COST_MAPS = {
|
||||
...AWS_POLLY_COST_MAP,
|
||||
...AWS_TEXTRACT_COST_MAP,
|
||||
...CLAUDE_COST_MAP,
|
||||
...DEEPSEEK_COST_MAP,
|
||||
...ELEVENLABS_COST_MAP,
|
||||
...GEMINI_COST_MAP,
|
||||
...GROQ_COST_MAP,
|
||||
...KV_COST_MAP,
|
||||
|
||||
@@ -6,6 +6,7 @@ const normalizeTTSProvider = (value) => {
|
||||
}
|
||||
const lower = value.toLowerCase();
|
||||
if ( lower === 'openai' ) return 'openai';
|
||||
if ( ['elevenlabs', 'eleven', '11labs', '11-labs', 'eleven-labs', 'elevenlabs-tts'].includes(lower) ) return 'elevenlabs';
|
||||
if ( lower === 'aws' || lower === 'polly' || lower === 'aws-polly' ) return 'aws-polly';
|
||||
return value;
|
||||
};
|
||||
@@ -281,6 +282,10 @@ class AI {
|
||||
provider = 'openai';
|
||||
}
|
||||
|
||||
if ( options.engine && normalizeTTSProvider(options.engine) === 'elevenlabs' && !options.provider ) {
|
||||
provider = 'elevenlabs';
|
||||
}
|
||||
|
||||
if ( provider === 'openai' ) {
|
||||
if ( !options.model && typeof options.engine === 'string' ) {
|
||||
options.model = options.engine;
|
||||
@@ -295,6 +300,23 @@ class AI {
|
||||
options.response_format = 'mp3';
|
||||
}
|
||||
delete options.engine;
|
||||
} else if ( provider === 'elevenlabs' ) {
|
||||
if ( ! options.voice ) {
|
||||
options.voice = '21m00Tcm4TlvDq8ikWAM';
|
||||
}
|
||||
if ( ! options.model && typeof options.engine === 'string' ) {
|
||||
options.model = options.engine;
|
||||
}
|
||||
if ( ! options.model ) {
|
||||
options.model = 'eleven_multilingual_v2';
|
||||
}
|
||||
if ( ! options.output_format && !options.response_format ) {
|
||||
options.output_format = 'mp3_44100_128';
|
||||
}
|
||||
if ( options.response_format && !options.output_format ) {
|
||||
options.output_format = options.response_format;
|
||||
}
|
||||
delete options.engine;
|
||||
} else {
|
||||
provider = 'aws-polly';
|
||||
|
||||
@@ -326,7 +348,9 @@ class AI {
|
||||
}
|
||||
}
|
||||
|
||||
const driverName = provider === 'openai' ? 'openai-tts' : 'aws-polly';
|
||||
const driverName = provider === 'openai'
|
||||
? 'openai-tts'
|
||||
: (provider === 'elevenlabs' ? 'elevenlabs-tts' : 'aws-polly');
|
||||
|
||||
return await utils.make_driver_method(['source'], 'puter-tts', driverName, 'synthesize', {
|
||||
responseType: 'blob',
|
||||
@@ -449,7 +473,13 @@ class AI {
|
||||
params.provider = 'openai';
|
||||
}
|
||||
|
||||
const driverName = provider === 'openai' ? 'openai-tts' : 'aws-polly';
|
||||
if ( provider === 'elevenlabs' ) {
|
||||
params.provider = 'elevenlabs';
|
||||
}
|
||||
|
||||
const driverName = provider === 'openai'
|
||||
? 'openai-tts'
|
||||
: (provider === 'elevenlabs' ? 'elevenlabs-tts' : 'aws-polly');
|
||||
|
||||
return await utils.make_driver_method(['source'], 'puter-tts', driverName, 'list_engines', {
|
||||
responseType: 'text',
|
||||
@@ -478,7 +508,13 @@ class AI {
|
||||
delete params.engine;
|
||||
}
|
||||
|
||||
const driverName = provider === 'openai' ? 'openai-tts' : 'aws-polly';
|
||||
if ( provider === 'elevenlabs' ) {
|
||||
params.provider = 'elevenlabs';
|
||||
}
|
||||
|
||||
const driverName = provider === 'openai'
|
||||
? 'openai-tts'
|
||||
: (provider === 'elevenlabs' ? 'elevenlabs-tts' : 'aws-polly');
|
||||
|
||||
return utils.make_driver_method(['source'], 'puter-tts', driverName, 'list_voices', {
|
||||
responseType: 'text',
|
||||
|
||||
@@ -157,6 +157,32 @@ const testTxt2SpeechWithOpenAIProviderCore = async function() {
|
||||
assert(valueOfValue === srcValue, "valueOf() should match src for OpenAI provider");
|
||||
};
|
||||
|
||||
/**
 * Calls `puter.ai.txt2speech` with the ElevenLabs provider and checks that the
 * result is an `Audio` object whose `toString()`, `valueOf()` and `src` are
 * non-empty strings that agree with each other.
 */
const testTxt2SpeechWithElevenLabsProviderCore = async function() {
    // The trailing `true` requests test mode so no external call is made.
    const audio = await puter.ai.txt2speech(
        "Hello, this is an ElevenLabs provider test.",
        { provider: "elevenlabs", voice: "21m00Tcm4TlvDq8ikWAM" },
        true,
    );

    assert(audio instanceof Audio, "txt2speech should return an Audio object for ElevenLabs provider");
    assert(audio !== null, "txt2speech should not return null for ElevenLabs provider");

    // Read the three string views in the same order as they are checked.
    const asString = audio.toString();
    const asValue = audio.valueOf();
    const { src } = audio;

    // Each view must be a string...
    assert(typeof asString === 'string', "toString() should return a string for ElevenLabs provider");
    assert(typeof asValue === 'string', "valueOf() should return a string for ElevenLabs provider");
    assert(typeof src === 'string', "src should be a string for ElevenLabs provider");

    // ...must be non-empty...
    assert(asString.length > 0, "toString() should not be empty for ElevenLabs provider");
    assert(asValue.length > 0, "valueOf() should not be empty for ElevenLabs provider");
    assert(src.length > 0, "src should not be empty for ElevenLabs provider");

    // ...and both conversions must equal the element's `src`.
    assert(asString === src, "toString() should match src for ElevenLabs provider");
    assert(asValue === src, "valueOf() should match src for ElevenLabs provider");
};
|
||||
|
||||
// Export test functions
|
||||
window.txt2speechTests = [
|
||||
{
|
||||
@@ -209,5 +235,18 @@ window.txt2speechTests = [
|
||||
fail("testTxt2SpeechWithOpenAIProvider failed:", error);
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
name: "testTxt2SpeechWithElevenLabsProvider",
|
||||
description: "Test text-to-speech using the ElevenLabs provider in test mode",
|
||||
test: async function() {
|
||||
try {
|
||||
await testTxt2SpeechWithElevenLabsProviderCore();
|
||||
pass("testTxt2SpeechWithElevenLabsProvider passed");
|
||||
} catch (error) {
|
||||
fail("testTxt2SpeechWithElevenLabsProvider failed:", error);
|
||||
}
|
||||
}
|
||||
}
|
||||
];
|
||||
|
||||
Reference in New Issue
Block a user