# LocalAI/gallery/vibevoice.yaml
#
# NOTE: the path above must stay a comment. As a bare line ahead of `---` it
# is parsed as a separate, scalar-only YAML document, so a loader that reads
# a single document gets that string instead of the mapping below.

---
# NOTE(review): this top-level name is `localai`, while the model configured
# inside `config_file` is named `vibevoice` — presumably intentional; confirm.
name: localai

# Model configuration embedded as a literal block scalar: `|-` keeps the line
# breaks and strips the trailing newline. Everything indented beneath this key,
# including the `#` lines, belongs to the string value; those `#` lines act as
# comments only if the value is later parsed as YAML itself.
config_file: |-
    name: vibevoice
    backend: vibevoice
    description: |
      VibeVoice-Realtime is a real-time text-to-speech model that generates natural-sounding speech.
      This model supports voice cloning through voice preset files (.pt files).

    parameters:
      model: microsoft/VibeVoice-Realtime-0.5B

    # TTS configuration: selects the voice preset used for synthesis.
    tts:
      # Voice selection. Accepts either:
      #   1. A voice preset name (e.g., "Frank", "en-Frank_man", "Grace"),
      #      looked up as a .pt file in voices/streaming_model/
      #   2. A path to a voice preset .pt file (relative to the model
      #      directory, or absolute)
      # Available English voices: Carter, Davis, Emma, Frank, Grace, Mike
      voice: "Frank"
      # Alternative (disabled): set audio_path to point at a voice file directly.
      # audio_path: "voices/streaming_model/en-Frank_man.pt"

    # Use cases declared for this model; text-to-speech is the only entry.
    known_usecases:
      - tts

    # Backend-specific options
    # Each entry is a single "key:value" string that is passed to the backend.
    # Every value below except voices_dir equals the default quoted in its comment.
    options:
      # CFG (Classifier-Free Guidance) scale for generation (default: 1.5)
      # Higher values can improve quality but may slow generation
      - "cfg_scale:1.5"
      # Number of inference steps for the diffusion process (default: 5)
      # More steps = better quality but slower. Typical range: 3-10
      - "inference_steps:5"
      # Enable sampling (default: false)
      # When true, temperature and top_p below control the sampling
      - "do_sample:false"
      # Temperature for sampling (default: 0.9)
      # Only used if do_sample=true, so it has no effect with the setting above
      - "temperature:0.9"
      # Top-p (nucleus) sampling threshold (default: 0.9)
      # Only used if do_sample=true, so it has no effect with the setting above
      - "top_p:0.9"
      # Voices directory path
      # Explicitly sets where to look for voice preset files (.pt files).
      # The value matches the voices/streaming_model/ directory targeted by the
      # download_files entries at the end of this config.
      # NOTE(review): those download_files entries are commented out, so nothing
      # in this file populates the directory — confirm the presets are provided
      # some other way.
      #
      # Examples:
      #   - Relative path (relative to models directory): "voices/streaming_model"
      #   - Absolute path: "/custom/path/to/voices/streaming_model"
      #   - Custom relative path: "my_custom_voices/streaming_model"
      #
      # If not specified, the backend will auto-detect from common locations:
      #   1. {ModelFile directory}/voices/streaming_model/
      #   2. {models_dir}/voices/streaming_model/
      #   3. Backend directory
      - "voices_dir:voices/streaming_model"
    # # Download voice preset files (disabled: this entire section is commented out)
    # # When enabled, voice presets are downloaded to: {models_dir}/voices/streaming_model/
    # # The voices_dir option above tells the backend to look in this location
    # download_files:
    #   # English voices
    #   - filename: voices/streaming_model/en-Frank_man.pt
    #     uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Frank_man.pt
    #   - filename: voices/streaming_model/en-Grace_woman.pt
    #     uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Grace_woman.pt
    #   - filename: voices/streaming_model/en-Mike_man.pt
    #     uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Mike_man.pt
    #   - filename: voices/streaming_model/en-Emma_woman.pt
    #     uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Emma_woman.pt
    #   - filename: voices/streaming_model/en-Carter_man.pt
    #     uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Carter_man.pt
    #   - filename: voices/streaming_model/en-Davis_man.pt
    #     uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Davis_man.pt
    #   # Uncomment to add more languages:
    #   # - filename: voices/streaming_model/fr-Spk0_man.pt
    #   #   uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/fr-Spk0_man.pt
    #   # - filename: voices/streaming_model/de-Spk0_man.pt
    #   #   uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/de-Spk0_man.pt