* feat: split remaining backends and drop embedded backends
  - Drop the silero-vad, huggingface, and stores backends from the embedded binaries
  - Refactor the Makefile and Dockerfile to avoid building gRPC backends
  - Drop the Go code that was used to embed backends
  - Simplify building by using goreleaser
* chore(gallery): be specific with llama-cpp backend templates
* chore(docs): update
* chore(ci): minor fixes
* chore: drop all ffmpeg references
* fix: run protogen-go
* Always enable p2p mode
* Update the goreleaser file
* fix(stores): do not always load
* Fix linting issues
* Simplify
* Mac OS fixup

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
name: "vllm"

config_file: |
    backend: vllm
    context_size: 8192
    parameters:
      max_tokens: 8192
    function:
      disable_no_action: true
      grammar:
        disable: true
        parallel_calls: true
        expect_strings_after_json: true
    template:
      use_tokenizer_template: true
    # Uncomment to specify a quantization method (optional)
    # quantization: "awq"
    # Uncomment to set dtype, choices are: "auto", "half", "float16", "bfloat16", "float", "float32". awq on vLLM does not support bfloat16
    # dtype: "float16"
    # Uncomment to limit the GPU memory utilization (vLLM default is 0.9, i.e. 90%)
    # gpu_memory_utilization: 0.5
    # Uncomment to trust remote code from Hugging Face
    # trust_remote_code: true
    # Uncomment to enable eager execution
    # enforce_eager: true
    # Uncomment to specify the size of the CPU swap space per GPU (in GiB)
    # swap_space: 2
    # Uncomment to specify the maximum length of a sequence (including prompt and output)
    # max_model_len: 32768
    # Uncomment and specify the number of tensor divisions.
    # Allows you to partition and run large models. Performance gains are limited.
    # https://github.com/vllm-project/vllm/issues/1435
    # tensor_parallel_size: 2
    # Uncomment to disable log stats
    # disable_log_stats: true
    # Uncomment to set multi-modal limits per prompt; defaults to 1 per modality if not specified
    # limit_mm_per_prompt:
    #   image: 2
    #   video: 2
    #   audio: 2
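# Usage sketch (not part of the upstream template; the model name and the
# Hugging Face repository below are illustrative assumptions): a model
# configuration derived from this template typically points `parameters.model`
# at a Hugging Face repository ID for vLLM to load, e.g.:
#
#   name: "llama-3-8b-instruct"
#   backend: vllm
#   context_size: 8192
#   parameters:
#     model: "meta-llama/Meta-Llama-3-8B-Instruct"
#     max_tokens: 8192
#   template:
#     use_tokenizer_template: true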