feat: do not bundle llama-cpp anymore (#5790)

* Build llama.cpp separately

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* WIP

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* WIP

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* WIP

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Start attaching some tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add git and small fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: correctly autoload external backends

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Try to run AIO tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Slightly update the Makefile help

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Adapt auto-bumper

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Try to run the Linux tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add llama-cpp into build pipelines

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add default capability (for cpu)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Drop llama-cpp specific logic from the backend loader

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* drop grpc install in ci for tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Pass the backends path for tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Build protogen at start

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(tests): set backends path consistently

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Correctly configure the backends path

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Try to build for darwin

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* WIP

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Compile for metal on arm64/darwin

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Try to run the build without cross-arch

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add the nvidia-l4t and CPU llama-cpp backends to the backend index

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Also build darwin-x86 for llama-cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Temporarily disable arm64 builds

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Test backend build on PR

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Fixup build backend reusable workflow

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Pass the skip-drivers flag

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Use crane

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Skip drivers

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* x86 darwin

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add packaging step for llama.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Fix leftover from bark-cpp extraction

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Try to fix hipblas build

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Ettore Di Giacinto
2025-07-18 13:24:12 +02:00
committed by GitHub
parent 932f6b01a6
commit 294f7022f3
35 changed files with 1613 additions and 835 deletions


@@ -3,7 +3,9 @@
.vscode
.devcontainer
models
backends
examples/chatbot-ui/models
backend/go/image/stablediffusion-ggml/build/
examples/rwkv/models
examples/**/models
Dockerfile*
@@ -14,4 +16,4 @@ __pycache__
# backend virtual environments
**/venv
backend/python/**/source
backend/python/**/source


@@ -3,15 +3,20 @@ set -xe
REPO=$1
BRANCH=$2
VAR=$3
FILE=$4
if [ -z "$FILE" ]; then
FILE="Makefile"
fi
LAST_COMMIT=$(curl -s -H "Accept: application/vnd.github.VERSION.sha" "https://api.github.com/repos/$REPO/commits/$BRANCH")
# Read $VAR from Makefile (only first match)
set +e
CURRENT_COMMIT="$(grep -m1 "^$VAR?=" Makefile | cut -d'=' -f2)"
CURRENT_COMMIT="$(grep -m1 "^$VAR?=" $FILE | cut -d'=' -f2)"
set -e
sed -i Makefile -e "s/$VAR?=.*/$VAR?=$LAST_COMMIT/"
sed -i $FILE -e "s/$VAR?=.*/$VAR?=$LAST_COMMIT/"
if [ -z "$CURRENT_COMMIT" ]; then
echo "Could not find $VAR in Makefile."


@@ -7,7 +7,6 @@ on:
- master
tags:
- '*'
#pull_request:
concurrency:
group: ci-backends-${{ github.head_ref || github.ref }}-${{ github.repository }}
@@ -26,8 +25,9 @@ jobs:
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
backend: ${{ matrix.backend }}
dockerfile: $${ matrix.dockerfile }}
context: $${ matrix.context }}
dockerfile: ${{ matrix.dockerfile }}
skip-drivers: ${{ matrix.skip-drivers }}
context: ${{ matrix.context }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -47,9 +47,22 @@ jobs:
tag-suffix: '-gpu-nvidia-cuda-11-rerankers'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "rerankers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-11-llama-cpp'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "llama-cpp"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
@@ -58,6 +71,7 @@ jobs:
tag-suffix: '-gpu-nvidia-cuda-11-vllm'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "vllm"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -69,6 +83,7 @@ jobs:
tag-suffix: '-gpu-nvidia-cuda-11-transformers'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "transformers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -80,6 +95,7 @@ jobs:
tag-suffix: '-gpu-nvidia-cuda-11-diffusers'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "diffusers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -92,6 +108,7 @@ jobs:
tag-suffix: '-gpu-nvidia-cuda-11-kokoro'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "kokoro"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -103,6 +120,7 @@ jobs:
tag-suffix: '-gpu-nvidia-cuda-11-faster-whisper'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "faster-whisper"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -114,6 +132,7 @@ jobs:
tag-suffix: '-gpu-nvidia-cuda-11-coqui'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "coqui"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -125,6 +144,7 @@ jobs:
tag-suffix: '-gpu-nvidia-cuda-11-bark'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "bark"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -136,6 +156,7 @@ jobs:
tag-suffix: '-gpu-nvidia-cuda-11-chatterbox'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "chatterbox"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -148,9 +169,22 @@ jobs:
tag-suffix: '-gpu-nvidia-cuda-12-rerankers'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "rerankers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-nvidia-cuda-12-llama-cpp'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "llama-cpp"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
@@ -159,6 +193,7 @@ jobs:
tag-suffix: '-gpu-nvidia-cuda-12-vllm'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "vllm"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -170,6 +205,7 @@ jobs:
tag-suffix: '-gpu-nvidia-cuda-12-transformers'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "transformers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -181,6 +217,7 @@ jobs:
tag-suffix: '-gpu-nvidia-cuda-12-diffusers'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "diffusers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -193,6 +230,7 @@ jobs:
tag-suffix: '-gpu-nvidia-cuda-12-kokoro'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "kokoro"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -204,6 +242,7 @@ jobs:
tag-suffix: '-gpu-nvidia-cuda-12-faster-whisper'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "faster-whisper"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -215,6 +254,7 @@ jobs:
tag-suffix: '-gpu-nvidia-cuda-12-coqui'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "coqui"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -226,6 +266,7 @@ jobs:
tag-suffix: '-gpu-nvidia-cuda-12-bark'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "bark"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -237,6 +278,7 @@ jobs:
tag-suffix: '-gpu-nvidia-cuda-12-chatterbox'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "chatterbox"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -249,9 +291,22 @@ jobs:
tag-suffix: '-gpu-rocm-hipblas-rerankers'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-22.04:6.1"
skip-drivers: 'false'
backend: "rerankers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-rocm-hipblas-llama-cpp'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-22.04:6.1"
skip-drivers: 'false'
backend: "llama-cpp"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
@@ -260,6 +315,7 @@ jobs:
tag-suffix: '-gpu-rocm-hipblas-vllm'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-22.04:6.1"
skip-drivers: 'false'
backend: "vllm"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -271,6 +327,7 @@ jobs:
tag-suffix: '-gpu-rocm-hipblas-transformers'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-22.04:6.1"
skip-drivers: 'false'
backend: "transformers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -282,6 +339,7 @@ jobs:
tag-suffix: '-gpu-rocm-hipblas-diffusers'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-22.04:6.1"
skip-drivers: 'false'
backend: "diffusers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -294,6 +352,7 @@ jobs:
tag-suffix: '-gpu-rocm-hipblas-kokoro'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-22.04:6.1"
skip-drivers: 'false'
backend: "kokoro"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -305,6 +364,7 @@ jobs:
tag-suffix: '-gpu-rocm-hipblas-faster-whisper'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-22.04:6.1"
skip-drivers: 'false'
backend: "faster-whisper"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -316,6 +376,7 @@ jobs:
tag-suffix: '-gpu-rocm-hipblas-coqui'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-22.04:6.1"
skip-drivers: 'false'
backend: "coqui"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -327,6 +388,7 @@ jobs:
tag-suffix: '-gpu-rocm-hipblas-bark'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-22.04:6.1"
skip-drivers: 'false'
backend: "bark"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -339,6 +401,7 @@ jobs:
tag-suffix: '-gpu-intel-sycl-f32-rerankers'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
skip-drivers: 'false'
backend: "rerankers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -350,9 +413,34 @@ jobs:
tag-suffix: '-gpu-intel-sycl-f16-rerankers'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
skip-drivers: 'false'
backend: "rerankers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
- build-type: 'sycl_f32'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-intel-sycl-f32-llama-cpp'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
skip-drivers: 'false'
backend: "llama-cpp"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
- build-type: 'sycl_f16'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'true'
tag-suffix: '-gpu-intel-sycl-f16-llama-cpp'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
skip-drivers: 'false'
backend: "llama-cpp"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
- build-type: 'sycl_f32'
cuda-major-version: ""
cuda-minor-version: ""
@@ -361,6 +449,7 @@ jobs:
tag-suffix: '-gpu-intel-sycl-f32-vllm'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
skip-drivers: 'false'
backend: "vllm"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -372,6 +461,7 @@ jobs:
tag-suffix: '-gpu-intel-sycl-f16-vllm'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
skip-drivers: 'false'
backend: "vllm"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -383,6 +473,7 @@ jobs:
tag-suffix: '-gpu-intel-sycl-f32-transformers'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
skip-drivers: 'false'
backend: "transformers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -394,6 +485,7 @@ jobs:
tag-suffix: '-gpu-intel-sycl-f16-transformers'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
skip-drivers: 'false'
backend: "transformers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -405,6 +497,7 @@ jobs:
tag-suffix: '-gpu-intel-sycl-f32-diffusers'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
skip-drivers: 'false'
backend: "diffusers"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -417,6 +510,7 @@ jobs:
tag-suffix: '-gpu-intel-sycl-f32-kokoro'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
skip-drivers: 'false'
backend: "kokoro"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -428,6 +522,7 @@ jobs:
tag-suffix: '-gpu-intel-sycl-f16-kokoro'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
skip-drivers: 'false'
backend: "kokoro"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -439,6 +534,7 @@ jobs:
tag-suffix: '-gpu-intel-sycl-f32-faster-whisper'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
skip-drivers: 'false'
backend: "faster-whisper"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -450,6 +546,7 @@ jobs:
tag-suffix: '-gpu-intel-sycl-f16-faster-whisper'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
skip-drivers: 'false'
backend: "faster-whisper"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -461,6 +558,7 @@ jobs:
tag-suffix: '-gpu-intel-sycl-f32-coqui'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
skip-drivers: 'false'
backend: "coqui"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -472,6 +570,7 @@ jobs:
tag-suffix: '-gpu-intel-sycl-f16-coqui'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
skip-drivers: 'false'
backend: "coqui"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -483,6 +582,7 @@ jobs:
tag-suffix: '-gpu-intel-sycl-f32-bark'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
skip-drivers: 'false'
backend: "bark"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -494,6 +594,7 @@ jobs:
tag-suffix: '-gpu-intel-sycl-f16-bark'
runs-on: 'ubuntu-latest'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
skip-drivers: 'false'
backend: "bark"
dockerfile: "./backend/Dockerfile.python"
context: "./backend"
@@ -506,6 +607,208 @@ jobs:
tag-suffix: '-bark-cpp'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
backend: "bark"
skip-drivers: 'false'
backend: "bark-cpp"
dockerfile: "./backend/Dockerfile.go"
context: "./"
context: "./"
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64,linux/arm64'
tag-latest: 'true'
tag-suffix: '-cpu-llama-cpp'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
backend: "llama-cpp"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'true'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-arm64-llama-cpp'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
runs-on: 'ubuntu-24.04-arm'
backend: "llama-cpp"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
llama-cpp-darwin:
runs-on: macOS-14
strategy:
matrix:
go-version: ['1.21.x']
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Setup Go ${{ matrix.go-version }}
uses: actions/setup-go@v5
with:
go-version: ${{ matrix.go-version }}
cache: false
# You can test your matrix by printing the current Go version
- name: Display Go version
run: go version
- name: Dependencies
run: |
brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
- name: Build llama-cpp-darwin
run: |
make protogen-go
make build-api
bash scripts/build-llama-cpp-darwin.sh
ls -la build/darwin.tar
mv build/darwin.tar build/llama-cpp.tar
- name: Upload llama-cpp.tar
uses: actions/upload-artifact@v4
with:
name: llama-cpp-tar
path: build/llama-cpp.tar
llama-cpp-darwin-publish:
needs: llama-cpp-darwin
runs-on: ubuntu-latest
steps:
- name: Download llama-cpp.tar
uses: actions/download-artifact@v4
with:
name: llama-cpp-tar
path: .
- name: Install crane
run: |
curl -L https://github.com/google/go-containerregistry/releases/latest/download/go-containerregistry_Linux_x86_64.tar.gz | tar -xz
sudo mv crane /usr/local/bin/
- name: Log in to DockerHub
run: |
echo "${{ secrets.DOCKERHUB_PASSWORD }}" | crane auth login docker.io -u "${{ secrets.DOCKERHUB_USERNAME }}" --password-stdin
- name: Log in to quay.io
run: |
echo "${{ secrets.LOCALAI_REGISTRY_PASSWORD }}" | crane auth login quay.io -u "${{ secrets.LOCALAI_REGISTRY_USERNAME }}" --password-stdin
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: |
localai/localai-backends
tags: |
type=ref,event=branch
type=semver,pattern={{raw}}
type=sha
flavor: |
latest=auto
suffix=metal-darwin-arm64-llama-cpp,onlatest=true
- name: Docker meta
id: quaymeta
uses: docker/metadata-action@v5
with:
images: |
quay.io/go-skynet/local-ai-backends
tags: |
type=ref,event=branch
type=semver,pattern={{raw}}
type=sha
flavor: |
latest=auto
suffix=metal-darwin-arm64-llama-cpp,onlatest=true
- name: Push Docker image (DockerHub)
run: |
for tag in $(echo "${{ steps.meta.outputs.tags }}" | tr ',' '\n'); do
crane push llama-cpp.tar $tag
done
- name: Push Docker image (Quay)
run: |
for tag in $(echo "${{ steps.quaymeta.outputs.tags }}" | tr ',' '\n'); do
crane push llama-cpp.tar $tag
done
llama-cpp-darwin-x86:
runs-on: macos-13
strategy:
matrix:
go-version: ['1.21.x']
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Setup Go ${{ matrix.go-version }}
uses: actions/setup-go@v5
with:
go-version: ${{ matrix.go-version }}
cache: false
# You can test your matrix by printing the current Go version
- name: Display Go version
run: go version
- name: Dependencies
run: |
brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
- name: Build llama-cpp-darwin
run: |
make protogen-go
make build-api
export PLATFORMARCH=darwin/amd64
bash scripts/build-llama-cpp-darwin.sh
ls -la build/darwin.tar
mv build/darwin.tar build/llama-cpp.tar
- name: Upload llama-cpp.tar
uses: actions/upload-artifact@v4
with:
name: llama-cpp-tar-x86
path: build/llama-cpp.tar
llama-cpp-darwin-x86-publish:
needs: llama-cpp-darwin-x86
runs-on: ubuntu-latest
steps:
- name: Download llama-cpp.tar
uses: actions/download-artifact@v4
with:
name: llama-cpp-tar-x86
path: .
- name: Install crane
run: |
curl -L https://github.com/google/go-containerregistry/releases/latest/download/go-containerregistry_Linux_x86_64.tar.gz | tar -xz
sudo mv crane /usr/local/bin/
- name: Log in to DockerHub
run: |
echo "${{ secrets.DOCKERHUB_PASSWORD }}" | crane auth login docker.io -u "${{ secrets.DOCKERHUB_USERNAME }}" --password-stdin
- name: Log in to quay.io
run: |
echo "${{ secrets.LOCALAI_REGISTRY_PASSWORD }}" | crane auth login quay.io -u "${{ secrets.LOCALAI_REGISTRY_USERNAME }}" --password-stdin
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: |
localai/localai-backends
tags: |
type=ref,event=branch
type=semver,pattern={{raw}}
type=sha
flavor: |
latest=auto
suffix=darwin-x86-llama-cpp,onlatest=true
- name: Docker meta
id: quaymeta
uses: docker/metadata-action@v5
with:
images: |
quay.io/go-skynet/local-ai-backends
tags: |
type=ref,event=branch
type=semver,pattern={{raw}}
type=sha
flavor: |
latest=auto
suffix=darwin-x86-llama-cpp,onlatest=true
- name: Push Docker image (DockerHub)
run: |
for tag in $(echo "${{ steps.meta.outputs.tags }}" | tr ',' '\n'); do
crane push llama-cpp.tar $tag
done
- name: Push Docker image (Quay)
run: |
for tag in $(echo "${{ steps.quaymeta.outputs.tags }}" | tr ',' '\n'); do
crane push llama-cpp.tar $tag
done
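
For reference, the tags pushed by these publish jobs can be checked from any machine with crane; this is a sketch, and <tag> stands in for whatever tag the docker/metadata-action step actually emitted:

    # list the published backend tags on Docker Hub (works anonymously for public repos)
    crane ls docker.io/localai/localai-backends | grep llama-cpp
    # inspect one of the pushed artifacts; <tag> is a placeholder
    crane manifest docker.io/localai/localai-backends:<tag>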


@@ -49,6 +49,10 @@ on:
description: 'Build Dockerfile'
required: true
type: string
skip-drivers:
description: 'Skip drivers'
default: 'false'
type: string
secrets:
dockerUsername:
required: true
@@ -197,12 +201,13 @@ jobs:
builder: ${{ steps.buildx.outputs.name }}
build-args: |
BUILD_TYPE=${{ inputs.build-type }}
SKIP_DRIVERS=${{ inputs.skip-drivers }}
CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
BASE_IMAGE=${{ inputs.base-image }}
BACKEND=${{ inputs.backend }}
context: ./backend
file: ./backend/Dockerfile.python
context: ${{ inputs.context }}
file: ${{ inputs.dockerfile }}
cache-from: type=gha
platforms: ${{ inputs.platforms }}
push: ${{ github.event_name != 'pull_request' }}
@@ -216,12 +221,13 @@ jobs:
builder: ${{ steps.buildx.outputs.name }}
build-args: |
BUILD_TYPE=${{ inputs.build-type }}
SKIP_DRIVERS=${{ inputs.skip-drivers }}
CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
BASE_IMAGE=${{ inputs.base-image }}
BACKEND=${{ inputs.backend }}
context: ./backend
file: ./backend/Dockerfile.python
context: ${{ inputs.context }}
file: ${{ inputs.dockerfile }}
cache-from: type=gha
platforms: ${{ inputs.platforms }}
push: true
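
Roughly, each matrix entry now expands into a build along these lines; this is a sketch of what the parameterized step amounts to for the CUDA 12 llama-cpp entry, not the exact buildx invocation the workflow issues:

    docker buildx build \
      --build-arg BUILD_TYPE=cublas \
      --build-arg SKIP_DRIVERS=false \
      --build-arg CUDA_MAJOR_VERSION=12 \
      --build-arg CUDA_MINOR_VERSION=0 \
      --build-arg BASE_IMAGE=ubuntu:22.04 \
      --build-arg BACKEND=llama-cpp \
      -f ./backend/Dockerfile.llama-cpp \
      ./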


@@ -10,30 +10,36 @@ jobs:
matrix:
include:
- repository: "ggml-org/llama.cpp"
variable: "CPPLLAMA_VERSION"
variable: "LLAMA_VERSION"
branch: "master"
file: "backend/cpp/llama-cpp/Makefile"
- repository: "ggml-org/whisper.cpp"
variable: "WHISPER_CPP_VERSION"
branch: "master"
file: "Makefile"
- repository: "PABannier/bark.cpp"
variable: "BARKCPP_VERSION"
branch: "main"
file: "Makefile"
- repository: "leejet/stable-diffusion.cpp"
variable: "STABLEDIFFUSION_GGML_VERSION"
branch: "master"
file: "Makefile"
- repository: "mudler/go-stable-diffusion"
variable: "STABLEDIFFUSION_VERSION"
branch: "master"
file: "Makefile"
- repository: "mudler/go-piper"
variable: "PIPER_VERSION"
branch: "master"
file: "Makefile"
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Bump dependencies 🔧
id: bump
run: |
bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }} ${{ matrix.file }}
{
echo 'message<<EOF'
cat "${{ matrix.variable }}_message.txt"


@@ -20,115 +20,140 @@ concurrency:
jobs:
build-linux-arm:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- uses: actions/setup-go@v5
with:
go-version: '1.21.x'
cache: false
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk
sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev
make install-go-tools
- name: Install CUDA Dependencies
run: |
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cuda-cross-aarch64 cuda-nvcc-cross-aarch64-${CUDA_VERSION} libcublas-cross-aarch64-${CUDA_VERSION}
env:
CUDA_VERSION: 12-4
- name: Cache grpc
id: cache-grpc
uses: actions/cache@v4
with:
path: grpc
key: ${{ runner.os }}-arm-grpc-${{ env.GRPC_VERSION }}
- name: Build grpc
if: steps.cache-grpc.outputs.cache-hit != 'true'
run: |
# TODO: temporary disable linux-arm64 build
# build-linux-arm:
# runs-on: ubuntu-24.04-arm
# steps:
# - name: Free Disk Space (Ubuntu)
# uses: jlumbroso/free-disk-space@main
# with:
# # this might remove tools that are actually needed,
# # if set to "true" but frees about 6 GB
# tool-cache: true
# # all of these default to true, but feel free to set to
# # "false" if necessary for your workflow
# android: true
# dotnet: true
# haskell: true
# large-packages: true
# docker-images: true
# swap-storage: true
git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
cd grpc && sed -i "216i\ TESTONLY" "third_party/abseil-cpp/absl/container/CMakeLists.txt" && mkdir -p cmake/build && \
cd cmake/build && cmake -DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
../.. && sudo make --jobs 5 --output-sync=target
- name: Install gRPC
run: |
GNU_HOST=aarch64-linux-gnu
C_COMPILER_ARM_LINUX=$GNU_HOST-gcc
CXX_COMPILER_ARM_LINUX=$GNU_HOST-g++
# - name: Release space from worker
# run: |
# echo "Listing top largest packages"
# pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
# head -n 30 <<< "${pkgs}"
# echo
# df -h
# echo
# sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
# sudo apt-get remove --auto-remove android-sdk-platform-tools snapd || true
# sudo apt-get purge --auto-remove android-sdk-platform-tools snapd || true
# sudo rm -rf /usr/local/lib/android
# sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
# sudo rm -rf /usr/share/dotnet
# sudo apt-get remove -y '^mono-.*' || true
# sudo apt-get remove -y '^ghc-.*' || true
# sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
# sudo apt-get remove -y 'php.*' || true
# sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
# sudo apt-get remove -y '^google-.*' || true
# sudo apt-get remove -y azure-cli || true
# sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
# sudo apt-get remove -y '^gfortran-.*' || true
# sudo apt-get remove -y microsoft-edge-stable || true
# sudo apt-get remove -y firefox || true
# sudo apt-get remove -y powershell || true
# sudo apt-get remove -y r-base-core || true
# sudo apt-get autoremove -y
# sudo apt-get clean
# echo
# echo "Listing top largest packages"
# pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
# head -n 30 <<< "${pkgs}"
# echo
# sudo rm -rfv build || true
# sudo rm -rf /usr/share/dotnet || true
# sudo rm -rf /opt/ghc || true
# sudo rm -rf "/usr/local/share/boost" || true
# sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
# df -h
CROSS_TOOLCHAIN=/usr/$GNU_HOST
CROSS_STAGING_PREFIX=$CROSS_TOOLCHAIN/stage
CMAKE_CROSS_TOOLCHAIN=/tmp/arm.toolchain.cmake
# https://cmake.org/cmake/help/v3.13/manual/cmake-toolchains.7.html#cross-compiling-for-linux
echo "set(CMAKE_SYSTEM_NAME Linux)" >> $CMAKE_CROSS_TOOLCHAIN && \
echo "set(CMAKE_SYSTEM_PROCESSOR arm)" >> $CMAKE_CROSS_TOOLCHAIN && \
echo "set(CMAKE_STAGING_PREFIX $CROSS_STAGING_PREFIX)" >> $CMAKE_CROSS_TOOLCHAIN && \
echo "set(CMAKE_SYSROOT ${CROSS_TOOLCHAIN}/sysroot)" >> $CMAKE_CROSS_TOOLCHAIN && \
echo "set(CMAKE_C_COMPILER /usr/bin/$C_COMPILER_ARM_LINUX)" >> $CMAKE_CROSS_TOOLCHAIN && \
echo "set(CMAKE_CXX_COMPILER /usr/bin/$CXX_COMPILER_ARM_LINUX)" >> $CMAKE_CROSS_TOOLCHAIN && \
echo "set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)" >> $CMAKE_CROSS_TOOLCHAIN && \
echo "set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)" >> $CMAKE_CROSS_TOOLCHAIN && \
echo "set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)" >> $CMAKE_CROSS_TOOLCHAIN && \
echo "set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)" >> $CMAKE_CROSS_TOOLCHAIN
GRPC_DIR=$PWD/grpc
cd grpc && cd cmake/build && sudo make --jobs 5 --output-sync=target install && \
GRPC_CROSS_BUILD_DIR=$GRPC_DIR/cmake/cross_build && \
mkdir -p $GRPC_CROSS_BUILD_DIR && \
cd $GRPC_CROSS_BUILD_DIR && \
cmake -DCMAKE_TOOLCHAIN_FILE=$CMAKE_CROSS_TOOLCHAIN \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=$CROSS_TOOLCHAIN/grpc_install \
../.. && \
sudo make -j`nproc` install
- name: Build
id: build
run: |
GNU_HOST=aarch64-linux-gnu
C_COMPILER_ARM_LINUX=$GNU_HOST-gcc
CXX_COMPILER_ARM_LINUX=$GNU_HOST-g++
CROSS_TOOLCHAIN=/usr/$GNU_HOST
CROSS_STAGING_PREFIX=$CROSS_TOOLCHAIN/stage
CMAKE_CROSS_TOOLCHAIN=/tmp/arm.toolchain.cmake
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
export PATH=$PATH:$GOPATH/bin
export PATH=/usr/local/cuda/bin:$PATH
sudo rm -rf /usr/aarch64-linux-gnu/lib/libstdc++.so.6
sudo cp -rf /usr/aarch64-linux-gnu/lib/libstdc++.so* /usr/aarch64-linux-gnu/lib/libstdc++.so.6
sudo cp /usr/aarch64-linux-gnu/lib/ld-linux-aarch64.so.1 ld.so
BACKEND_LIBS="./grpc/cmake/cross_build/third_party/re2/libre2.a ./grpc/cmake/cross_build/libgrpc.a ./grpc/cmake/cross_build/libgrpc++.a ./grpc/cmake/cross_build/third_party/protobuf/libprotobuf.a /usr/aarch64-linux-gnu/lib/libc.so.6 /usr/aarch64-linux-gnu/lib/libstdc++.so.6 /usr/aarch64-linux-gnu/lib/libgomp.so.1 /usr/aarch64-linux-gnu/lib/libm.so.6 /usr/aarch64-linux-gnu/lib/libgcc_s.so.1 /usr/aarch64-linux-gnu/lib/libdl.so.2 /usr/aarch64-linux-gnu/lib/libpthread.so.0 ./ld.so" \
GOOS=linux \
GOARCH=arm64 \
CMAKE_ARGS="-DProtobuf_INCLUDE_DIRS=$CROSS_STAGING_PREFIX/include -DProtobuf_DIR=$CROSS_STAGING_PREFIX/lib/cmake/protobuf -DgRPC_DIR=$CROSS_STAGING_PREFIX/lib/cmake/grpc -DCMAKE_TOOLCHAIN_FILE=$CMAKE_CROSS_TOOLCHAIN -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++" make dist-cross-linux-arm64
- uses: actions/upload-artifact@v4
with:
name: LocalAI-linux-arm64
path: release/
- name: Release
uses: softprops/action-gh-release@v2
if: startsWith(github.ref, 'refs/tags/')
with:
files: |
release/*
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true
# - name: Force Install GIT latest
# run: |
# sudo apt-get update \
# && sudo apt-get install -y software-properties-common \
# && sudo apt-get update \
# && sudo add-apt-repository -y ppa:git-core/ppa \
# && sudo apt-get update \
# && sudo apt-get install -y git
# - name: Clone
# uses: actions/checkout@v4
# with:
# submodules: true
# - uses: actions/setup-go@v5
# with:
# go-version: '1.21.x'
# cache: false
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
# make install-go-tools
# - name: Install CUDA Dependencies
# run: |
# curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
# sudo dpkg -i cuda-keyring_1.1-1_all.deb
# sudo apt-get update
# sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
# env:
# CUDA_VERSION: 12-5
# - name: Cache grpc
# id: cache-grpc
# uses: actions/cache@v4
# with:
# path: grpc
# key: ${{ runner.os }}-grpc-arm64-${{ env.GRPC_VERSION }}
# - name: Build grpc
# if: steps.cache-grpc.outputs.cache-hit != 'true'
# run: |
# git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
# cd grpc && sed -i "216i\ TESTONLY" "third_party/abseil-cpp/absl/container/CMakeLists.txt" && mkdir -p cmake/build && \
# cd cmake/build && cmake -DgRPC_INSTALL=ON \
# -DgRPC_BUILD_TESTS=OFF \
# ../.. && sudo make --jobs 5 --output-sync=target
# - name: Install gRPC
# run: |
# cd grpc && cd cmake/build && sudo make --jobs 5 --output-sync=target install
# # BACKEND_LIBS needed for gpu-workload: /opt/intel/oneapi/*/lib/libiomp5.so /opt/intel/oneapi/*/lib/libmkl_core.so /opt/intel/oneapi/*/lib/libmkl_core.so.2 /opt/intel/oneapi/*/lib/libmkl_intel_ilp64.so /opt/intel/oneapi/*/lib/libmkl_intel_ilp64.so.2 /opt/intel/oneapi/*/lib/libmkl_sycl_blas.so /opt/intel/oneapi/*/lib/libmkl_sycl_blas.so.4 /opt/intel/oneapi/*/lib/libmkl_tbb_thread.so /opt/intel/oneapi/*/lib/libmkl_tbb_thread.so.2 /opt/intel/oneapi/*/lib/libsycl.so /opt/intel/oneapi/*/lib/libsycl.so.7 /opt/intel/oneapi/*/lib/libsycl.so.7.1.0 /opt/rocm-*/lib/libamdhip64.so /opt/rocm-*/lib/libamdhip64.so.5 /opt/rocm-*/lib/libamdhip64.so.6 /opt/rocm-*/lib/libamdhip64.so.6.1.60100 /opt/rocm-*/lib/libhipblas.so /opt/rocm-*/lib/libhipblas.so.2 /opt/rocm-*/lib/libhipblas.so.2.1.60100 /opt/rocm-*/lib/librocblas.so /opt/rocm-*/lib/librocblas.so.4 /opt/rocm-*/lib/librocblas.so.4.1.60100 /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /usr/lib/x86_64-linux-gnu/libOpenCL.so.1 /usr/lib/x86_64-linux-gnu/libOpenCL.so.1.0.0 /usr/lib/x86_64-linux-gnu/libm.so.6 /usr/lib/x86_64-linux-gnu/libgcc_s.so.1 /usr/lib/x86_64-linux-gnu/libc.so.6 /usr/lib/x86_64-linux-gnu/librt.so.1 /usr/local/cuda-*/targets/x86_64-linux/lib/libcublas.so /usr/local/cuda-*/targets/x86_64-linux/lib/libcublasLt.so /usr/local/cuda-*/targets/x86_64-linux/lib/libcudart.so /usr/local/cuda-*/targets/x86_64-linux/lib/stubs/libcuda.so
# - name: Build
# id: build
# run: |
# go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
# go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
# export PATH=$PATH:$GOPATH/bin
# export PATH=/usr/local/cuda/bin:$PATH
# sudo cp /lib64/ld-linux-aarch64.so.1 ld.so
# BACKEND_LIBS="./ld.so ./sources/go-piper/piper/build/fi/lib/libfmt.a ./sources/go-piper/piper-phonemize/pi/lib/libonnxruntime.so.1.14.1 ./sources/go-piper/piper-phonemize/pi/src/libespeak-ng/libespeak-ng.so /usr/lib/aarch64-linux-gnu/libdl.so.2 /usr/lib/aarch64-linux-gnu/librt.so.1 /usr/lib/aarch64-linux-gnu/libpthread.so.0 ./sources/go-piper/piper-phonemize/pi/lib/libpiper_phonemize.so.1 ./sources/go-piper/piper/build/si/lib/libspdlog.a ./sources/go-piper/espeak/ei/lib/libucd.so" \
# make -j4 dist
# - uses: actions/upload-artifact@v4
# with:
# name: LocalAI-linux-arm64
# path: release/
# - name: Release
# uses: softprops/action-gh-release@v2
# if: startsWith(github.ref, 'refs/tags/')
# with:
# files: |
# release/*
# - name: Setup tmate session if tests fail
# if: ${{ failure() }}
# uses: mxschmitt/action-tmate@v3.22
# with:
# detached: true
# connect-timeout-seconds: 180
# limit-access-to-actor: true
build-linux:
runs-on: ubuntu-latest
steps:


@@ -67,18 +67,21 @@ jobs:
# You can test your matrix by printing the current Go version
- name: Display Go version
run: go version
- name: Proto Dependencies
run: |
# Install protoc
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install github.com/GeertJohan/go.rice/rice@latest
PATH="$PATH:$HOME/go/bin" make protogen-go
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
sudo apt-get install -y libgmock-dev clang
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
sudo apt-get update && \
sudo apt-get install -y conda
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake patch python3-pip unzip
@@ -94,9 +97,6 @@ jobs:
sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
export CUDACXX=/usr/local/cuda/bin/nvcc
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install github.com/GeertJohan/go.rice/rice@latest
# The python3-grpc-tools package in 22.04 is too old
pip install --user grpcio-tools==1.71.0 grpcio==1.71.0
@@ -107,25 +107,10 @@ jobs:
make sources/go-piper && \
GO_TAGS="tts" make -C sources/go-piper piper.o && \
sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/
make backends/llama-cpp
env:
CUDA_VERSION: 12-4
- name: Cache grpc
id: cache-grpc
uses: actions/cache@v4
with:
path: grpc
key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
- name: Build grpc
if: steps.cache-grpc.outputs.cache-hit != 'true'
run: |
git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --jobs 5 --shallow-submodules https://github.com/grpc/grpc && \
cd grpc && sed -i "216i\ TESTONLY" "third_party/abseil-cpp/absl/container/CMakeLists.txt" && mkdir -p cmake/build && cd cmake/build && \
cmake -DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
../.. && sudo make --jobs 5
- name: Install gRPC
run: |
cd grpc && cd cmake/build && sudo make --jobs 5 install
- name: Test
run: |
PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
@@ -186,14 +171,9 @@ jobs:
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install github.com/GeertJohan/go.rice/rice@latest
PATH="$PATH:$HOME/go/bin" make protogen-go
- name: Build images
run: |
docker build --build-arg FFMPEG=true --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
- name: Test
run: |
PATH="$PATH:$HOME/go/bin" LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
make run-e2e-aio
PATH="$PATH:$HOME/go/bin" make backends/llama-cpp docker-build-aio e2e-aio
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22
@@ -225,6 +205,14 @@ jobs:
brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
pip install --user --no-cache-dir grpcio-tools==1.71.0 grpcio==1.71.0
go install github.com/GeertJohan/go.rice/rice@latest
- name: Build llama-cpp-darwin
run: |
make protogen-go
make build-api
bash scripts/build-llama-cpp-darwin.sh
ls -la build/darwin.tar
mv build/darwin.tar build/llama-cpp.tar
./local-ai backends install "ocifile://$PWD/build/llama-cpp.tar"
- name: Test
run: |
export C_INCLUDE_PATH=/usr/local/include
@@ -232,7 +220,8 @@ jobs:
export CC=/opt/homebrew/opt/llvm/bin/clang
# Used to run the newer GNUMake version from brew that supports --output-sync
export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
PATH="$PATH:$HOME/go/bin" make protogen-go
PATH="$PATH:$HOME/go/bin" BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22

.gitignore

@@ -5,9 +5,11 @@ __pycache__/
*.o
get-sources
prepare-sources
/backend/cpp/llama/grpc-server
/backend/cpp/llama/llama.cpp
/backend/cpp/llama-cpp/grpc-server
/backend/cpp/llama-cpp/llama.cpp
/backend/cpp/llama-*
!backend/cpp/llama-cpp
/backends
*.log
@@ -56,4 +58,4 @@ docs/static/gallery.html
**/venv
# per-developer customization files for the development container
.devcontainer/customization/*
.devcontainer/customization/*


@@ -25,6 +25,7 @@ ARG TARGETVARIANT
ENV BUILD_TYPE=${BUILD_TYPE}
RUN mkdir -p /run/localai
RUN echo "default" > /run/localai/capability
# Vulkan requirements
RUN <<EOT bash
@@ -299,11 +300,7 @@ COPY ./pkg/langchain ./pkg/langchain
RUN ls -l ./
RUN make backend-assets
RUN make prepare
RUN if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make grpcs; \
else \
make grpcs; \
fi
RUN make grpcs
# The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
# Adjustments to the build process should likely be made here.
@@ -316,11 +313,7 @@ COPY . .
## Build the binary
## If we're on arm64 AND using cublas/hipblas, skip some of the llama-compat backends to save space
## Otherwise just run the normal build
RUN if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
else \
make build; \
fi
RUN make build
RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \

Makefile

@@ -5,9 +5,6 @@ BINARY_NAME=local-ai
DETECT_LIBS?=true
# llama.cpp versions
CPPLLAMA_VERSION?=d6fb3f6b49b27ef1c0f4cf5128e041f7e7dc03af
# whisper.cpp version
WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
WHISPER_CPP_VERSION?=032697b9a850dc2615555e2a93a683cc3dd58559
@@ -16,10 +13,6 @@ WHISPER_CPP_VERSION?=032697b9a850dc2615555e2a93a683cc3dd58559
PIPER_REPO?=https://github.com/mudler/go-piper
PIPER_VERSION?=e10ca041a885d4a8f3871d52924b47792d5e5aa0
# bark.cpp
BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
BARKCPP_VERSION?=5d5be84f089ab9ea53b7a793f088d3fbf7247495
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/richiejp/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=53e3b17eb3d0b5760ced06a1f98320b68b34aaae
@@ -225,12 +218,6 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
endif
ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
ifeq ($(ONNX_OS),linux)
@@ -261,23 +248,6 @@ endif
all: help
## bark.cpp
sources/bark.cpp:
git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
cd sources/bark.cpp && \
git checkout $(BARKCPP_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/bark.cpp/build/libbark.a: sources/bark.cpp
cd sources/bark.cpp && \
mkdir -p build && \
cd build && \
cmake $(CMAKE_ARGS) .. && \
cmake --build . --config Release
backend/go/bark-cpp/libbark.a: sources/bark.cpp/build/libbark.a
$(MAKE) -C backend/go/bark-cpp libbark.a
## go-piper
sources/go-piper:
mkdir -p sources/go-piper
@@ -333,7 +303,7 @@ sources/whisper.cpp/build/src/libwhisper.a: sources/whisper.cpp
cd sources/whisper.cpp && cmake $(WHISPER_CMAKE_ARGS) . -B ./build
cd sources/whisper.cpp/build && cmake --build . --config Release
get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/whisper.cpp
replace:
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
@@ -365,10 +335,7 @@ clean: ## Remove build related file
rm -rf release/
rm -rf backend-assets/*
$(MAKE) -C backend/cpp/grpc clean
$(MAKE) -C backend/go/bark-cpp clean
$(MAKE) -C backend/cpp/llama clean
$(MAKE) -C backend/go/image/stablediffusion-ggml clean
rm -rf backend/cpp/llama-* || true
$(MAKE) dropreplace
$(MAKE) protogen-clean
rmdir pkg/grpc/proto || true
@@ -402,9 +369,6 @@ endif
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
rice append --exec $(BINARY_NAME)
build-minimal:
BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=p2p $(MAKE) build
build-api:
BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=p2p $(MAKE) build
@@ -412,18 +376,6 @@ backend-assets/lib:
mkdir -p backend-assets/lib
dist:
$(MAKE) backend-assets/grpc/llama-cpp-avx2
ifeq ($(DETECT_LIBS),true)
scripts/prepare-libs.sh backend-assets/grpc/llama-cpp-avx2
endif
ifeq ($(OS),Darwin)
BUILD_TYPE=none $(MAKE) backend-assets/grpc/llama-cpp-fallback
else
$(MAKE) backend-assets/grpc/llama-cpp-cuda
$(MAKE) backend-assets/grpc/llama-cpp-hipblas
$(MAKE) backend-assets/grpc/llama-cpp-sycl_f16
$(MAKE) backend-assets/grpc/llama-cpp-sycl_f32
endif
GO_TAGS="tts p2p" $(MAKE) build
ifeq ($(DETECT_LIBS),true)
scripts/prepare-libs.sh backend-assets/grpc/piper
@@ -439,19 +391,6 @@ else
shasum -a 256 release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH) > release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH).sha256
endif
dist-cross-linux-arm64:
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_NATIVE=off" GRPC_BACKENDS="backend-assets/grpc/llama-cpp-fallback backend-assets/grpc/llama-cpp-grpc backend-assets/util/llama-cpp-rpc-server" GO_TAGS="p2p" \
STATIC=true $(MAKE) build
mkdir -p release
# if BUILD_ID is empty, then we don't append it to the binary name
ifeq ($(BUILD_ID),)
cp $(BINARY_NAME) release/$(BINARY_NAME)-$(OS)-arm64
shasum -a 256 release/$(BINARY_NAME)-$(OS)-arm64 > release/$(BINARY_NAME)-$(OS)-arm64.sha256
else
cp $(BINARY_NAME) release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-arm64
shasum -a 256 release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-arm64 > release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-arm64.sha256
endif
osx-signed: build
codesign --deep --force --sign "$(OSX_SIGNING_IDENTITY)" --entitlements "./Entitlements.plist" "./$(BINARY_NAME)"
@@ -472,17 +411,47 @@ prepare-test: grpcs
cp -rf backend-assets core/http
cp tests/models_fixtures/* test-models
########################################################
## Tests
########################################################
## Test targets
test: prepare test-models/testmodel.ggml grpcs
@echo 'Running tests'
export GO_TAGS="tts debug"
$(MAKE) prepare-test
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models BACKENDS_PATH=$(abspath ./)/backends \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
$(MAKE) test-llama-gguf
$(MAKE) test-tts
$(MAKE) test-stablediffusion
backends/llama-cpp: docker-build-llama-cpp docker-save-llama-cpp build-api
./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)"
########################################################
## AIO tests
########################################################
docker-build-aio:
docker build --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test $(MAKE) docker-aio
e2e-aio:
LOCALAI_BACKEND_DIR=$(abspath ./backends) \
LOCALAI_MODELS_DIR=$(abspath ./models) \
LOCALAI_IMAGE_TAG=test \
LOCALAI_IMAGE=local-ai-aio \
$(MAKE) run-e2e-aio
run-e2e-aio: protogen-go
@echo 'Running e2e AIO tests'
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio
########################################################
## E2E tests
########################################################
prepare-e2e:
mkdir -p $(TEST_DIR)
cp -rfv $(abspath ./tests/e2e-fixtures)/gpu.yaml $(TEST_DIR)/gpu.yaml
@@ -493,10 +462,6 @@ run-e2e-image:
ls -liah $(abspath ./tests/e2e-fixtures)
docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests
run-e2e-aio: protogen-go
@echo 'Running e2e AIO tests'
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio
test-e2e:
@echo 'Running e2e tests'
BUILD_TYPE=$(BUILD_TYPE) \
@@ -507,16 +472,20 @@ teardown-e2e:
rm -rf $(TEST_DIR) || true
docker stop $$(docker ps -q --filter ancestor=localai-tests)
########################################################
## Integration and unit tests
########################################################
test-llama-gguf: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models BACKENDS_PATH=$(abspath ./)/backends \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
test-tts: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models BACKENDS_PATH=$(abspath ./)/backends \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
test-stablediffusion: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models BACKENDS_PATH=$(abspath ./)/backends \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
test-stores: backend-assets/grpc/local-store
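
Taken together, the local flow these targets suggest looks roughly like this (a sketch; it assumes Docker is available, since llama-cpp is now built as a backend image and installed from an OCI tarball rather than compiled into the binary):

    # build the llama-cpp backend image, save it as an OCI tar, and install it into ./backends
    make backends/llama-cpp
    # run the test suites; they point the runtime at the external backends via BACKENDS_PATH
    make test
    # or exercise the AIO end-to-end path the CI uses
    make docker-build-aio e2e-aio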
@@ -528,6 +497,10 @@ test-container:
docker build --target requirements -t local-ai-test-container .
docker run -ti --rm --entrypoint /bin/bash -ti -v $(abspath ./):/build local-ai-test-container
########################################################
## Help
########################################################
## Help:
help: ## Show this help.
@echo ''
@@ -540,6 +513,10 @@ help: ## Show this help.
else if (/^## .*$$/) {printf " ${CYAN}%s${RESET}\n", substr($$1,4)} \
}' $(MAKEFILE_LIST)
########################################################
## Backends
########################################################
.PHONY: protogen
protogen: protogen-go protogen-python
@@ -679,7 +656,7 @@ backend-assets/espeak-ng-data: sources/go-piper sources/go-piper/libpiper_bindin
mkdir -p backend-assets/espeak-ng-data
@cp -rf sources/go-piper/piper-phonemize/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data
backend-assets/grpc: protogen-go replace
backend-assets/grpc:
mkdir -p backend-assets/grpc
backend-assets/grpc/huggingface: backend-assets/grpc
@@ -688,128 +665,21 @@ ifneq ($(UPX),)
$(UPX) backend-assets/grpc/huggingface
endif
backend/cpp/llama/llama.cpp:
LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp
INSTALLED_PACKAGES=$(CURDIR)/backend/cpp/grpc/installed_packages
INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
-DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \
-Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
-DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
-DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
build-llama-cpp-grpc-server:
# Conditionally build grpc for the llama backend to use if needed
ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
$(MAKE) -C backend/cpp/grpc build
_PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto \
_GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin \
PATH="${INSTALLED_PACKAGES}/bin:${PATH}" \
CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" \
LLAMA_VERSION=$(CPPLLAMA_VERSION) \
$(MAKE) -C backend/cpp/${VARIANT} grpc-server
else
echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/${VARIANT} grpc-server
endif
# This target is for manually building a variant with-auto detected flags
backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-cpp
$(MAKE) -C backend/cpp/llama-cpp purge
$(info ${GREEN}I llama-cpp build info:avx2${RESET})
$(MAKE) VARIANT="llama-cpp" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-cpp/grpc-server backend-assets/grpc/llama-cpp
backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-avx2
$(MAKE) -C backend/cpp/llama-avx2 purge
$(info ${GREEN}I llama-cpp build info:avx2${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
backend-assets/grpc/llama-cpp-avx512: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-avx512
$(MAKE) -C backend/cpp/llama-avx512 purge
$(info ${GREEN}I llama-cpp build info:avx512${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx512" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-avx512/grpc-server backend-assets/grpc/llama-cpp-avx512
backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-avx
$(MAKE) -C backend/cpp/llama-avx purge
$(info ${GREEN}I llama-cpp build info:avx${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-avx" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-avx/grpc-server backend-assets/grpc/llama-cpp-avx
backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-fallback
$(MAKE) -C backend/cpp/llama-fallback purge
$(info ${GREEN}I llama-cpp build info:fallback${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback
backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-cuda
$(MAKE) -C backend/cpp/llama-cuda purge
$(info ${GREEN}I llama-cpp build info:cuda${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda
backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-hipblas
$(MAKE) -C backend/cpp/llama-hipblas purge
$(info ${GREEN}I llama-cpp build info:hipblas${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas
backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-sycl_f16
$(MAKE) -C backend/cpp/llama-sycl_f16 purge
$(info ${GREEN}I llama-cpp build info:sycl_f16${RESET})
BUILD_TYPE="sycl_f16" $(MAKE) VARIANT="llama-sycl_f16" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-sycl_f16/grpc-server backend-assets/grpc/llama-cpp-sycl_f16
backend-assets/grpc/llama-cpp-sycl_f32: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-sycl_f32
$(MAKE) -C backend/cpp/llama-sycl_f32 purge
$(info ${GREEN}I llama-cpp build info:sycl_f32${RESET})
BUILD_TYPE="sycl_f32" $(MAKE) VARIANT="llama-sycl_f32" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-sycl_f32/grpc-server backend-assets/grpc/llama-cpp-sycl_f32
backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-grpc
$(MAKE) -C backend/cpp/llama-grpc purge
$(info ${GREEN}I llama-cpp build info:grpc${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc
backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
mkdir -p backend-assets/util/
cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
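Each of the targets above stages a separately tuned copy of the llama.cpp gRPC server under backend-assets/grpc/. As a usage sketch (target names as defined above, nothing else assumed), a single variant can be built straight from the repository root:
# build only the AVX2 server and the no-SIMD fallback
make backend-assets/grpc/llama-cpp-avx2
make backend-assets/grpc/llama-cpp-fallback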
backend-assets/grpc/bark-cpp: backend/go/bark-cpp/libbark.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark-cpp/ LIBRARY_PATH=$(CURDIR)/backend/go/bark-cpp/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark-cpp/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/bark-cpp
endif
backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
backend-assets/grpc/piper: protogen-go replace sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/piper
endif
backend-assets/grpc/silero-vad: backend-assets/grpc backend-assets/lib/libonnxruntime.so.1
backend-assets/grpc/silero-vad: protogen-go replace backend-assets/grpc backend-assets/lib/libonnxruntime.so.1
CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/onnxruntime/include/" LIBRARY_PATH=$(CURDIR)/backend-assets/lib \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/silero-vad ./backend/go/vad/silero
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/silero-vad
endif
backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/build/src/libwhisper.a backend-assets/grpc
backend-assets/grpc/whisper: protogen-go replace sources/whisper.cpp sources/whisper.cpp/build/src/libwhisper.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="${WHISPER_INCLUDE_PATH}" LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" LD_LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" \
CGO_CXXFLAGS="$(CGO_CXXFLAGS_WHISPER)" \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
@@ -817,13 +687,13 @@ ifneq ($(UPX),)
$(UPX) backend-assets/grpc/whisper
endif
backend-assets/grpc/local-store: backend-assets/grpc
backend-assets/grpc/local-store: backend-assets/grpc protogen-go replace
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/local-store ./backend/go/stores/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/local-store
endif
grpcs: prepare $(GRPC_BACKENDS)
grpcs: prepare protogen-go $(GRPC_BACKENDS)
DOCKER_IMAGE?=local-ai
DOCKER_AIO_IMAGE?=local-ai-aio
@@ -879,6 +749,59 @@ docker-image-intel-xpu:
--build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" \
--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
########################################################
## Backends
########################################################
backend-images:
mkdir -p backend-images
docker-build-llama-cpp:
docker build -t local-ai-backend:llama-cpp -f backend/Dockerfile.llama-cpp .
docker-build-bark-cpp:
docker build -t local-ai-backend:bark-cpp -f backend/Dockerfile.go --build-arg BACKEND=bark-cpp .
docker-save-llama-cpp: backend-images
docker save local-ai-backend:llama-cpp -o backend-images/llama-cpp.tar
docker-build-rerankers:
docker build -t local-ai-backend:rerankers -f backend/Dockerfile.python --build-arg BACKEND=rerankers .
docker-build-vllm:
docker build -t local-ai-backend:vllm -f backend/Dockerfile.python --build-arg BACKEND=vllm .
docker-build-transformers:
docker build -t local-ai-backend:transformers -f backend/Dockerfile.python --build-arg BACKEND=transformers .
docker-build-diffusers:
docker build -t local-ai-backend:diffusers -f backend/Dockerfile.python --build-arg BACKEND=diffusers .
docker-build-kokoro:
docker build -t local-ai-backend:kokoro -f backend/Dockerfile.python --build-arg BACKEND=kokoro .
docker-build-faster-whisper:
docker build -t local-ai-backend:faster-whisper -f backend/Dockerfile.python --build-arg BACKEND=faster-whisper .
docker-build-coqui:
docker build -t local-ai-backend:coqui -f backend/Dockerfile.python --build-arg BACKEND=coqui .
docker-build-bark:
docker build -t local-ai-backend:bark -f backend/Dockerfile.python --build-arg BACKEND=bark .
docker-build-chatterbox:
docker build -t local-ai-backend:chatterbox -f backend/Dockerfile.python --build-arg BACKEND=chatterbox .
docker-build-exllama2:
docker build -t local-ai-backend:exllama2 -f backend/Dockerfile.python --build-arg BACKEND=exllama2 .
docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-bark docker-build-chatterbox docker-build-exllama2
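The image targets above can be chained per backend; a minimal sketch for llama-cpp specifically, using the tag and tarball paths defined in these rules:
make docker-build-llama-cpp   # builds local-ai-backend:llama-cpp from backend/Dockerfile.llama-cpp
make docker-save-llama-cpp    # exports the image to backend-images/llama-cpp.tar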
########################################################
### END Backends
########################################################
.PHONY: swagger
swagger:
swag init -g core/http/app.go --output swagger


@@ -17,9 +17,9 @@ ARG GO_VERSION=1.22.6
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ccache \
git ccache \
ca-certificates \
make \
make cmake \
curl unzip \
libssl-dev && \
apt-get clean && \
@@ -123,9 +123,10 @@ EOT
COPY . /LocalAI
RUN cd /LocalAI && make backend-assets/grpc/${BACKEND}
RUN cd /LocalAI && make protogen-go && make -C /LocalAI/backend/go/${BACKEND} build
FROM scratch
ARG BACKEND=rerankers
COPY --from=builder /LocalAI/backend-assets/grpc/${BACKEND} ./
COPY --from=builder /LocalAI/backend/go/${BACKEND}/${BACKEND} ./
COPY --from=builder /LocalAI/backend/go/${BACKEND}/run.sh ./


@@ -0,0 +1,207 @@
ARG BASE_IMAGE=ubuntu:22.04
ARG GRPC_BASE_IMAGE=${BASE_IMAGE}
# The grpc target does one thing: it builds and installs GRPC. This is in its own layer so that it can be effectively cached by CI.
# You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work.
FROM ${GRPC_BASE_IMAGE} AS grpc
# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
ARG GRPC_MAKEFLAGS="-j4 -Otarget"
ARG GRPC_VERSION=v1.65.0
ARG CMAKE_FROM_SOURCE=false
ARG CMAKE_VERSION=3.26.4
ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
WORKDIR /build
RUN apt-get update && \
apt-get install -y --no-install-recommends \
ca-certificates \
build-essential curl libssl-dev \
git && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Install CMake (the version in 22.04 is too old)
RUN <<EOT bash
if [ "${CMAKE_FROM_SOURCE}" = "true" ]; then
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
else
apt-get update && \
apt-get install -y \
cmake && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
# and running make install in the target container
RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
mkdir -p /build/grpc/cmake/build && \
cd /build/grpc/cmake/build && \
sed -i "216i\ TESTONLY" "../../third_party/abseil-cpp/absl/container/CMakeLists.txt" && \
cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX:PATH=/opt/grpc ../.. && \
make && \
make install && \
rm -rf /build
FROM ${BASE_IMAGE} AS builder
ARG BACKEND=rerankers
ARG BUILD_TYPE
ENV BUILD_TYPE=${BUILD_TYPE}
ARG CUDA_MAJOR_VERSION
ARG CUDA_MINOR_VERSION
ARG SKIP_DRIVERS=false
ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETARCH
ARG TARGETVARIANT
ARG GO_VERSION=1.22.6
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ccache git \
ca-certificates \
make \
curl unzip \
libssl-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Cuda
ENV PATH=/usr/local/cuda/bin:${PATH}
# HipBLAS requirements
ENV PATH=/opt/rocm/bin:${PATH}
# Vulkan requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils wget gpg-agent && \
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt-get update && \
apt-get install -y \
vulkan-sdk && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# CuBLAS requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils
if [ "amd64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
fi
if [ "arm64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
fi
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
apt-get update && \
apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# If we are building with clblas support, we need the libraries for the builds
RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
libclblast-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* \
; fi
RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
hipblas-dev \
rocblas-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
# I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
# to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
ldconfig \
; fi
RUN echo "TARGETARCH: $TARGETARCH"
# We need protoc installed, and the version in 22.04 is too old. We will create one as part of installing the GRPC build below,
# but that will also bring in a newer version of absl, which stablediffusion cannot compile with. This version of protoc is only
# here so that we can generate the grpc code for the stablediffusion build
RUN <<EOT bash
if [ "amd64" = "$TARGETARCH" ]; then
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
fi
if [ "arm64" = "$TARGETARCH" ]; then
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip && \
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
fi
EOT
# Install CMake (the version in 22.04 is too old)
RUN <<EOT bash
if [ "${CMAKE_FROM_SOURCE}" = "true" ]; then
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
else
apt-get update && \
apt-get install -y \
cmake && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
COPY --from=grpc /opt/grpc /usr/local
COPY . /LocalAI
## Otherwise just run the normal build
RUN <<EOT bash
if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
cd /LocalAI/backend/cpp/llama-cpp && make llama-cpp-fallback && \
make llama-cpp-grpc && make llama-cpp-rpc-server; \
else \
cd /LocalAI/backend/cpp/llama-cpp && make llama-cpp-avx && \
make llama-cpp-avx2 && \
make llama-cpp-avx512 && \
make llama-cpp-fallback && \
make llama-cpp-grpc && \
make llama-cpp-rpc-server; \
fi
EOT
# Copy libraries using a script to handle architecture differences
RUN make -C /LocalAI/backend/cpp/llama-cpp package
FROM scratch
# Copy all available binaries (the build process only creates the appropriate ones for the target architecture)
COPY --from=builder /LocalAI/backend/cpp/llama-cpp/package/. ./
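As a sketch, this Dockerfile can also be driven directly with docker build. The build arguments mirror the ARGs declared above; the concrete values (cublas, CUDA 12.0) are illustrative assumptions rather than project defaults:
docker build -f backend/Dockerfile.llama-cpp \
  --build-arg BUILD_TYPE=cublas \
  --build-arg CUDA_MAJOR_VERSION=12 --build-arg CUDA_MINOR_VERSION=0 \
  --build-arg SKIP_DRIVERS=false \
  -t local-ai-backend:llama-cpp .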


@@ -0,0 +1,165 @@
LLAMA_VERSION?=e75ba4c0434eb759eb7ff74e034ebe729053e575
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
CMAKE_ARGS?=
BUILD_TYPE?=
NATIVE?=false
ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
TARGET?=--target grpc-server
# Disable shared libs as we are linking against static gRPC and we can't mix shared and static
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF
endif
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DGGML_CUDA=ON
# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
# to CMAKE_ARGS automatically
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
else ifeq ($(BUILD_TYPE),hipblas)
ROCM_HOME ?= /opt/rocm
ROCM_PATH ?= /opt/rocm
export CXX=$(ROCM_HOME)/llvm/bin/clang++
export CC=$(ROCM_HOME)/llvm/bin/clang
# GPU_TARGETS ?= gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102
# AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
CMAKE_ARGS+=-DGGML_HIP=ON
# CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
else ifeq ($(OS),Darwin)
ifeq ($(BUILD_TYPE),)
BUILD_TYPE=metal
endif
ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DGGML_METAL=OFF
else
CMAKE_ARGS+=-DGGML_METAL=ON
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
CMAKE_ARGS+=-DGGML_OPENMP=OFF
endif
TARGET+=--target ggml-metal
endif
ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl" \
-DGGML_SYCL_F16=ON
endif
ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl"
endif
INSTALLED_PACKAGES=$(CURDIR)/../grpc/installed_packages
INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
-DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \
-Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
-DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
-DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
build-llama-cpp-grpc-server:
# Conditionally build grpc for the llama backend to use if needed
ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
$(MAKE) -C ../../grpc build
_PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto \
_GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin \
PATH="${INSTALLED_PACKAGES}/bin:${PATH}" \
CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" \
LLAMA_VERSION=$(LLAMA_VERSION) \
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(VARIANT) grpc-server
else
echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
LLAMA_VERSION=$(LLAMA_VERSION) $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(VARIANT) grpc-server
endif
llama-cpp-avx2: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build purge
$(info ${GREEN}I llama-cpp build info:avx2${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx2-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build/grpc-server llama-cpp-avx2
llama-cpp-avx512: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build purge
$(info ${GREEN}I llama-cpp build info:avx512${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx512-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build/grpc-server llama-cpp-avx512
llama-cpp-avx: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build purge
$(info ${GREEN}I llama-cpp build info:avx${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-cpp-avx-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build/grpc-server llama-cpp-avx
llama-cpp-fallback: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build purge
$(info ${GREEN}I llama-cpp build info:fallback${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback
llama-cpp-grpc: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
$(info ${GREEN}I llama-cpp build info:grpc${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/grpc-server llama-cpp-grpc
llama-cpp-rpc-server: llama-cpp-grpc
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server
llama.cpp:
mkdir -p llama.cpp
cd llama.cpp && \
git init && \
git remote add origin $(LLAMA_REPO) && \
git fetch origin && \
git checkout -b build $(LLAMA_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
llama.cpp/tools/grpc-server: llama.cpp
mkdir -p llama.cpp/tools/grpc-server
bash prepare.sh
rebuild:
bash prepare.sh
rm -rf grpc-server
$(MAKE) grpc-server
package:
bash package.sh
purge:
rm -rf llama.cpp/build
rm -rf llama.cpp/tools/grpc-server
rm -rf grpc-server
clean: purge
rm -rf llama.cpp
grpc-server: llama.cpp llama.cpp/tools/grpc-server
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \
cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)"
else
+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
endif
cp llama.cpp/build/bin/grpc-server .
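A usage sketch for this Makefile, run from backend/cpp/llama-cpp (the BUILD_TYPE value follows the hipblas branch used by the Dockerfile above):
cd backend/cpp/llama-cpp
make llama-cpp-avx2                          # CPU-only build restricted to AVX2
BUILD_TYPE=hipblas make llama-cpp-fallback   # ROCm build with the x86 SIMD flags disabled
make package                                 # stage binaries, run.sh and libraries via package.sh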


@@ -0,0 +1,42 @@
#!/bin/bash
# Script to copy the appropriate libraries based on architecture
# This script is used in the final stage of the Dockerfile
set -e
CURDIR=$(dirname "$(realpath $0)")
# Create lib directory
mkdir -p $CURDIR/package/lib
cp -avrf $CURDIR/llama-cpp-* $CURDIR/package/
cp -rfv $CURDIR/run.sh $CURDIR/package/
# Detect architecture and copy appropriate libraries
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
# x86_64 architecture
echo "Detected x86_64 architecture, copying x86_64 libraries..."
cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so
cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
# ARM64 architecture
echo "Detected ARM64 architecture, copying ARM64 libraries..."
cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so
cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
else
echo "Error: Could not detect architecture"
exit 1
fi
echo "Packaging completed successfully"
ls -liah $CURDIR/package/
ls -liah $CURDIR/package/lib/
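A sketch of what running the script produces, assuming the variants were built beforehand:
bash package.sh   # equivalent to "make package"; expects the llama-cpp-* binaries next to the script
ls package/       # run.sh plus whichever llama-cpp-* variants were built
ls package/lib/   # ld.so, libc.so.6, libstdc++.so.6, libgcc_s.so.1, libm.so.6, libgomp.so.1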

backend/cpp/llama-cpp/run.sh Executable file

@@ -0,0 +1,61 @@
#!/bin/bash
set -ex
# Get the absolute current dir where the script is located
CURDIR=$(dirname "$(realpath $0)")
cd /
echo "CPU info:"
grep -e "model\sname" /proc/cpuinfo | head -1
grep -e "flags" /proc/cpuinfo | head -1
BINARY=llama-cpp-fallback
if grep -q -e "\savx\s" /proc/cpuinfo ; then
echo "CPU: AVX found OK"
if [ -e $CURDIR/llama-cpp-avx ]; then
BINARY=llama-cpp-avx
fi
fi
if grep -q -e "\savx2\s" /proc/cpuinfo ; then
echo "CPU: AVX2 found OK"
if [ -e $CURDIR/llama-cpp-avx2 ]; then
BINARY=llama-cpp-avx2
fi
fi
# Check avx 512
if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
echo "CPU: AVX512F found OK"
if [ -e $CURDIR/llama-cpp-avx512 ]; then
BINARY=llama-cpp-avx512
fi
fi
if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
if [ -e $CURDIR/llama-cpp-grpc ]; then
BINARY=llama-cpp-grpc
fi
fi
# Extend the dynamic library search path with the lib/ directory next to this script
if [ "$(uname)" == "Darwin" ]; then
export DYLD_FALLBACK_LIBRARY_PATH=$CURDIR/lib:$DYLD_FALLBACK_LIBRARY_PATH
else
export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
fi
# If there is a lib/ld.so, use it
if [ -f $CURDIR/lib/ld.so ]; then
echo "Using lib/ld.so"
echo "Using binary: $BINARY"
$CURDIR/lib/ld.so $CURDIR/$BINARY "$@"
fi
echo "Using binary: $BINARY"
exec $CURDIR/$BINARY "$@"
# In case we fail execing, just run fallback
exec $CURDIR/llama-cpp-fallback "$@"
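A usage sketch for this entrypoint; arguments are forwarded unchanged to the selected binary, and the placeholder stands in for whatever flags grpc-server accepts:
./run.sh <grpc-server-args>                                    # picks the best CPU variant automatically
LLAMACPP_GRPC_SERVERS="host:port" ./run.sh <grpc-server-args>  # prefers the RPC-aware llama-cpp-grpc build if present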


@@ -1,87 +0,0 @@
LLAMA_VERSION?=
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
CMAKE_ARGS?=
BUILD_TYPE?=
ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
TARGET?=--target grpc-server
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DGGML_CUDA=ON
# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
# to CMAKE_ARGS automatically
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DGGML_HIP=ON
# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
# But if it's OSX without metal, disable it here
else ifeq ($(OS),Darwin)
ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DGGML_METAL=OFF
else
CMAKE_ARGS+=-DGGML_METAL=ON
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
TARGET+=--target ggml-metal
endif
endif
ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl" \
-DGGML_SYCL_F16=ON
endif
ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl"
endif
llama.cpp:
mkdir -p llama.cpp
cd llama.cpp && \
git init && \
git remote add origin $(LLAMA_REPO) && \
git fetch origin && \
git checkout -b build $(LLAMA_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
llama.cpp/tools/grpc-server: llama.cpp
mkdir -p llama.cpp/tools/grpc-server
bash prepare.sh
rebuild:
bash prepare.sh
rm -rf grpc-server
$(MAKE) grpc-server
purge:
rm -rf llama.cpp/build
rm -rf llama.cpp/tools/grpc-server
rm -rf grpc-server
clean: purge
rm -rf llama.cpp
grpc-server: llama.cpp llama.cpp/tools/grpc-server
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \
cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)"
else
+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
endif
cp llama.cpp/build/bin/grpc-server .


@@ -3,23 +3,46 @@ LIBRARY_PATH := $(abspath ./)
AR?=ar
CMAKE_ARGS?=-DGGML_NATIVE=OFF
BUILD_TYPE?=
GOCMD=go
# keep the C++ standard at C++17 (matching the -std flag below)
CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../sources/bark.cpp/examples -I$(INCLUDE_PATH)/../../../sources/bark.cpp/spm-headers -I$(INCLUDE_PATH)/../../../sources/bark.cpp -O3 -DNDEBUG -std=c++17 -fPIC
LDFLAGS = -L$(LIBRARY_PATH) -L$(LIBRARY_PATH)/../../../sources/bark.cpp/build/examples -lbark -lstdc++ -lm
CXXFLAGS = -I. -I$(INCLUDE_PATH)/sources/bark.cpp/examples -I$(INCLUDE_PATH)/sources/bark.cpp/encodec.cpp/ggml/include -I$(INCLUDE_PATH)/sources/bark.cpp/spm-headers -I$(INCLUDE_PATH)/sources/bark.cpp -O3 -DNDEBUG -std=c++17 -fPIC
LDFLAGS = -L$(LIBRARY_PATH) -L$(LIBRARY_PATH)/sources/bark.cpp/build/examples -lbark -lstdc++ -lm
# bark.cpp
BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
BARKCPP_VERSION?=5d5be84f089ab9ea53b7a793f088d3fbf7247495
# warnings
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
## bark.cpp
sources/bark.cpp:
git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
cd sources/bark.cpp && \
git checkout $(BARKCPP_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/bark.cpp/build/libbark.a: sources/bark.cpp
cd sources/bark.cpp && \
mkdir -p build && \
cd build && \
cmake $(CMAKE_ARGS) .. && \
cmake --build . --config Release
gobark.o:
$(CXX) $(CXXFLAGS) gobark.cpp -o gobark.o -c $(LDFLAGS)
libbark.a: gobark.o
cp $(INCLUDE_PATH)/../../../sources/bark.cpp/build/libbark.a ./
libbark.a: sources/bark.cpp/build/libbark.a gobark.o
cp $(INCLUDE_PATH)/sources/bark.cpp/build/libbark.a ./
$(AR) rcs libbark.a gobark.o
$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml.c.o
$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-alloc.c.o
$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-backend.c.o
bark-cpp: libbark.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH="$(CURDIR)" LIBRARY_PATH=$(CURDIR) \
$(GOCMD) build -v -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o bark-cpp ./
build: bark-cpp
clean:
rm -f gobark.o libbark.a
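With bark.cpp now cloned and built under sources/ inside the backend directory instead of the repository root, a standalone build is, as a sketch:
make -C backend/go/bark-cpp build   # clones bark.cpp, builds libbark.a, then links the bark-cpp gRPC backend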


@@ -1,7 +1,7 @@
package main
// #cgo CXXFLAGS: -I${SRCDIR}/../../../sources/bark.cpp/ -I${SRCDIR}/../../../sources/bark.cpp/encodec.cpp -I${SRCDIR}/../../../sources/bark.cpp/examples -I${SRCDIR}/../../../sources/bark.cpp/spm-headers
// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/../../../sources/bark.cpp/build/examples -L${SRCDIR}/../../../sources/bark.cpp/build/encodec.cpp/ -lbark -lencodec -lcommon
// #cgo CXXFLAGS: -I${SRCDIR}/sources/bark.cpp/ -I${SRCDIR}/sources/bark.cpp/encodec.cpp -I${SRCDIR}/sources/bark.cpp/encodec.cpp/ggml/include -I${SRCDIR}/sources/bark.cpp/examples -I${SRCDIR}/sources/bark.cpp/spm-headers
// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/sources/bark.cpp/build/examples -L${SRCDIR}/sources/bark.cpp/build/encodec.cpp/ggml/src/ -L${SRCDIR}/sources/bark.cpp/build/encodec.cpp/ -lbark -lencodec -lcommon -lggml -lgomp
// #include <gobark.h>
// #include <stdlib.h>
import "C"


@@ -1,3 +1,6 @@
#!/bin/bash
set -ex
exec ./bark-cpp
CURDIR=$(dirname "$(realpath $0)")
exec $CURDIR/bark-cpp "$@"


@@ -1,5 +1,30 @@
---
## vLLM
## metas
- &llamacpp
name: "llama-cpp"
alias: "llama-cpp"
license: mit
icon: https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png
description: |
LLM inference in C/C++
urls:
- https://github.com/ggerganov/llama.cpp
tags:
- text-to-text
- LLM
- CPU
- GPU
- Metal
- CUDA
- HIP
capabilities:
default: "cpu-llama-cpp"
nvidia: "cuda12-llama-cpp"
intel: "intel-sycl-f16-llama-cpp"
amd: "rocm-llama-cpp"
metal: "metal-llama-cpp"
nvidia-l4t: "nvidia-l4t-arm64-llama-cpp"
darwin-x86: "darwin-x86-llama-cpp"
- &vllm
name: "vllm"
license: apache-2.0
@@ -32,6 +57,229 @@
nvidia: "cuda12-vllm"
amd: "rocm-vllm"
intel: "intel-sycl-f16-vllm"
- &rerankers
name: "rerankers"
alias: "rerankers"
capabilities:
nvidia: "cuda12-rerankers"
intel: "intel-sycl-f16-rerankers"
amd: "rocm-rerankers"
- &transformers
name: "transformers"
icon: https://camo.githubusercontent.com/26569a27b8a30a488dd345024b71dbc05da7ff1b2ba97bb6080c9f1ee0f26cc7/68747470733a2f2f68756767696e67666163652e636f2f64617461736574732f68756767696e67666163652f646f63756d656e746174696f6e2d696d616765732f7265736f6c76652f6d61696e2f7472616e73666f726d6572732f7472616e73666f726d6572735f61735f615f6d6f64656c5f646566696e6974696f6e2e706e67
alias: "transformers"
license: apache-2.0
description: |
Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer vision, audio, video, and multimodal models, for both inference and training.
It centralizes the model definition so that this definition is agreed upon across the ecosystem. transformers is the pivot across frameworks: if a model definition is supported, it will be compatible with the majority of training frameworks (Axolotl, Unsloth, DeepSpeed, FSDP, PyTorch-Lightning, ...), inference engines (vLLM, SGLang, TGI, ...), and adjacent modeling libraries (llama.cpp, mlx, ...) which leverage the model definition from transformers.
urls:
- https://github.com/huggingface/transformers
tags:
- text-to-text
- multimodal
capabilities:
nvidia: "cuda12-transformers"
intel: "intel-sycl-f16-transformers"
amd: "rocm-transformers"
- &diffusers
icon: https://raw.githubusercontent.com/huggingface/diffusers/main/docs/source/en/imgs/diffusers_library.jpg
description: |
🤗 Diffusers is the go-to library for state-of-the-art pretrained diffusion models for generating images, audio, and even 3D structures of molecules. Whether you're looking for a simple inference solution or training your own diffusion models, 🤗 Diffusers is a modular toolbox that supports both.
urls:
- https://github.com/huggingface/diffusers
tags:
- image-generation
- video-generation
- diffusion-models
license: apache-2.0
alias: "diffusers"
capabilities:
nvidia: "cuda12-diffusers"
intel: "intel-sycl-f32-diffusers"
amd: "rocm-diffusers"
- &exllama2
name: "exllama2"
urls:
- https://github.com/turboderp-org/exllamav2
tags:
- text-to-text
- LLM
- EXL2
license: MIT
description: |
ExLlamaV2 is an inference library for running local LLMs on modern consumer GPUs.
alias: "exllama2"
capabilities:
nvidia: "cuda12-exllama2"
intel: "intel-sycl-f32-exllama2"
amd: "rocm-exllama2"
- &faster-whisper
icon: https://avatars.githubusercontent.com/u/1520500?s=200&v=4
description: |
faster-whisper is a reimplementation of OpenAI's Whisper model using CTranslate2, which is a fast inference engine for Transformer models.
This implementation is up to 4 times faster than openai/whisper for the same accuracy while using less memory. The efficiency can be further improved with 8-bit quantization on both CPU and GPU.
urls:
- https://github.com/SYSTRAN/faster-whisper
tags:
- speech-to-text
- Whisper
license: MIT
name: "faster-whisper"
capabilities:
nvidia: "cuda12-faster-whisper"
intel: "intel-sycl-f32-faster-whisper"
amd: "rocm-faster-whisper"
- &kokoro
icon: https://avatars.githubusercontent.com/u/166769057?v=4
description: |
Kokoro is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, Kokoro can be deployed anywhere from production environments to personal projects.
urls:
- https://huggingface.co/hexgrad/Kokoro-82M
- https://github.com/hexgrad/kokoro
tags:
- text-to-speech
- TTS
- LLM
license: apache-2.0
alias: "kokoro"
name: "kokoro"
capabilities:
nvidia: "cuda12-kokoro"
intel: "intel-sycl-f32-kokoro"
amd: "rocm-kokoro"
- &coqui
urls:
- https://github.com/idiap/coqui-ai-TTS
description: |
🐸 Coqui TTS is a library for advanced Text-to-Speech generation.
🚀 Pretrained models in +1100 languages.
🛠️ Tools for training new models and fine-tuning existing models in any language.
📚 Utilities for dataset analysis and curation.
tags:
- text-to-speech
- TTS
license: mpl-2.0
name: "coqui"
alias: "coqui"
capabilities:
nvidia: "cuda12-coqui"
intel: "intel-sycl-f32-coqui"
amd: "rocm-coqui"
icon: https://avatars.githubusercontent.com/u/1338804?s=200&v=4
- &bark
urls:
- https://github.com/suno-ai/bark
description: |
Bark is a transformer-based text-to-audio model created by Suno. Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. The model can also produce nonverbal communications like laughing, sighing and crying. To support the research community, we are providing access to pretrained model checkpoints, which are ready for inference and available for commercial use.
tags:
- text-to-speech
- TTS
license: MIT
name: "bark"
alias: "bark"
capabilities:
cuda: "cuda12-bark"
intel: "intel-sycl-f32-bark"
rocm: "rocm-bark"
icon: https://avatars.githubusercontent.com/u/99442120?s=200&v=4
- &barkcpp
urls:
- https://github.com/PABannier/bark.cpp
description: |
With bark.cpp, our goal is to bring real-time realistic multilingual text-to-speech generation to the community.
Plain C/C++ implementation without dependencies
AVX, AVX2 and AVX512 for x86 architectures
CPU and GPU compatible backends
Mixed F16 / F32 precision
4-bit, 5-bit and 8-bit integer quantization
Metal and CUDA backends
Models supported
Bark Small
Bark Large
tags:
- text-to-speech
- TTS
license: MIT
icon: https://github.com/PABannier/bark.cpp/raw/main/assets/banner.png
name: "bark-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-bark-cpp"
alias: "bark-cpp"
- &chatterbox
urls:
- https://github.com/resemble-ai/chatterbox
description: |
Resemble AI's first production-grade open source TTS model. Licensed under MIT, Chatterbox has been benchmarked against leading closed-source systems like ElevenLabs, and is consistently preferred in side-by-side evaluations.
Whether you're working on memes, videos, games, or AI agents, Chatterbox brings your content to life. It's also the first open source TTS model to support emotion exaggeration control, a powerful feature that makes your voices stand out.
tags:
- text-to-speech
- TTS
license: MIT
icon: https://private-user-images.githubusercontent.com/660224/448166653-bd8c5f03-e91d-4ee5-b680-57355da204d1.png?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3NTAxOTE0MDAsIm5iZiI6MTc1MDE5MTEwMCwicGF0aCI6Ii82NjAyMjQvNDQ4MTY2NjUzLWJkOGM1ZjAzLWU5MWQtNGVlNS1iNjgwLTU3MzU1ZGEyMDRkMS5wbmc_WC1BbXotQWxnb3JpdGhtPUFXUzQtSE1BQy1TSEEyNTYmWC1BbXotQ3JlZGVudGlhbD1BS0lBVkNPRFlMU0E1M1BRSzRaQSUyRjIwMjUwNjE3JTJGdXMtZWFzdC0xJTJGczMlMkZhd3M0X3JlcXVlc3QmWC1BbXotRGF0ZT0yMDI1MDYxN1QyMDExNDBaJlgtQW16LUV4cGlyZXM9MzAwJlgtQW16LVNpZ25hdHVyZT1hMmI1NGY3OGFiZTlhNGFkNTVlYTY4NTIwMWEzODRiZGE4YzdhNGQ5MGNhNzE3MDYyYTA2NDIxYTkyYzhiODkwJlgtQW16LVNpZ25lZEhlYWRlcnM9aG9zdCJ9.mR9kM9xX0TdzPuSpuspCllHYQiq79dFQ2rtuNvjrl6w
name: "chatterbox"
capabilities:
nvidia: "cuda12-chatterbox"
## llama-cpp
- !!merge <<: *llamacpp
name: "darwin-x86-llama-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-darwin-x86-llama-cpp"
- !!merge <<: *llamacpp
name: "darwin-x86-llama-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-darwin-x86-llama-cpp"
- !!merge <<: *llamacpp
name: "nvidia-l4t-arm64-llama-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-llama-cpp"
- !!merge <<: *llamacpp
name: "nvidia-l4t-arm64-llama-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-llama-cpp"
- !!merge <<: *llamacpp
name: "cpu-llama-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-llama-cpp"
- !!merge <<: *llamacpp
name: "cpu-llama-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-llama-cpp"
- !!merge <<: *llamacpp
name: "cuda11-llama-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-11-llama-cpp"
- !!merge <<: *llamacpp
name: "cuda12-llama-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-llama-cpp"
- !!merge <<: *llamacpp
name: "rocm-llama-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-llama-cpp"
- !!merge <<: *llamacpp
name: "intel-sycl-f32-llama-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-llama-cpp"
- !!merge <<: *llamacpp
name: "intel-sycl-f16-llama-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-llama-cpp"
- !!merge <<: *llamacpp
name: "metal-llama-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-llama-cpp"
- !!merge <<: *llamacpp
name: "metal-llama-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-llama-cpp"
- !!merge <<: *llamacpp
name: "cuda11-llama-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-11-llama-cpp"
- !!merge <<: *llamacpp
name: "cuda12-llama-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-llama-cpp"
- !!merge <<: *llamacpp
name: "rocm-llama-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-llama-cpp"
- !!merge <<: *llamacpp
name: "intel-sycl-f32-llama-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-llama-cpp"
- !!merge <<: *llamacpp
name: "intel-sycl-f16-llama-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-llama-cpp"
# vllm
- !!merge <<: *vllm
name: "vllm-development"
capabilities:
@@ -69,13 +317,6 @@
name: "intel-sycl-f16-vllm-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-vllm"
## Rerankers
- &rerankers
name: "rerankers"
alias: "rerankers"
capabilities:
nvidia: "cuda12-rerankers"
intel: "intel-sycl-f16-rerankers"
amd: "rocm-rerankers"
- !!merge <<: *rerankers
name: "rerankers-development"
capabilities:
@@ -113,23 +354,6 @@
name: "intel-sycl-f16-rerankers-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-rerankers"
## Transformers
- &transformers
name: "transformers"
icon: https://camo.githubusercontent.com/26569a27b8a30a488dd345024b71dbc05da7ff1b2ba97bb6080c9f1ee0f26cc7/68747470733a2f2f68756767696e67666163652e636f2f64617461736574732f68756767696e67666163652f646f63756d656e746174696f6e2d696d616765732f7265736f6c76652f6d61696e2f7472616e73666f726d6572732f7472616e73666f726d6572735f61735f615f6d6f64656c5f646566696e6974696f6e2e706e67
alias: "transformers"
license: apache-2.0
description: |
Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer vision, audio, video, and multimodal model, for both inference and training.
It centralizes the model definition so that this definition is agreed upon across the ecosystem. transformers is the pivot across frameworks: if a model definition is supported, it will be compatible with the majority of training frameworks (Axolotl, Unsloth, DeepSpeed, FSDP, PyTorch-Lightning, ...), inference engines (vLLM, SGLang, TGI, ...), and adjacent modeling libraries (llama.cpp, mlx, ...) which leverage the model definition from transformers.
urls:
- https://github.com/huggingface/transformers
tags:
- text-to-text
- multimodal
capabilities:
nvidia: "cuda12-transformers"
intel: "intel-sycl-f16-transformers"
amd: "rocm-transformers"
- !!merge <<: *transformers
name: "transformers-development"
capabilities:
@@ -167,22 +391,6 @@
name: "intel-sycl-f16-transformers-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-transformers"
## Diffusers
- &diffusers
icon: https://raw.githubusercontent.com/huggingface/diffusers/main/docs/source/en/imgs/diffusers_library.jpg
description: |
🤗 Diffusers is the go-to library for state-of-the-art pretrained diffusion models for generating images, audio, and even 3D structures of molecules. Whether you're looking for a simple inference solution or training your own diffusion models, 🤗 Diffusers is a modular toolbox that supports both.
urls:
- https://github.com/huggingface/diffusers
tags:
- image-generation
- video-generation
- diffusion-models
license: apache-2.0
alias: "diffusers"
capabilities:
nvidia: "cuda12-diffusers"
intel: "intel-sycl-f32-diffusers"
amd: "rocm-diffusers"
- !!merge <<: *diffusers
name: "diffusers-development"
capabilities:
@@ -214,22 +422,6 @@
name: "intel-sycl-f32-diffusers-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-diffusers"
## exllama2
- &exllama2
name: "exllama2"
urls:
- https://github.com/turboderp-org/exllamav2
tags:
- text-to-text
- LLM
- EXL2
license: MIT
description: |
ExLlamaV2 is an inference library for running local LLMs on modern consumer GPUs.
alias: "exllama2"
capabilities:
nvidia: "cuda12-exllama2"
intel: "intel-sycl-f32-exllama2"
amd: "rocm-exllama2"
- !!merge <<: *exllama2
name: "exllama2-development"
capabilities:
@@ -249,24 +441,6 @@
name: "cuda12-exllama2-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-exllama2"
## kokoro
- &kokoro
icon: https://avatars.githubusercontent.com/u/166769057?v=4
description: |
Kokoro is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, Kokoro can be deployed anywhere from production environments to personal projects.
urls:
- https://huggingface.co/hexgrad/Kokoro-82M
- https://github.com/hexgrad/kokoro
tags:
- text-to-speech
- TTS
- LLM
license: apache-2.0
alias: "kokoro"
name: "kokoro"
capabilities:
nvidia: "cuda12-kokoro"
intel: "intel-sycl-f32-kokoro"
amd: "rocm-kokoro"
- !!merge <<: *kokoro
name: "kokoro-development"
capabilities:
@@ -304,22 +478,6 @@
name: "rocm-kokoro"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-kokoro"
## faster-whisper
- &faster-whisper
icon: https://avatars.githubusercontent.com/u/1520500?s=200&v=4
description: |
faster-whisper is a reimplementation of OpenAI's Whisper model using CTranslate2, which is a fast inference engine for Transformer models.
This implementation is up to 4 times faster than openai/whisper for the same accuracy while using less memory. The efficiency can be further improved with 8-bit quantization on both CPU and GPU.
urls:
- https://github.com/SYSTRAN/faster-whisper
tags:
- speech-to-text
- Whisper
license: MIT
name: "faster-whisper"
capabilities:
nvidia: "cuda12-faster-whisper"
intel: "intel-sycl-f32-faster-whisper"
amd: "rocm-faster-whisper"
- !!merge <<: *faster-whisper
name: "faster-whisper-development"
capabilities:
@@ -348,28 +506,7 @@
name: "sycl-f16-faster-whisper-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-faster-whisper"
## coqui
- &coqui
urls:
- https://github.com/idiap/coqui-ai-TTS
description: |
🐸 Coqui TTS is a library for advanced Text-to-Speech generation.
🚀 Pretrained models in +1100 languages.
🛠️ Tools for training new models and fine-tuning existing models in any language.
📚 Utilities for dataset analysis and curation.
tags:
- text-to-speech
- TTS
license: mpl-2.0
name: "coqui"
alias: "coqui"
capabilities:
nvidia: "cuda12-coqui"
intel: "intel-sycl-f32-coqui"
amd: "rocm-coqui"
icon: https://avatars.githubusercontent.com/u/1338804?s=200&v=4
- !!merge <<: *coqui
name: "coqui-development"
capabilities:
@@ -407,22 +544,6 @@
name: "rocm-coqui"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-coqui"
## bark
- &bark
urls:
- https://github.com/suno-ai/bark
description: |
Bark is a transformer-based text-to-audio model created by Suno. Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. The model can also produce nonverbal communications like laughing, sighing and crying. To support the research community, we are providing access to pretrained model checkpoints, which are ready for inference and available for commercial use.
tags:
- text-to-speech
- TTS
license: MIT
name: "bark"
alias: "bark"
capabilities:
cuda: "cuda12-bark"
intel: "intel-sycl-f32-bark"
rocm: "rocm-bark"
icon: https://avatars.githubusercontent.com/u/99442120?s=200&v=4
- !!merge <<: *bark
name: "bark-development"
capabilities:
@@ -459,50 +580,11 @@
- !!merge <<: *bark
name: "cuda12-bark-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-bark"
- &barkcpp
urls:
- https://github.com/PABannier/bark.cpp
description: |
With bark.cpp, our goal is to bring real-time realistic multilingual text-to-speech generation to the community.
Plain C/C++ implementation without dependencies
AVX, AVX2 and AVX512 for x86 architectures
CPU and GPU compatible backends
Mixed F16 / F32 precision
4-bit, 5-bit and 8-bit integer quantization
Metal and CUDA backends
Models supported
Bark Small
Bark Large
tags:
- text-to-speech
- TTS
license: MIT
icon: https://github.com/PABannier/bark.cpp/raw/main/assets/banner.png
name: "bark-cpp"
uri: "quay.io/go-skynet/local-ai-backends:latest-bark-cpp"
alias: "bark-cpp"
- !!merge <<: *barkcpp
name: "bark-cpp-development"
uri: "quay.io/go-skynet/local-ai-backends:master-bark-cpp"
alias: "bark-cpp"
## chatterbox
- &chatterbox
urls:
- https://github.com/resemble-ai/chatterbox
description: |
Resemble AI's first production-grade open source TTS model. Licensed under MIT, Chatterbox has been benchmarked against leading closed-source systems like ElevenLabs, and is consistently preferred in side-by-side evaluations.
Whether you're working on memes, videos, games, or AI agents, Chatterbox brings your content to life. It's also the first open source TTS model to support emotion exaggeration control, a powerful feature that makes your voices stand out.
tags:
- text-to-speech
- TTS
license: MIT
icon: https://private-user-images.githubusercontent.com/660224/448166653-bd8c5f03-e91d-4ee5-b680-57355da204d1.png?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3NTAxOTE0MDAsIm5iZiI6MTc1MDE5MTEwMCwicGF0aCI6Ii82NjAyMjQvNDQ4MTY2NjUzLWJkOGM1ZjAzLWU5MWQtNGVlNS1iNjgwLTU3MzU1ZGEyMDRkMS5wbmc_WC1BbXotQWxnb3JpdGhtPUFXUzQtSE1BQy1TSEEyNTYmWC1BbXotQ3JlZGVudGlhbD1BS0lBVkNPRFlMU0E1M1BRSzRaQSUyRjIwMjUwNjE3JTJGdXMtZWFzdC0xJTJGczMlMkZhd3M0X3JlcXVlc3QmWC1BbXotRGF0ZT0yMDI1MDYxN1QyMDExNDBaJlgtQW16LUV4cGlyZXM9MzAwJlgtQW16LVNpZ25hdHVyZT1hMmI1NGY3OGFiZTlhNGFkNTVlYTY4NTIwMWEzODRiZGE4YzdhNGQ5MGNhNzE3MDYyYTA2NDIxYTkyYzhiODkwJlgtQW16LVNpZ25lZEhlYWRlcnM9aG9zdCJ9.mR9kM9xX0TdzPuSpuspCllHYQiq79dFQ2rtuNvjrl6w
name: "chatterbox"
capabilities:
nvidia: "cuda12-chatterbox"
- !!merge <<: *chatterbox
name: "chatterbox-development"
capabilities:


@@ -3,10 +3,12 @@ package worker
import (
"fmt"
"os"
"path/filepath"
"strings"
"syscall"
cliContext "github.com/mudler/LocalAI/core/cli/context"
"github.com/mudler/LocalAI/core/gallery"
"github.com/mudler/LocalAI/pkg/assets"
"github.com/mudler/LocalAI/pkg/library"
"github.com/rs/zerolog/log"
@@ -16,6 +18,34 @@ type LLamaCPP struct {
WorkerFlags `embed:""`
}
func findLLamaCPPBackend(backendSystemPath string) (string, error) {
backends, err := gallery.ListSystemBackends(backendSystemPath)
if err != nil {
log.Warn().Msgf("Failed listing system backends: %s", err)
return "", err
}
log.Debug().Msgf("System backends: %v", backends)
backendPath := ""
for b, path := range backends {
if b == "llama-cpp" {
backendPath = filepath.Dir(path)
break
}
}
if backendPath == "" {
return "", fmt.Errorf("llama-cpp backend not found")
}
grpcProcess := filepath.Join(
backendPath,
"grpc-server",
)
return grpcProcess, nil
}
func (r *LLamaCPP) Run(ctx *cliContext.Context) error {
// Extract files from the embedded FS
err := assets.ExtractFiles(ctx.BackendAssets, r.BackendAssetsPath)
@@ -28,14 +58,14 @@ func (r *LLamaCPP) Run(ctx *cliContext.Context) error {
return fmt.Errorf("usage: local-ai worker llama-cpp-rpc -- <llama-rpc-server-args>")
}
grpcProcess := assets.ResolvePath(
r.BackendAssetsPath,
"util",
"llama-cpp-rpc-server",
)
grpcProcess, err := findLLamaCPPBackend(r.BackendAssetsPath)
if err != nil {
return err
}
args := strings.Split(r.ExtraLLamaCPPArgs, " ")
args, grpcProcess = library.LoadLDSO(r.BackendAssetsPath, args, grpcProcess)
args = append([]string{grpcProcess}, args...)
return syscall.Exec(
grpcProcess,

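The worker command line itself is unchanged; only the binary resolution moves from the bundled backend-assets/util path to the grpc-server inside the installed llama-cpp backend directory. A sketch, assuming the backend is already installed under the configured backends path:
local-ai worker llama-cpp-rpc -- <llama-rpc-server-args>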

@@ -71,11 +71,12 @@ func (r *P2P) Run(ctx *cliContext.Context) error {
for {
log.Info().Msgf("Starting llama-cpp-rpc-server on '%s:%d'", address, port)
grpcProcess := assets.ResolvePath(
r.BackendAssetsPath,
"util",
"llama-cpp-rpc-server",
)
grpcProcess, err := findLLamaCPPBackend(r.BackendAssetsPath)
if err != nil {
log.Error().Err(err).Msg("Failed to find llama-cpp-rpc-server")
return
}
var extraArgs []string
if r.ExtraLLamaCPPArgs != "" {


@@ -1,6 +1,9 @@
package gallery
import "github.com/mudler/LocalAI/core/config"
import (
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/system"
)
// BackendMetadata represents the metadata stored in a JSON file for each installed backend
type BackendMetadata struct {
@@ -23,6 +26,19 @@ type GalleryBackend struct {
CapabilitiesMap map[string]string `json:"capabilities,omitempty" yaml:"capabilities,omitempty"`
}
func (backend *GalleryBackend) FindBestBackendFromMeta(systemState *system.SystemState, backends GalleryElements[*GalleryBackend]) *GalleryBackend {
if systemState == nil {
return nil
}
realBackend := backend.CapabilitiesMap[systemState.Capability(backend.CapabilitiesMap)]
if realBackend == "" {
return nil
}
return backends.FindByName(realBackend)
}
type GalleryBackends []*GalleryBackend
func (m *GalleryBackend) SetGallery(gallery config.Gallery) {


@@ -57,19 +57,6 @@ func writeBackendMetadata(backendPath string, metadata *BackendMetadata) error {
return nil
}
func findBestBackendFromMeta(backend *GalleryBackend, systemState *system.SystemState, backends GalleryElements[*GalleryBackend]) *GalleryBackend {
if systemState == nil {
return nil
}
realBackend := backend.CapabilitiesMap[systemState.Capability()]
if realBackend == "" {
return nil
}
return backends.FindByName(realBackend)
}
// Installs a model from the gallery
func InstallBackendFromGallery(galleries []config.Gallery, systemState *system.SystemState, name string, basePath string, downloadStatus func(string, string, string, float64), force bool) error {
if !force {
@@ -103,7 +90,7 @@ func InstallBackendFromGallery(galleries []config.Gallery, systemState *system.S
log.Debug().Interface("systemState", systemState).Str("name", name).Msg("Backend is a meta backend")
// Then, let's try to find the best backend based on the capabilities map
bestBackend := findBestBackendFromMeta(backend, systemState, backends)
bestBackend := backend.FindBestBackendFromMeta(systemState, backends)
if bestBackend == nil {
return fmt.Errorf("no backend found with capabilities %q", backend.CapabilitiesMap)
}
@@ -283,6 +270,7 @@ func RegisterBackends(basePath string, modelLoader *model.ModelLoader) error {
}
for name, runFile := range backends {
log.Debug().Str("name", name).Str("runFile", runFile).Msg("Registering backend")
modelLoader.SetExternalBackend(name, runFile)
}


@@ -4,6 +4,7 @@ import (
"encoding/json"
"os"
"path/filepath"
"runtime"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/system"
@@ -98,6 +99,7 @@ var _ = Describe("Gallery Backends", func() {
})
It("should find best backend from meta based on system capabilities", func() {
metaBackend := &GalleryBackend{
Metadata: Metadata{
Name: "meta-backend",
@@ -106,6 +108,7 @@ var _ = Describe("Gallery Backends", func() {
"nvidia": "nvidia-backend",
"amd": "amd-backend",
"intel": "intel-backend",
"metal": "metal-backend",
},
}
@@ -123,25 +126,43 @@ var _ = Describe("Gallery Backends", func() {
URI: testImage,
}
backends := GalleryElements[*GalleryBackend]{nvidiaBackend, amdBackend}
metalBackend := &GalleryBackend{
Metadata: Metadata{
Name: "metal-backend",
},
URI: testImage,
}
// Test with NVIDIA system state
nvidiaSystemState := &system.SystemState{GPUVendor: "nvidia"}
bestBackend := findBestBackendFromMeta(metaBackend, nvidiaSystemState, backends)
Expect(bestBackend).To(Equal(nvidiaBackend))
backends := GalleryElements[*GalleryBackend]{nvidiaBackend, amdBackend, metalBackend}
// Test with AMD system state
amdSystemState := &system.SystemState{GPUVendor: "amd"}
bestBackend = findBestBackendFromMeta(metaBackend, amdSystemState, backends)
Expect(bestBackend).To(Equal(amdBackend))
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
metal := &system.SystemState{}
bestBackend := metaBackend.FindBestBackendFromMeta(metal, backends)
Expect(bestBackend).To(Equal(metalBackend))
// Test with unsupported GPU vendor
unsupportedSystemState := &system.SystemState{GPUVendor: "unsupported"}
bestBackend = findBestBackendFromMeta(metaBackend, unsupportedSystemState, backends)
Expect(bestBackend).To(BeNil())
} else {
// Test with NVIDIA system state
nvidiaSystemState := &system.SystemState{GPUVendor: "nvidia"}
bestBackend := metaBackend.FindBestBackendFromMeta(nvidiaSystemState, backends)
Expect(bestBackend).To(Equal(nvidiaBackend))
// Test with AMD system state
amdSystemState := &system.SystemState{GPUVendor: "amd"}
bestBackend = metaBackend.FindBestBackendFromMeta(amdSystemState, backends)
Expect(bestBackend).To(Equal(amdBackend))
// Test with unsupported GPU vendor
unsupportedSystemState := &system.SystemState{GPUVendor: "unsupported"}
bestBackend = metaBackend.FindBestBackendFromMeta(unsupportedSystemState, backends)
Expect(bestBackend).To(BeNil())
}
})
It("should handle meta backend deletion correctly", func() {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
Skip("Skipping test on darwin/arm64")
}
metaBackend := &GalleryBackend{
Metadata: Metadata{
Name: "meta-backend",
@@ -207,6 +228,9 @@ var _ = Describe("Gallery Backends", func() {
})
It("should handle meta backend deletion correctly with aliases", func() {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
Skip("Skipping test on darwin/arm64")
}
metaBackend := &GalleryBackend{
Metadata: Metadata{
Name: "meta-backend",
@@ -276,6 +300,9 @@ var _ = Describe("Gallery Backends", func() {
})
It("should handle meta backend deletion correctly with aliases pointing to the same backend", func() {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
Skip("Skipping test on darwin/arm64")
}
metaBackend := &GalleryBackend{
Metadata: Metadata{
Name: "meta-backend",
@@ -401,6 +428,9 @@ var _ = Describe("Gallery Backends", func() {
})
It("should create alias file when specified", func() {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
Skip("Skipping test on darwin/arm64")
}
backend := GalleryBackend{
Metadata: Metadata{
Name: "test-backend",

View File

@@ -295,6 +295,8 @@ var _ = Describe("API test", func() {
tmpdir, err = os.MkdirTemp("", "")
Expect(err).ToNot(HaveOccurred())
backendPath := os.Getenv("BACKENDS_PATH")
modelDir = filepath.Join(tmpdir, "models")
err = os.Mkdir(modelDir, 0750)
Expect(err).ToNot(HaveOccurred())
@@ -337,6 +339,7 @@ var _ = Describe("API test", func() {
config.WithContext(c),
config.WithGalleries(galleries),
config.WithModelPath(modelDir),
config.WithBackendsPath(backendPath),
config.WithApiKeys([]string{apiKey}),
config.WithBackendAssets(backendAssets),
config.WithBackendAssetsOutput(backendAssetsDir))...)
@@ -517,6 +520,9 @@ var _ = Describe("API test", func() {
BeforeEach(func() {
var err error
tmpdir, err = os.MkdirTemp("", "")
backendPath := os.Getenv("BACKENDS_PATH")
Expect(err).ToNot(HaveOccurred())
modelDir = filepath.Join(tmpdir, "models")
backendAssetsDir := filepath.Join(tmpdir, "backend-assets")
@@ -540,6 +546,7 @@ var _ = Describe("API test", func() {
append(commonOpts,
config.WithContext(c),
config.WithGeneratedContentDir(tmpdir),
config.WithBackendsPath(backendPath),
config.WithGalleries(galleries),
config.WithModelPath(modelDir),
config.WithBackendAssets(backendAssets),
@@ -737,6 +744,7 @@ var _ = Describe("API test", func() {
Context("API query", func() {
BeforeEach(func() {
modelPath := os.Getenv("MODELS_PATH")
backendPath := os.Getenv("BACKENDS_PATH")
c, cancel = context.WithCancel(context.Background())
var err error
@@ -745,6 +753,7 @@ var _ = Describe("API test", func() {
append(commonOpts,
config.WithExternalBackend("transformers", os.Getenv("HUGGINGFACE_GRPC")),
config.WithContext(c),
config.WithBackendsPath(backendPath),
config.WithModelPath(modelPath),
)...)
Expect(err).ToNot(HaveOccurred())
@@ -956,6 +965,7 @@ var _ = Describe("API test", func() {
Context("Config file", func() {
BeforeEach(func() {
modelPath := os.Getenv("MODELS_PATH")
backendPath := os.Getenv("BACKENDS_PATH")
c, cancel = context.WithCancel(context.Background())
var err error
@@ -963,6 +973,7 @@ var _ = Describe("API test", func() {
append(commonOpts,
config.WithContext(c),
config.WithModelPath(modelPath),
config.WithBackendsPath(backendPath),
config.WithConfigFile(os.Getenv("CONFIG_FILE")))...,
)
Expect(err).ToNot(HaveOccurred())

View File

@@ -2,6 +2,7 @@ package system
import (
"os"
"runtime"
"strings"
"github.com/mudler/LocalAI/pkg/xsysinfo"
@@ -12,7 +13,26 @@ type SystemState struct {
GPUVendor string
}
func (s *SystemState) Capability() string {
const (
defaultCapability = "default"
nvidiaL4T = "nvidia-l4t"
darwinX86 = "darwin-x86"
metal = "metal"
)
func (s *SystemState) Capability(capMap map[string]string) string {
reportedCapability := s.getSystemCapabilities()
// Check if the reported capability is in the map
if _, exists := capMap[reportedCapability]; exists {
return reportedCapability
}
// Otherwise, return the default capability (catch-all)
return defaultCapability
}
func (s *SystemState) getSystemCapabilities() string {
if os.Getenv("LOCALAI_FORCE_META_BACKEND_CAPABILITY") != "" {
return os.Getenv("LOCALAI_FORCE_META_BACKEND_CAPABILITY")
}
@@ -32,6 +52,27 @@ func (s *SystemState) Capability() string {
}
}
// If we are on mac and arm64, we will return metal
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
return metal
}
// If we are on mac and x86, we will return darwin-x86
if runtime.GOOS == "darwin" && runtime.GOARCH == "amd64" {
return darwinX86
}
// If we are on linux/arm64 and an NVIDIA GPU is detected, we will return nvidia-l4t
if runtime.GOOS == "linux" && runtime.GOARCH == "arm64" {
if s.GPUVendor == "nvidia" {
return nvidiaL4T
}
}
if s.GPUVendor == "" {
return defaultCapability
}
return s.GPUVendor
}
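To make the catch-all behaviour concrete, here is a small self-contained sketch; the capability function below is a simplified stand-in for SystemState.Capability, and the backend names in the map are purely illustrative:

package main

import "fmt"

const defaultCapability = "default"

// capability trusts the reported capability only if the meta backend's
// capabilities map actually has an entry for it; otherwise it falls back
// to the "default" (catch-all) entry.
func capability(reported string, capMap map[string]string) string {
    if _, exists := capMap[reported]; exists {
        return reported
    }
    return defaultCapability
}

func main() {
    // Illustrative capability -> concrete backend mapping.
    capMap := map[string]string{
        "nvidia":  "nvidia-backend",
        "metal":   "metal-backend",
        "default": "cpu-backend",
    }
    fmt.Println(capability("nvidia", capMap))      // nvidia
    fmt.Println(capability("unsupported", capMap)) // default
}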

4 go.sum
View File

@@ -177,6 +177,8 @@ github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7z
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA=
github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20240626202019-c118733a29ad h1:dQ93Vd6i25o+zH9vvnZ8mu7jtJQ6jT3D+zE3V8Q49n0=
github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20240626202019-c118733a29ad/go.mod h1:QIjZ9OktHFG7p+/m3sMvrAJKKdWrr1fZIK0rM6HZlyo=
github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk=
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
github.com/gliderlabs/ssh v0.1.1/go.mod h1:U7qILu1NlMHj9FlMhZLlkCdDnU1DBEAqr0aevW3Awn0=
@@ -505,6 +507,8 @@ github.com/mr-tron/base58 v1.2.0 h1:T/HDJBh4ZCPbU39/+c3rRvE0uKBQlU27+QI8LJ4t64o=
github.com/mr-tron/base58 v1.2.0/go.mod h1:BinMc/sQntlIE1frQmRFPUoPA1Zkr8VRgBdjWI2mNwc=
github.com/mudler/edgevpn v0.30.1 h1:4yyhNFJX62NpRp50sxiyZE5E/sdAqEZX+aE5Mv7QS60=
github.com/mudler/edgevpn v0.30.1/go.mod h1:IAJkkJ0oH3rwsSGOGTFT4UBYFqYuD/QyaKzTLB3P/eU=
github.com/mudler/go-piper v0.0.0-20241023091659-2494246fd9fc h1:RxwneJl1VgvikiX28EkpdAyL4yQVnJMrbquKospjHyA=
github.com/mudler/go-piper v0.0.0-20241023091659-2494246fd9fc/go.mod h1:O7SwdSWMilAWhBZMK9N9Y/oBDyMMzshE3ju8Xkexwig=
github.com/mudler/go-processmanager v0.0.0-20240820160718-8b802d3ecf82 h1:FVT07EI8njvsD4tC2Hw8Xhactp5AWhsQWD4oTeQuSAU=
github.com/mudler/go-processmanager v0.0.0-20240820160718-8b802d3ecf82/go.mod h1:Urp7LG5jylKoDq0663qeBh0pINGcRl35nXdKx82PSoU=
github.com/mudler/water v0.0.0-20221010214108-8c7313014ce0 h1:Qh6ghkMgTu6siFbTf7L3IszJmshMhXxNL4V+t7IIA6w=

View File

@@ -10,17 +10,19 @@ import (
"strings"
"time"
"github.com/klauspost/cpuid/v2"
grpc "github.com/mudler/LocalAI/pkg/grpc"
"github.com/mudler/LocalAI/pkg/library"
"github.com/mudler/LocalAI/pkg/utils"
"github.com/mudler/LocalAI/pkg/xsysinfo"
"github.com/phayes/freeport"
"github.com/rs/zerolog/log"
"github.com/elliotchance/orderedmap/v2"
)
const (
LLamaCPP = "llama-cpp"
)
var Aliases map[string]string = map[string]string{
"go-llama": LLamaCPP,
"llama": LLamaCPP,
@@ -40,22 +42,7 @@ var TypeAlias map[string]string = map[string]string{
"transformers-musicgen": "MusicgenForConditionalGeneration",
}
var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
const (
LLamaCPP = "llama-cpp"
LLamaCPPAVX2 = "llama-cpp-avx2"
LLamaCPPAVX512 = "llama-cpp-avx512"
LLamaCPPAVX = "llama-cpp-avx"
LLamaCPPFallback = "llama-cpp-fallback"
LLamaCPPCUDA = "llama-cpp-cuda"
LLamaCPPHipblas = "llama-cpp-hipblas"
LLamaCPPSycl16 = "llama-cpp-sycl_16"
LLamaCPPSycl32 = "llama-cpp-sycl_32"
LLamaCPPGRPC = "llama-cpp-grpc"
WhisperBackend = "whisper"
StableDiffusionGGMLBackend = "stablediffusion-ggml"
PiperBackend = "piper"
@@ -65,18 +52,6 @@ const (
LocalStoreBackend = "local-store"
)
var llamaCPPVariants = []string{
LLamaCPPAVX2,
LLamaCPPAVX512,
LLamaCPPAVX,
LLamaCPPFallback,
LLamaCPPCUDA,
LLamaCPPHipblas,
LLamaCPPSycl16,
LLamaCPPSycl32,
LLamaCPPGRPC,
}
func backendPath(assetDir, backend string) string {
return filepath.Join(assetDir, "backend-assets", "grpc", backend)
}
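As a reading aid, backendPath just joins the asset directory with the conventional backend-assets/grpc sub-path; a tiny runnable sketch (the asset directory below is made up):

package main

import (
    "fmt"
    "path/filepath"
)

// Mirrors the backendPath helper shown above.
func backendPath(assetDir, backend string) string {
    return filepath.Join(assetDir, "backend-assets", "grpc", backend)
}

func main() {
    // On a Unix-like system this prints /tmp/localai/backend-assets/grpc/piper.
    fmt.Println(backendPath("/tmp/localai", "piper"))
}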
@@ -105,32 +80,9 @@ ENTRY:
continue
}
// Skip the llama.cpp variants if we are autoDetecting
// But we always load the fallback variant if it exists
if strings.Contains(e.Name(), LLamaCPP) && !strings.Contains(e.Name(), LLamaCPPFallback) && AutoDetect {
continue
}
backends[e.Name()] = []string{}
}
// if we are autoDetecting, we want to show the llama.cpp variants as a single backend
if AutoDetect {
// if we find the llama.cpp variants, show them of as a single backend (llama-cpp) as later we are going to pick that up
// when starting the service
foundVariants := map[string]bool{}
if _, ok := backends[LLamaCPP]; !ok {
for _, e := range entry {
for _, v := range llamaCPPVariants {
if strings.Contains(e.Name(), v) && !foundVariants[v] {
backends[LLamaCPP] = append(backends[LLamaCPP], v)
foundVariants[v] = true
}
}
}
}
}
return backends, nil
}
@@ -140,12 +92,7 @@ func orderBackends(backends map[string][]string) ([]string, error) {
// for example, llama.cpp should be tried first, and we want to keep the huggingface backend at the last.
// sets a priority list - first has more priority
priorityList := []string{
// First llama.cpp(variants)
// We keep the fallback to prevent that if the llama.cpp variants
// that depends on shared libs if breaks have still a safety net.
LLamaCPP, LLamaCPPFallback,
}
priorityList := []string{}
toTheEnd := []string{
// last has to be huggingface
@@ -178,108 +125,9 @@ func orderBackends(backends map[string][]string) ([]string, error) {
return orderedBackends.Keys(), nil
}
// selectGRPCProcessByHostCapabilities selects the GRPC process to start based on system capabilities
// Note: this is now relevant only for llama.cpp
func selectGRPCProcessByHostCapabilities(backend, assetDir string, f16 bool) string {
// Select backend now just for llama.cpp
if backend != LLamaCPP {
return ""
}
// Note: This environment variable is read by the LocalAI's llama.cpp grpc-server
if os.Getenv("LLAMACPP_GRPC_SERVERS") != "" {
log.Info().Msgf("[%s] attempting to load with GRPC variant", LLamaCPPGRPC)
return backendPath(assetDir, LLamaCPPGRPC)
}
// Check for GPU-binaries that are shipped with single binary releases
gpuBinaries := map[string]string{
"nvidia": LLamaCPPCUDA,
"amd": LLamaCPPHipblas,
"intel": LLamaCPPSycl16,
}
if !f16 {
gpuBinaries["intel"] = LLamaCPPSycl32
}
for vendor, binary := range gpuBinaries {
if xsysinfo.HasGPU(vendor) {
p := backendPath(assetDir, binary)
if _, err := os.Stat(p); err == nil {
log.Info().Msgf("[%s] attempting to load with %s variant (vendor: %s)", backend, binary, vendor)
return p
}
}
}
// No GPU found or no specific binaries found, try to load the CPU variant(s)
// Select a binary based on availability/capability
selectedProcess := ""
// Check if we have a native build (llama-cpp) and use that
if _, err := os.Stat(backendPath(assetDir, LLamaCPPFallback)); err == nil {
log.Debug().Msgf("[%s] %s variant available", LLamaCPPFallback, backend)
selectedProcess = backendPath(assetDir, LLamaCPPFallback)
}
// Check if we have a native build (llama-cpp) and use that instead
// As a reminder, we do ultimately attempt again with the fallback variant
// If things fail with what we select here
if _, err := os.Stat(backendPath(assetDir, LLamaCPP)); err == nil {
log.Debug().Msgf("[%s] attempting to load with native variant", backend)
selectedProcess = backendPath(assetDir, LLamaCPP)
}
// IF we find any optimized binary, we use that
if xsysinfo.HasCPUCaps(cpuid.AVX512F) {
p := backendPath(assetDir, LLamaCPPAVX512)
if _, err := os.Stat(p); err == nil {
log.Info().Msgf("[%s] attempting to load with AVX512 variant", backend)
selectedProcess = p
}
} else if xsysinfo.HasCPUCaps(cpuid.AVX2) {
p := backendPath(assetDir, LLamaCPPAVX2)
if _, err := os.Stat(p); err == nil {
log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
selectedProcess = p
}
} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
p := backendPath(assetDir, LLamaCPPAVX)
if _, err := os.Stat(p); err == nil {
log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
selectedProcess = p
}
}
// Safety measure: check if the binary exists otherwise return empty string
if _, err := os.Stat(selectedProcess); err == nil {
return selectedProcess
}
return ""
}
func attemptLoadingOnFailure(backend string, ml *ModelLoader, o *Options, err error) (*Model, error) {
// XXX: This is too backend specific(llama-cpp), remove this bit or generalize further
// We failed somehow starting the binary. For instance, could be that we are missing
// some libraries if running in binary-only mode.
// In this case, we attempt to load the model with the fallback variant.
// If not llama-cpp backend, return the error immediately
if backend != LLamaCPP {
return nil, err
}
log.Error().Msgf("[%s] Failed loading model, trying with fallback '%s', error: %s", backend, LLamaCPPFallback, err.Error())
return ml.LoadModel(o.modelID, o.model, ml.grpcModel(LLamaCPPFallback, false, o))
}
// starts the grpcModelProcess for the backend, and returns a grpc client
// It also loads the model
func (ml *ModelLoader) grpcModel(backend string, autodetect bool, o *Options) func(string, string, string) (*Model, error) {
func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string, string) (*Model, error) {
return func(modelID, modelName, modelFile string) (*Model, error) {
log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelID, modelFile, backend, *o)
@@ -335,13 +183,6 @@ func (ml *ModelLoader) grpcModel(backend string, autodetect bool, o *Options) fu
return nil, fmt.Errorf("referring to a backend not in asset dir: %s", err.Error())
}
if autodetect {
// autoDetect GRPC process to start based on system capabilities
if selectedProcess := selectGRPCProcessByHostCapabilities(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
grpcProcess = selectedProcess
}
}
// Check if the file exists
if _, err := os.Stat(grpcProcess); os.IsNotExist(err) {
return nil, fmt.Errorf("backend not found: %s", grpcProcess)
@@ -455,12 +296,9 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e
backendToConsume = backend
}
model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backendToConsume, AutoDetect, o))
model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backendToConsume, o))
if err != nil {
model, err = attemptLoadingOnFailure(backend, ml, o, err)
if err != nil {
return nil, err
}
return nil, err
}
return model.GRPC(o.parallelRequests, ml.wd), nil
@@ -526,7 +364,7 @@ func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) {
}
// append externalBackends supplied by the user via the CLI
for _, b := range ml.GetAllExternalBackends(o) {
for b := range ml.GetAllExternalBackends(o) {
autoLoadBackends = append(autoLoadBackends, b)
}
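The range change above is subtle: with a single loop variable, Go iterates map keys, so the loop now collects backend names (the keys) rather than values; the exact return type of GetAllExternalBackends is not shown in this hunk. A small standalone sketch with hypothetical map contents:

package main

import "fmt"

func main() {
    // Hypothetical external-backend registry: backend name -> run file.
    external := map[string]string{
        "llama-cpp": "/backends/llama-cpp/run.sh",
        "piper":     "/backends/piper/run.sh",
    }

    autoLoadBackends := []string{}
    // Single-variable range yields the map keys, i.e. the backend names.
    for name := range external {
        autoLoadBackends = append(autoLoadBackends, name)
    }
    fmt.Println(autoLoadBackends) // order is not guaranteed for map iteration
}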

View File

@@ -0,0 +1,52 @@
#!/bin/bash
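# Packaging helper for the darwin llama-cpp backend: builds the fallback,
# grpc and rpc-server variants, copies the resulting binaries plus the
# .dylib libraries they link against into build/darwin, and wraps the
# result into an OCI image tarball via `local-ai util create-oci-image`.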
set -ex
IMAGE_NAME="${IMAGE_NAME:-localai/llama-cpp-darwin}"
pushd backend/cpp/llama-cpp
# make llama-cpp-avx && \
# make llama-cpp-avx2 && \
# make llama-cpp-avx512 && \
make llama-cpp-fallback && \
make llama-cpp-grpc && \
make llama-cpp-rpc-server
popd
mkdir -p build/darwin
# cp -rf backend/cpp/llama-cpp/llama-cpp-avx build/darwin/
# cp -rf backend/cpp/llama-cpp/llama-cpp-avx2 build/darwin/
# cp -rf backend/cpp/llama-cpp/llama-cpp-avx512 build/darwin/
cp -rf backend/cpp/llama-cpp/llama-cpp-fallback build/darwin/
cp -rf backend/cpp/llama-cpp/llama-cpp-grpc build/darwin/
cp -rf backend/cpp/llama-cpp/llama-cpp-rpc-server build/darwin/
for file in build/darwin/*; do
LIBS="$(otool -L $file | awk 'NR > 1 { system("echo " $1) } ' | xargs echo)"
for lib in $LIBS; do
mkdir -p build/darwin/lib
# only libraries ending in dylib
if [[ "$lib" == *.dylib ]]; then
if [ -e "$lib" ]; then
cp -rvf "$lib" build/darwin/lib
fi
fi
done
done
cp -rf backend/cpp/llama-cpp/run.sh build/darwin/
PLATFORMARCH="${PLATFORMARCH:-darwin/arm64}"
./local-ai util create-oci-image \
build/darwin/. \
--output build/darwin.tar \
--image-name $IMAGE_NAME \
--platform $PLATFORMARCH
rm -rf build/darwin

View File

@@ -6,6 +6,7 @@ import (
"os"
"runtime"
"testing"
"time"
"github.com/docker/go-connections/nat"
. "github.com/onsi/ginkgo/v2"
@@ -21,6 +22,7 @@ var client *openai.Client
var containerImage = os.Getenv("LOCALAI_IMAGE")
var containerImageTag = os.Getenv("LOCALAI_IMAGE_TAG")
var modelsDir = os.Getenv("LOCALAI_MODELS_DIR")
var backendDir = os.Getenv("LOCALAI_BACKEND_DIR")
var apiEndpoint = os.Getenv("LOCALAI_API_ENDPOINT")
var apiKey = os.Getenv("LOCALAI_API_KEY")
@@ -82,6 +84,12 @@ func startDockerImage() {
Expect(err).To(Not(HaveOccurred()))
md := cwd + "/models"
bd := cwd + "/backends"
if backendDir != "" {
bd = backendDir
}
if modelsDir != "" {
md = modelsDir
}
@@ -99,20 +107,28 @@ func startDockerImage() {
},
Env: map[string]string{
"MODELS_PATH": "/models",
"BACKENDS_PATH": "/backends",
"DEBUG": "true",
"THREADS": fmt.Sprint(proc),
"LOCALAI_SINGLE_ACTIVE_BACKEND": "true",
},
Files: []testcontainers.ContainerFile{
Mounts: testcontainers.ContainerMounts{
{
HostFilePath: md,
ContainerFilePath: "/models",
FileMode: 0o755,
Source: testcontainers.DockerBindMountSource{
HostPath: md,
},
Target: "/models",
},
{
Source: testcontainers.DockerBindMountSource{
HostPath: bd,
},
Target: "/backends",
},
},
WaitingFor: wait.ForAll(
wait.ForListeningPort(nat.Port(defaultApiPort)),
// wait.ForHTTP("/v1/models").WithPort(nat.Port(apiPort)).WithStartupTimeout(50*time.Minute),
wait.ForListeningPort(nat.Port(defaultApiPort)).WithStartupTimeout(10*time.Minute),
wait.ForHTTP("/v1/models").WithPort(nat.Port(defaultApiPort)).WithStartupTimeout(10*time.Minute),
),
}