feat(llama.cpp): add distributed llama.cpp inferencing (#2324)

* feat(llama.cpp): support distributed llama.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat: let users tweak how chat messages are merged together

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactor

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Makefile: register to ALL_GRPC_BACKENDS

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactoring, allow disabling auto-detection of backends

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* minor fixups

Signed-off-by: mudler <mudler@localai.io>

* feat: add cmd to start rpc-server from llama.cpp

Signed-off-by: mudler <mudler@localai.io>

* ci: add ccache

Signed-off-by: mudler <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: mudler <mudler@localai.io>
Author: Ettore Di Giacinto
Date: 2024-05-15 01:17:02 +02:00
Committed by: GitHub
Parent: 29909666c3
Commit: c89271b2e4
11 changed files with 222 additions and 82 deletions
+5 -4
@@ -13,8 +13,9 @@ type Context struct {
 
 var CLI struct {
 	Context `embed:""`
-	Run        RunCMD        `cmd:"" help:"Run LocalAI, this is the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
-	Models     ModelsCMD     `cmd:"" help:"Manage LocalAI models and definitions"`
-	TTS        TTSCMD        `cmd:"" help:"Convert text to speech"`
-	Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
+	Run            RunCMD            `cmd:"" help:"Run LocalAI, this is the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
+	Models         ModelsCMD         `cmd:"" help:"Manage LocalAI models and definitions"`
+	TTS            TTSCMD            `cmd:"" help:"Convert text to speech"`
+	Transcript     TranscriptCMD     `cmd:"" help:"Convert audio to text"`
+	LLAMACPPWorker LLAMACPPWorkerCMD `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"`
 }
+37
@@ -0,0 +1,37 @@
+package cli
+
+import (
+	"os"
+	"syscall"
+
+	"github.com/go-skynet/LocalAI/pkg/assets"
+	"github.com/rs/zerolog/log"
+)
+
+type LLAMACPPWorkerCMD struct {
+	Args              []string `arg:"" optional:"" name:"models" help:"Worker arguments: host port"`
+	BackendAssetsPath string   `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
+}
+
+func (r *LLAMACPPWorkerCMD) Run(ctx *Context) error {
+	// Extract files from the embedded FS
+	err := assets.ExtractFiles(ctx.BackendAssets, r.BackendAssetsPath)
+	log.Debug().Msgf("Extracting backend assets files to %s", r.BackendAssetsPath)
+	if err != nil {
+		log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
+	}
+
+	return syscall.Exec(
+		assets.ResolvePath(
+			r.BackendAssetsPath,
+			"util",
+			"llama-cpp-rpc-server",
+		),
+		append([]string{
+			assets.ResolvePath(
+				r.BackendAssetsPath,
+				"util",
+				"llama-cpp-rpc-server",
+			)}, r.Args...),
+		os.Environ())
+}
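In practice the new subcommand turns a spare machine into a llama.cpp RPC worker: it extracts the bundled `llama-cpp-rpc-server` binary from the backend assets and execs it with the given host and port. A minimal usage sketch (not part of the diff), assuming kong derives the subcommand name `llamacpp-worker` from the struct field and that the main instance picks up workers from the `LLAMACPP_GRPC_SERVERS` environment variable as comma-separated `host:port` pairs:

```bash
# On each worker machine: extract the bundled llama-cpp-rpc-server
# binary and replace the current process with it on the given host/port.
local-ai llamacpp-worker 0.0.0.0 50052

# On the main node: tell the llama.cpp backend where the workers live
# (comma-separated host:port pairs), then run LocalAI as usual.
LLAMACPP_GRPC_SERVERS="192.168.1.10:50052,192.168.1.11:50052" local-ai run
```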
+27 -6
@@ -93,6 +93,8 @@ type Diffusers struct {
 	ControlNet string `yaml:"control_net"`
 }
 
+// LLMConfig is a struct that holds the configuration that is
+// generic for most of the LLM backends.
 type LLMConfig struct {
 	SystemPrompt string `yaml:"system_prompt"`
 	TensorSplit  string `yaml:"tensor_split"`
@@ -144,6 +146,7 @@ type LLMConfig struct {
 	YarnBetaSlow float32 `yaml:"yarn_beta_slow"`
 }
 
+// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
 type AutoGPTQ struct {
 	ModelBaseName    string `yaml:"model_base_name"`
 	Device           string `yaml:"device"`
@@ -151,13 +154,31 @@ type AutoGPTQ struct {
 	UseFastTokenizer bool `yaml:"use_fast_tokenizer"`
 }
 
+// TemplateConfig is a struct that holds the configuration of the templating system
 type TemplateConfig struct {
-	Chat                 string `yaml:"chat"`
-	ChatMessage          string `yaml:"chat_message"`
-	Completion           string `yaml:"completion"`
-	Edit                 string `yaml:"edit"`
-	Functions            string `yaml:"function"`
-	UseTokenizerTemplate bool   `yaml:"use_tokenizer_template"`
+	// Chat is the template used in the chat completion endpoint
+	Chat string `yaml:"chat"`
+
+	// ChatMessage is the template used for chat messages
+	ChatMessage string `yaml:"chat_message"`
+
+	// Completion is the template used for completion requests
+	Completion string `yaml:"completion"`
+
+	// Edit is the template used for edit completion requests
+	Edit string `yaml:"edit"`
+
+	// Functions is the template used when tools are present in the client requests
+	Functions string `yaml:"function"`
+
+	// UseTokenizerTemplate is a flag that indicates if the tokenizer template should be used.
+	// Note: this is mostly consumed for backends such as vllm and transformers
+	// that can use the tokenizers specified in the JSON config files of the models
+	UseTokenizerTemplate bool `yaml:"use_tokenizer_template"`
+
+	// JoinChatMessagesByCharacter is a string that will be used to join chat messages together.
+	// It defaults to \n
+	JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"`
 }
 
 func (c *BackendConfig) SetFunctionCallString(s string) {
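The new `JoinChatMessagesByCharacter` field is exposed in the model YAML under the `template` section. A minimal config sketch (not part of the diff; the model name and template name are illustrative) that joins chat messages with a blank line instead of the default `\n`:

```yaml
name: my-model                     # hypothetical model name
parameters:
  model: my-model.gguf             # hypothetical weights file
template:
  chat: my-chat-template           # illustrative template name
  # Join the templated chat messages with "\n\n" instead of the default "\n".
  join_chat_messages_by_character: "\n\n"
```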
+6 -1
@@ -349,7 +349,12 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			mess = append(mess, content)
 		}
 
-		predInput = strings.Join(mess, "\n")
+		joinCharacter := "\n"
+		if config.TemplateConfig.JoinChatMessagesByCharacter != nil {
+			joinCharacter = *config.TemplateConfig.JoinChatMessagesByCharacter
+		}
+
+		predInput = strings.Join(mess, joinCharacter)
 		log.Debug().Msgf("Prompt (before templating): %s", predInput)
 
 		templateFile := ""