feat(llama.cpp): add distributed llama.cpp inferencing (#2324)

* feat(llama.cpp): support distributed llama.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat: let users tweak how chat messages are merged together

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactor

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Makefile: register to ALL_GRPC_BACKENDS

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactoring, allow disabling auto-detection of backends

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* minor fixups

Signed-off-by: mudler <mudler@localai.io>

* feat: add cmd to start rpc-server from llama.cpp

Signed-off-by: mudler <mudler@localai.io>

* ci: add ccache

Signed-off-by: mudler <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: mudler <mudler@localai.io>
Author: Ettore Di Giacinto
Date: 2024-05-15 01:17:02 +02:00
Committed by: GitHub
Parent: 29909666c3
Commit: c89271b2e4
11 changed files with 222 additions and 82 deletions
+5 -4
@@ -13,8 +13,9 @@ type Context struct {
 
 var CLI struct {
 	Context `embed:""`
-	Run        RunCMD        `cmd:"" help:"Run LocalAI, this is the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
-	Models     ModelsCMD     `cmd:"" help:"Manage LocalAI models and definitions"`
-	TTS        TTSCMD        `cmd:"" help:"Convert text to speech"`
-	Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
+	Run            RunCMD            `cmd:"" help:"Run LocalAI, this is the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
+	Models         ModelsCMD         `cmd:"" help:"Manage LocalAI models and definitions"`
+	TTS            TTSCMD            `cmd:"" help:"Convert text to speech"`
+	Transcript     TranscriptCMD     `cmd:"" help:"Convert audio to text"`
+	LLAMACPPWorker LLAMACPPWorkerCMD `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"`
 }
+37
@@ -0,0 +1,37 @@
+package cli
+
+import (
+	"os"
+	"syscall"
+
+	"github.com/go-skynet/LocalAI/pkg/assets"
+	"github.com/rs/zerolog/log"
+)
+
+type LLAMACPPWorkerCMD struct {
+	Args              []string `arg:"" optional:"" name:"models" help:"Worker arguments: host port"`
+	BackendAssetsPath string   `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
+}
+
+func (r *LLAMACPPWorkerCMD) Run(ctx *Context) error {
+	// Extract files from the embedded FS
+	err := assets.ExtractFiles(ctx.BackendAssets, r.BackendAssetsPath)
+	log.Debug().Msgf("Extracting backend assets files to %s", r.BackendAssetsPath)
+	if err != nil {
+		log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
+	}
+
+	return syscall.Exec(
+		assets.ResolvePath(
+			r.BackendAssetsPath,
+			"util",
+			"llama-cpp-rpc-server",
+		),
+		append([]string{
+			assets.ResolvePath(
+				r.BackendAssetsPath,
+				"util",
+				"llama-cpp-rpc-server",
+			)}, r.Args...),
+		os.Environ())
+}
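In practice the new subcommand turns a spare machine into a llama.cpp RPC worker: it extracts the bundled `llama-cpp-rpc-server` binary from the backend assets and execs it with the given host and port. A minimal usage sketch (not part of the diff), assuming kong derives the subcommand name `llamacpp-worker` from the struct field and that the main instance picks up workers from the `LLAMACPP_GRPC_SERVERS` environment variable as comma-separated `host:port` pairs:

```bash
# On each worker machine: extract the bundled llama-cpp-rpc-server
# binary and replace the current process with it on the given host/port.
local-ai llamacpp-worker 0.0.0.0 50052

# On the main node: tell the llama.cpp backend where the workers live
# (comma-separated host:port pairs), then run LocalAI as usual.
LLAMACPP_GRPC_SERVERS="192.168.1.10:50052,192.168.1.11:50052" local-ai run
```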
+27 -6
@@ -93,6 +93,8 @@ type Diffusers struct {
 	ControlNet string `yaml:"control_net"`
 }
 
+// LLMConfig is a struct that holds the configuration that is
+// generic for most of the LLM backends.
 type LLMConfig struct {
 	SystemPrompt string `yaml:"system_prompt"`
 	TensorSplit  string `yaml:"tensor_split"`
@@ -144,6 +146,7 @@ type LLMConfig struct {
 	YarnBetaSlow float32 `yaml:"yarn_beta_slow"`
 }
 
+// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
 type AutoGPTQ struct {
 	ModelBaseName    string `yaml:"model_base_name"`
 	Device           string `yaml:"device"`
@@ -151,13 +154,31 @@ type AutoGPTQ struct {
 	UseFastTokenizer bool `yaml:"use_fast_tokenizer"`
 }
 
+// TemplateConfig is a struct that holds the configuration of the templating system
 type TemplateConfig struct {
-	Chat                 string `yaml:"chat"`
-	ChatMessage          string `yaml:"chat_message"`
-	Completion           string `yaml:"completion"`
-	Edit                 string `yaml:"edit"`
-	Functions            string `yaml:"function"`
-	UseTokenizerTemplate bool   `yaml:"use_tokenizer_template"`
+	// Chat is the template used in the chat completion endpoint
+	Chat string `yaml:"chat"`
+
+	// ChatMessage is the template used for chat messages
+	ChatMessage string `yaml:"chat_message"`
+
+	// Completion is the template used for completion requests
+	Completion string `yaml:"completion"`
+
+	// Edit is the template used for edit completion requests
+	Edit string `yaml:"edit"`
+
+	// Functions is the template used when tools are present in the client requests
+	Functions string `yaml:"function"`
+
+	// UseTokenizerTemplate is a flag that indicates if the tokenizer template should be used.
+	// Note: this is mostly consumed for backends such as vllm and transformers
+	// that can use the tokenizers specified in the JSON config files of the models
+	UseTokenizerTemplate bool `yaml:"use_tokenizer_template"`
+
+	// JoinChatMessagesByCharacter is a string that will be used to join chat messages together.
+	// It defaults to \n
+	JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"`
 }
 
 func (c *BackendConfig) SetFunctionCallString(s string) {
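The new `JoinChatMessagesByCharacter` field is exposed in the model YAML under the `template` section. A minimal config sketch (not part of the diff; the model name and template name are illustrative) that joins chat messages with a blank line instead of the default `\n`:

```yaml
name: my-model                     # hypothetical model name
parameters:
  model: my-model.gguf             # hypothetical weights file
template:
  chat: my-chat-template           # illustrative template name
  # Join the templated chat messages with "\n\n" instead of the default "\n".
  join_chat_messages_by_character: "\n\n"
```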
+6 -1
@@ -349,7 +349,12 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			mess = append(mess, content)
 		}
 
-		predInput = strings.Join(mess, "\n")
+		joinCharacter := "\n"
+		if config.TemplateConfig.JoinChatMessagesByCharacter != nil {
+			joinCharacter = *config.TemplateConfig.JoinChatMessagesByCharacter
+		}
+
+		predInput = strings.Join(mess, joinCharacter)
 		log.Debug().Msgf("Prompt (before templating): %s", predInput)
 
 		templateFile := ""