mirror of
https://github.com/mudler/LocalAI.git
synced 2026-04-21 17:28:57 -05:00
feat(speculative-sampling): allow to specify a draft model in the model config (#1052)
**Description**

This PR fixes #1013. It adds `draft_model` and `n_draft` to the model YAML config in order to load models with speculative sampling. This should also be compatible with grammars.

Example:

```yaml
backend: llama
context_size: 1024
name: my-model-name
parameters:
  model: foo-bar
n_draft: 16
draft_model: model-name
```

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
committed by
GitHub
parent
247d85b523
commit
8ccf5b2044
@@ -42,6 +42,7 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
|
||||
Seed: int32(c.Seed),
|
||||
NBatch: int32(b),
|
||||
NoMulMatQ: c.NoMulMatQ,
|
||||
DraftModel: c.DraftModel,
|
||||
AudioPath: c.VallE.AudioPath,
|
||||
LoraAdapter: c.LoraAdapter,
|
||||
LoraBase: c.LoraBase,
|
||||
@@ -79,6 +80,7 @@ func gRPCPredictOpts(c config.Config, modelPath string) *pb.PredictOptions {
|
||||
return &pb.PredictOptions{
|
||||
Temperature: float32(c.Temperature),
|
||||
TopP: float32(c.TopP),
|
||||
NDraft: c.NDraft,
|
||||
TopK: int32(c.TopK),
|
||||
Tokens: int32(c.Maxtokens),
|
||||
Threads: int32(c.Threads),
|
||||
|
||||
@@ -101,6 +101,8 @@ type LLMConfig struct {
|
||||
LoraAdapter string `yaml:"lora_adapter"`
|
||||
LoraBase string `yaml:"lora_base"`
|
||||
NoMulMatQ bool `yaml:"no_mulmatq"`
|
||||
DraftModel string `yaml:"draft_model"`
|
||||
NDraft int32 `yaml:"n_draft"`
|
||||
}
|
||||
|
||||
type AutoGPTQ struct {
|
||||
|
||||
Reference in New Issue
Block a user