diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 023a14bcc..98ac50812 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -95,6 +95,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): if request.Quantization != "": engine_args.quantization = request.Quantization + if request.LoadFormat != "": + engine_args.load_format = request.LoadFormat if request.GPUMemoryUtilization != 0: engine_args.gpu_memory_utilization = request.GPUMemoryUtilization if request.TrustRemoteCode: diff --git a/core/backend/options.go b/core/backend/options.go index 90d563e03..82c582c84 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -139,6 +139,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions { DraftModel: c.DraftModel, AudioPath: c.VallE.AudioPath, Quantization: c.Quantization, + LoadFormat: c.LoadFormat, GPUMemoryUtilization: c.GPUMemoryUtilization, TrustRemoteCode: c.TrustRemoteCode, EnforceEager: c.EnforceEager, diff --git a/core/config/backend_config.go b/core/config/backend_config.go index b386d0965..c3d1063db 100644 --- a/core/config/backend_config.go +++ b/core/config/backend_config.go @@ -143,6 +143,7 @@ type LLMConfig struct { DraftModel string `yaml:"draft_model"` NDraft int32 `yaml:"n_draft"` Quantization string `yaml:"quantization"` + LoadFormat string `yaml:"load_format"` GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM TrustRemoteCode bool `yaml:"trust_remote_code"` // vLLM EnforceEager bool `yaml:"enforce_eager"` // vLLM