feat(stablediffusion): Passthrough more parameters to support z-image and flux2 (#7419)

* feat(stablediffusion): Passthrough more parameters to support z-image and flux2

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* chore(z-image): Add Z-Image-Turbo GGML to library

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(stablediffusion-ggml): flush stderr and check errors when writing PNG

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(stablediffusion-ggml): Re-allocate Go strings in C++

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(stablediffusion-ggml): Try to avoid segfaults

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(stablediffusion-ggml): Init sample and easycache params

Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
This commit is contained in:
Richard Palethorpe
2025-12-04 16:08:21 +00:00
committed by GitHub
parent 100ebdfa2c
commit c2e4a1f29b
5 changed files with 318 additions and 38 deletions

View File

@@ -1,4 +1,5 @@
#include "stable-diffusion.h"
#include <cmath>
#include <cstdint>
#define GGML_MAX_NAME 128
@@ -21,6 +22,7 @@
#define STB_IMAGE_RESIZE_IMPLEMENTATION
#define STB_IMAGE_RESIZE_STATIC
#include "stb_image_resize.h"
#include <stdlib.h>
// Names of the sampler method, same order as enum sample_method in stable-diffusion.h
const char* sample_method_str[] = {
@@ -55,6 +57,73 @@ const char* schedulers[] = {
static_assert(std::size(schedulers) == SCHEDULER_COUNT, "schedulers mismatch");
// New enum string arrays
// Option strings for rng_type_t. load_model() compares the "rng_type" and
// "sampler_rng_type" option values against these entries and casts the index
// of the match to rng_type_t, so the order here has to follow the enum.
const char* rng_type_str[] = {"std_default", "cuda", "cpu"};
static_assert(RNG_TYPE_COUNT == std::size(rng_type_str), "rng type mismatch");
// Option strings for prediction_t. load_model() compares the "prediction"
// option value against these entries and casts the index of the match to
// prediction_t, so the order here has to follow the enum.
const char* prediction_str[] = {
    "default", "epsilon", "v", "edm_v", "sd3_flow", "flux_flow", "flux2_flow",
};
static_assert(PREDICTION_COUNT == std::size(prediction_str), "prediction mismatch");
// Option strings for lora_apply_mode_t. load_model() compares the
// "lora_apply_mode" option value against these entries and casts the index of
// the match to lora_apply_mode_t, so the order here has to follow the enum.
const char* lora_apply_mode_str[] = {"auto", "immediately", "at_runtime"};
static_assert(LORA_APPLY_MODE_COUNT == std::size(lora_apply_mode_str), "lora apply mode mismatch");
// Option strings for sd_type_t, laid out so that the array index is the enum
// value (the trailing number on each row). load_model() matches the "wtype"
// option against this table and casts the matching index to sd_type_t.
// nullptr rows are indices that have no selectable name; the lookup skips
// them. The static_assert below requires the table to have exactly
// SD_TYPE_COUNT entries, so gaps must be kept as nullptr rather than removed.
constexpr const char* sd_type_str[] = {
"f32", // 0
"f16", // 1
"q4_0", // 2
"q4_1", // 3
nullptr, // 4
nullptr, // 5
"q5_0", // 6
"q5_1", // 7
"q8_0", // 8
"q8_1", // 9
"q2_k", // 10
"q3_k", // 11
"q4_k", // 12
"q5_k", // 13
"q6_k", // 14
"q8_k", // 15
"iq2_xxs", // 16
"iq2_xs", // 17
"iq3_xxs", // 18
"iq1_s", // 19
"iq4_nl", // 20
"iq3_s", // 21
"iq2_s", // 22
"iq4_xs", // 23
"i8", // 24
"i16", // 25
"i32", // 26
"i64", // 27
"f64", // 28
"iq1_m", // 29
"bf16", // 30
nullptr, nullptr, nullptr, nullptr, // 31-34
"tq1_0", // 35
"tq2_0", // 36
nullptr, nullptr, // 37-38
"mxfp4" // 39
};
static_assert(std::size(sd_type_str) == SD_TYPE_COUNT, "sd type mismatch");
// File-scope context parameters. gen_image() passes this object to
// sd_ctx_params_to_str() when it logs the settings used for a generation.
sd_ctx_params_t ctx_params;
// File-scope stable-diffusion context. load_model() stores the context it
// created here, and gen_image() hands it to generate_image().
sd_ctx_t* sd_c;
// Moved from the context (load time) to generation time params
scheduler_t scheduler = SCHEDULER_COUNT;
@@ -99,7 +168,7 @@ int load_model(const char *model, char *model_path, char* options[], int threads
const char *stableDiffusionModel = "";
if (diff == 1 ) {
stableDiffusionModel = model;
stableDiffusionModel = strdup(model);
model = "";
}
@@ -110,8 +179,38 @@ int load_model(const char *model, char *model_path, char* options[], int threads
const char *vae_path = "";
const char *scheduler_str = "";
const char *sampler = "";
const char *clip_vision_path = "";
const char *llm_path = "";
const char *llm_vision_path = "";
const char *diffusion_model_path = stableDiffusionModel;
const char *high_noise_diffusion_model_path = "";
const char *taesd_path = "";
const char *control_net_path = "";
const char *embedding_dir = "";
const char *photo_maker_path = "";
const char *tensor_type_rules = "";
char *lora_dir = model_path;
bool lora_dir_allocated = false;
bool vae_decode_only = true;
int n_threads = threads;
enum sd_type_t wtype = SD_TYPE_COUNT;
enum rng_type_t rng_type = CUDA_RNG;
enum rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
enum prediction_t prediction = DEFAULT_PRED;
enum lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO;
bool offload_params_to_cpu = false;
bool keep_clip_on_cpu = false;
bool keep_control_net_on_cpu = false;
bool keep_vae_on_cpu = false;
bool diffusion_flash_attn = false;
bool tae_preview_only = false;
bool diffusion_conv_direct = false;
bool vae_conv_direct = false;
bool force_sdxl_vae_conv_scale = false;
bool chroma_use_dit_mask = true;
bool chroma_use_t5_mask = false;
int chroma_t5_mask_pad = 1;
float flow_shift = INFINITY;
fprintf(stderr, "parsing options: %p\n", options);
@@ -124,16 +223,16 @@ int load_model(const char *model, char *model_path, char* options[], int threads
}
if (!strcmp(optname, "clip_l_path")) {
clip_l_path = optval;
clip_l_path = strdup(optval);
}
if (!strcmp(optname, "clip_g_path")) {
clip_g_path = optval;
clip_g_path = strdup(optval);
}
if (!strcmp(optname, "t5xxl_path")) {
t5xxl_path = optval;
t5xxl_path = strdup(optval);
}
if (!strcmp(optname, "vae_path")) {
vae_path = optval;
vae_path = strdup(optval);
}
if (!strcmp(optname, "scheduler")) {
scheduler_str = optval;
@@ -148,43 +247,167 @@ int load_model(const char *model, char *model_path, char* options[], int threads
std::filesystem::path lora_path(optval);
std::filesystem::path full_lora_path = model_path_str / lora_path;
lora_dir = strdup(full_lora_path.string().c_str());
lora_dir_allocated = true;
fprintf(stderr, "Lora dir resolved to: %s\n", lora_dir);
} else {
lora_dir = strdup(optval);
lora_dir_allocated = true;
fprintf(stderr, "No model path provided, using lora dir as-is: %s\n", lora_dir);
}
}
// New parsing
if (!strcmp(optname, "clip_vision_path")) clip_vision_path = strdup(optval);
if (!strcmp(optname, "llm_path")) llm_path = strdup(optval);
if (!strcmp(optname, "llm_vision_path")) llm_vision_path = strdup(optval);
if (!strcmp(optname, "diffusion_model_path")) diffusion_model_path = strdup(optval);
if (!strcmp(optname, "high_noise_diffusion_model_path")) high_noise_diffusion_model_path = strdup(optval);
if (!strcmp(optname, "taesd_path")) taesd_path = strdup(optval);
if (!strcmp(optname, "control_net_path")) control_net_path = strdup(optval);
if (!strcmp(optname, "embedding_dir")) embedding_dir = strdup(optval);
if (!strcmp(optname, "photo_maker_path")) photo_maker_path = strdup(optval);
if (!strcmp(optname, "tensor_type_rules")) tensor_type_rules = strdup(optval);
if (!strcmp(optname, "vae_decode_only")) vae_decode_only = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
if (!strcmp(optname, "offload_params_to_cpu")) offload_params_to_cpu = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
if (!strcmp(optname, "keep_clip_on_cpu")) keep_clip_on_cpu = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
if (!strcmp(optname, "keep_control_net_on_cpu")) keep_control_net_on_cpu = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
if (!strcmp(optname, "keep_vae_on_cpu")) keep_vae_on_cpu = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
if (!strcmp(optname, "diffusion_flash_attn")) diffusion_flash_attn = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
if (!strcmp(optname, "tae_preview_only")) tae_preview_only = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
if (!strcmp(optname, "diffusion_conv_direct")) diffusion_conv_direct = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
if (!strcmp(optname, "vae_conv_direct")) vae_conv_direct = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
if (!strcmp(optname, "force_sdxl_vae_conv_scale")) force_sdxl_vae_conv_scale = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
if (!strcmp(optname, "chroma_use_dit_mask")) chroma_use_dit_mask = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
if (!strcmp(optname, "chroma_use_t5_mask")) chroma_use_t5_mask = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
if (!strcmp(optname, "n_threads")) n_threads = atoi(optval);
if (!strcmp(optname, "chroma_t5_mask_pad")) chroma_t5_mask_pad = atoi(optval);
if (!strcmp(optname, "flow_shift")) flow_shift = atof(optval);
if (!strcmp(optname, "rng_type")) {
int found = -1;
for (int m = 0; m < RNG_TYPE_COUNT; m++) {
if (!strcmp(optval, rng_type_str[m])) {
found = m;
break;
}
}
if (found != -1) {
rng_type = (rng_type_t)found;
fprintf(stderr, "Found rng_type: %s\n", optval);
} else {
fprintf(stderr, "Invalid rng_type: %s, using default\n", optval);
}
}
if (!strcmp(optname, "sampler_rng_type")) {
int found = -1;
for (int m = 0; m < RNG_TYPE_COUNT; m++) {
if (!strcmp(optval, rng_type_str[m])) {
found = m;
break;
}
}
if (found != -1) {
sampler_rng_type = (rng_type_t)found;
fprintf(stderr, "Found sampler_rng_type: %s\n", optval);
} else {
fprintf(stderr, "Invalid sampler_rng_type: %s, using default\n", optval);
}
}
if (!strcmp(optname, "prediction")) {
int found = -1;
for (int m = 0; m < PREDICTION_COUNT; m++) {
if (!strcmp(optval, prediction_str[m])) {
found = m;
break;
}
}
if (found != -1) {
prediction = (prediction_t)found;
fprintf(stderr, "Found prediction: %s\n", optval);
} else {
fprintf(stderr, "Invalid prediction: %s, using default\n", optval);
}
}
if (!strcmp(optname, "lora_apply_mode")) {
int found = -1;
for (int m = 0; m < LORA_APPLY_MODE_COUNT; m++) {
if (!strcmp(optval, lora_apply_mode_str[m])) {
found = m;
break;
}
}
if (found != -1) {
lora_apply_mode = (lora_apply_mode_t)found;
fprintf(stderr, "Found lora_apply_mode: %s\n", optval);
} else {
fprintf(stderr, "Invalid lora_apply_mode: %s, using default\n", optval);
}
}
if (!strcmp(optname, "wtype")) {
int found = -1;
for (int m = 0; m < SD_TYPE_COUNT; m++) {
if (sd_type_str[m] && !strcmp(optval, sd_type_str[m])) {
found = m;
break;
}
}
if (found != -1) {
wtype = (sd_type_t)found;
fprintf(stderr, "Found wtype: %s\n", optval);
} else {
fprintf(stderr, "Invalid wtype: %s, using default\n", optval);
}
}
}
fprintf(stderr, "parsed options\n");
fprintf (stderr, "Creating context\n");
sd_ctx_params_t ctx_params;
sd_ctx_params_init(&ctx_params);
ctx_params.model_path = model;
ctx_params.clip_l_path = clip_l_path;
ctx_params.clip_g_path = clip_g_path;
ctx_params.clip_vision_path = clip_vision_path;
ctx_params.t5xxl_path = t5xxl_path;
ctx_params.diffusion_model_path = stableDiffusionModel;
ctx_params.llm_path = llm_path;
ctx_params.llm_vision_path = llm_vision_path;
ctx_params.diffusion_model_path = diffusion_model_path;
ctx_params.high_noise_diffusion_model_path = high_noise_diffusion_model_path;
ctx_params.vae_path = vae_path;
ctx_params.taesd_path = "";
ctx_params.control_net_path = "";
ctx_params.taesd_path = taesd_path;
ctx_params.control_net_path = control_net_path;
ctx_params.lora_model_dir = lora_dir;
ctx_params.embedding_dir = "";
ctx_params.vae_decode_only = false;
ctx_params.embedding_dir = embedding_dir;
ctx_params.photo_maker_path = photo_maker_path;
ctx_params.tensor_type_rules = tensor_type_rules;
ctx_params.vae_decode_only = vae_decode_only;
// XXX: Setting to true causes a segfault on the second run
ctx_params.free_params_immediately = false;
ctx_params.n_threads = threads;
ctx_params.rng_type = STD_DEFAULT_RNG;
ctx_params.n_threads = n_threads;
ctx_params.rng_type = rng_type;
ctx_params.keep_clip_on_cpu = keep_clip_on_cpu;
if (wtype != SD_TYPE_COUNT) ctx_params.wtype = wtype;
if (sampler_rng_type != RNG_TYPE_COUNT) ctx_params.sampler_rng_type = sampler_rng_type;
if (prediction != PREDICTION_COUNT) ctx_params.prediction = prediction;
if (lora_apply_mode != LORA_APPLY_MODE_COUNT) ctx_params.lora_apply_mode = lora_apply_mode;
ctx_params.offload_params_to_cpu = offload_params_to_cpu;
ctx_params.keep_control_net_on_cpu = keep_control_net_on_cpu;
ctx_params.keep_vae_on_cpu = keep_vae_on_cpu;
ctx_params.diffusion_flash_attn = diffusion_flash_attn;
ctx_params.tae_preview_only = tae_preview_only;
ctx_params.diffusion_conv_direct = diffusion_conv_direct;
ctx_params.vae_conv_direct = vae_conv_direct;
ctx_params.force_sdxl_vae_conv_scale = force_sdxl_vae_conv_scale;
ctx_params.chroma_use_dit_mask = chroma_use_dit_mask;
ctx_params.chroma_use_t5_mask = chroma_use_t5_mask;
ctx_params.chroma_t5_mask_pad = chroma_t5_mask_pad;
ctx_params.flow_shift = flow_shift;
sd_ctx_t* sd_ctx = new_sd_ctx(&ctx_params);
if (sd_ctx == NULL) {
fprintf (stderr, "failed loading model (generic error)\n");
// Clean up allocated memory
if (lora_dir_allocated && lora_dir) {
free(lora_dir);
}
// TODO: Clean up allocated memory
return 1;
}
fprintf (stderr, "Created context: OK\n");
@@ -215,11 +438,6 @@ int load_model(const char *model, char *model_path, char* options[], int threads
sd_c = sd_ctx;
// Clean up allocated memory
if (lora_dir_allocated && lora_dir) {
free(lora_dir);
}
return 0;
}
@@ -248,6 +466,9 @@ sd_tiling_params_t* sd_img_gen_params_get_vae_tiling_params(sd_img_gen_params_t
// Allocates an sd_img_gen_params_t with std::malloc and fills it with the
// library defaults, then sets control_strength to 0.9.
//
// Returns the new struct, or nullptr when the allocation fails (previously the
// null pointer was passed straight to sd_img_gen_params_init).
//
// The struct is released with std::free by gen_image() after generation, so a
// pointer returned here must not be reused once it has been passed to
// gen_image().
sd_img_gen_params_t* sd_img_gen_params_new(void) {
    auto *params = static_cast<sd_img_gen_params_t *>(std::malloc(sizeof(sd_img_gen_params_t)));
    if (params == nullptr) {
        fprintf(stderr, "failed to allocate sd_img_gen_params_t\n");
        return nullptr;
    }

    sd_img_gen_params_init(params);
    // The nested sample and easycache parameter structs are initialised
    // explicitly in addition to the top-level init call.
    sd_sample_params_init(&params->sample_params);
    sd_easycache_params_init(&params->easycache);
    params->control_strength = 0.9f;
    return params;
}
@@ -265,7 +486,7 @@ void sd_img_gen_params_set_seed(sd_img_gen_params_t *params, int64_t seed) {
params->seed = seed;
}
int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char **ref_images, int ref_images_count) {
int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char* ref_images[], int ref_images_count) {
sd_image_t* results;
@@ -445,6 +666,10 @@ int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, cha
}
}
fprintf(stderr, "Generating image with params: \nctx\n---\n%s\ngen\n---\n%s\n",
sd_ctx_params_to_str(&ctx_params),
sd_img_gen_params_to_str(p));
results = generate_image(sd_c, p);
std::free(p);
@@ -477,9 +702,12 @@ int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, cha
fprintf (stderr, "Channel: %d\n", results[0].channel);
fprintf (stderr, "Data: %p\n", results[0].data);
stbi_write_png(dst, results[0].width, results[0].height, results[0].channel,
results[0].data, 0, NULL);
fprintf (stderr, "Saved resulting image to '%s'\n", dst);
int ret = stbi_write_png(dst, results[0].width, results[0].height, results[0].channel,
results[0].data, 0, NULL);
if (ret)
fprintf (stderr, "Saved resulting image to '%s'\n", dst);
else
fprintf(stderr, "Failed to write image to '%s'\n", dst);
// Clean up
free(results[0].data);
@@ -490,9 +718,10 @@ int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, cha
for (auto buffer : ref_image_buffers) {
if (buffer) free(buffer);
}
fprintf (stderr, "gen_image is done: %s", dst);
fprintf (stderr, "gen_image is done: %s\n", dst);
fflush(stderr);
return 0;
return !ret;
}
int unload() {

View File

@@ -22,7 +22,7 @@ type SDGGML struct {
var (
LoadModel func(model, model_apth string, options []uintptr, threads int32, diff int) int
GenImage func(params uintptr, steps int, dst string, cfgScale float32, srcImage string, strength float32, maskImage string, refImages []string, refImagesCount int) int
GenImage func(params uintptr, steps int, dst string, cfgScale float32, srcImage string, strength float32, maskImage string, refImages []uintptr, refImagesCount int) int
TilingParamsSetEnabled func(params uintptr, enabled bool)
TilingParamsSetTileSizes func(params uintptr, tileSizeX int, tileSizeY int)
@@ -95,12 +95,12 @@ func (sd *SDGGML) Load(opts *pb.ModelOptions) error {
sd.cfgScale = opts.CFGScale
ret := LoadModel(modelFile, modelPathC, options, opts.Threads, diffusionModel)
runtime.KeepAlive(keepAlive)
fmt.Fprintf(os.Stderr, "LoadModel: %d\n", ret)
if ret != 0 {
return fmt.Errorf("could not load model")
}
runtime.KeepAlive(keepAlive)
return nil
}
@@ -123,10 +123,15 @@ func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error {
}
}
// At the time of writing Purego doesn't recurse into slices and convert Go strings to pointers so we need to do that
var keepAlive []any
refImagesCount := len(opts.RefImages)
refImages := make([]string, refImagesCount, refImagesCount+1)
copy(refImages, opts.RefImages)
*(*uintptr)(unsafe.Add(unsafe.Pointer(&refImages), refImagesCount)) = 0
refImages := make([]uintptr, refImagesCount, refImagesCount+1)
for i, ri := range opts.RefImages {
bytep := CString(ri)
refImages[i] = uintptr(unsafe.Pointer(bytep))
keepAlive = append(keepAlive, bytep)
}
// Default strength for img2img (0.75 is a good default)
strength := float32(0.75)
@@ -140,6 +145,8 @@ func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error {
TilingParamsSetEnabled(vaep, false)
ret := GenImage(p, int(opts.Step), dst, sd.cfgScale, srcImage, strength, maskImage, refImages, refImagesCount)
runtime.KeepAlive(keepAlive)
fmt.Fprintf(os.Stderr, "GenImage: %d\n", ret)
if ret != 0 {
return fmt.Errorf("inference failed")
}

View File

@@ -17,7 +17,7 @@ void sd_img_gen_params_set_dimensions(sd_img_gen_params_t *params, int width, in
void sd_img_gen_params_set_seed(sd_img_gen_params_t *params, int64_t seed);
int load_model(const char *model, char *model_path, char* options[], int threads, int diffusionModel);
int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char **ref_images, int ref_images_count);
int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char* ref_images[], int ref_images_count);
#ifdef __cplusplus
}
#endif

View File

@@ -20911,6 +20911,9 @@
overrides:
parameters:
model: flux1-dev-Q2_K.gguf
options:
- scheduler:simple
- keep_clip_on_cpu:true
files:
- filename: "flux1-dev-Q2_K.gguf"
sha256: "b8c464bc0f10076ef8f00ba040d220d90c7993f7c4245ae80227d857f65df105"
@@ -21078,6 +21081,32 @@
- filename: t5xxl_fp16.safetensors
sha256: 6e480b09fae049a72d2a8c5fbccb8d3e92febeb233bbe9dfe7256958a9167635
uri: https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/t5xxl_fp16.safetensors
- &zimage
name: Z-Image-Turbo
icon: https://z-image.ai/logo.png
license: apache-2.0
description: |
Z-Image is a powerful and highly efficient image generation model with 6B parameters. Currently there are three variants of which this is the Turbo edition.
🚀 Z-Image-Turbo – A distilled version of Z-Image that matches or exceeds leading competitors with only 8 NFEs (Number of Function Evaluations). It offers ⚡sub-second inference latency⚡ on enterprise-grade H800 GPUs and fits comfortably within 16G VRAM consumer devices. It excels in photorealistic image generation, bilingual text rendering (English & Chinese), and robust instruction adherence.
urls:
- https://github.com/Tongyi-MAI/Z-Image
tags:
- text-to-image
- z-image
- gpu
url: "github:mudler/LocalAI/gallery/z-image-ggml.yaml@master"
files:
- filename: Qwen3-4B.Q4_K_M.gguf
sha256: a37931937683a723ae737a0c6fc67dab7782fd8a1b9dea2ca445b7a1dbd5ca3a
uri: huggingface://MaziyarPanahi/Qwen3-4B-GGUF/Qwen3-4B.Q4_K_M.gguf
- filename: z_image_turbo-Q4_0.gguf
sha256: 14b375ab4f226bc5378f68f37e899ef3c2242b8541e61e2bc1aff40976086fbd
uri: https://huggingface.co/leejet/Z-Image-Turbo-GGUF/resolve/main/z_image_turbo-Q4_0.gguf
- filename: ae.safetensors
sha256: afc8e28272cd15db3919bacdb6918ce9c1ed22e96cb12c4d5ed0fba823529e38
uri: https://huggingface.co/ChuckMcSneed/FLUX.1-dev/resolve/main/ae.safetensors
- &whisper
url: "github:mudler/LocalAI/gallery/whisper-base.yaml@master" ## Whisper
name: "whisper-1"

15
gallery/z-image-ggml.yaml Normal file
View File

@@ -0,0 +1,15 @@
---
name: "Z-Image-GGML"
config_file: |
backend: stablediffusion-ggml
cfg_scale: 1
name: z-image-test
options:
- diffusion_model
- llm_path:Qwen3-4B.Q4_K_M.gguf
- vae_path:ae.safetensors
- offload_params_to_cpu:true
parameters:
model: z_image_turbo-Q4_0.gguf # must match the filename downloaded by the gallery entry
step: 25