feat(stablediffusion): Passthrough more parameters to support z-image and flux2 (#7419)

* feat(stablediffusion): Passthrough more parameters to support z-image and flux2

  Signed-off-by: Richard Palethorpe <io@richiejp.com>

* chore(z-image): Add Z-Image-Turbo GGML to library

  Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(stablediffusion-ggml): flush stderr and check errors when writing PNG

  Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(stablediffusion-ggml): Re-allocate Go strings in C++

  Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(stablediffusion-ggml): Try to avoid segfaults

  Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(stablediffusion-ggml): Init sample and easycache params

  Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2026-01-04 09:40:32 -06:00 · 2025-12-04 16:08:21 +00:00
parent 100ebdfa2c
commit c2e4a1f29b
5 changed files with 318 additions and 38 deletions
--- a/backend/go/stablediffusion-ggml/gosd.cpp
+++ b/backend/go/stablediffusion-ggml/gosd.cpp
@@ -1,4 +1,5 @@
 #include "stable-diffusion.h"
+#include <cmath>
 #include <cstdint>
 #define GGML_MAX_NAME 128

@@ -21,6 +22,7 @@
 #define STB_IMAGE_RESIZE_IMPLEMENTATION
 #define STB_IMAGE_RESIZE_STATIC
 #include "stb_image_resize.h"
+#include <stdlib.h>

 // Names of the sampler method, same order as enum sample_method in stable-diffusion.h
 const char* sample_method_str[] = {
@@ -55,6 +57,73 @@ const char* schedulers[] = {

 static_assert(std::size(schedulers) == SCHEDULER_COUNT, "schedulers mismatch");

+// New enum string arrays
+const char* rng_type_str[] = {
+    "std_default",
+    "cuda",
+    "cpu",
+};
+static_assert(std::size(rng_type_str) == RNG_TYPE_COUNT, "rng type mismatch");
+
+const char* prediction_str[] = {
+    "default",
+    "epsilon",
+    "v",
+    "edm_v",
+    "sd3_flow",
+    "flux_flow",
+    "flux2_flow",
+};
+static_assert(std::size(prediction_str) == PREDICTION_COUNT, "prediction mismatch");
+
+const char* lora_apply_mode_str[] = {
+    "auto",
+    "immediately",
+    "at_runtime",
+};
+static_assert(std::size(lora_apply_mode_str) == LORA_APPLY_MODE_COUNT, "lora apply mode mismatch");
+
+constexpr const char* sd_type_str[] = {
+    "f32",      // 0
+    "f16",      // 1
+    "q4_0",     // 2
+    "q4_1",     // 3
+    nullptr,    // 4
+    nullptr,    // 5
+    "q5_0",     // 6
+    "q5_1",     // 7
+    "q8_0",     // 8
+    "q8_1",     // 9
+    "q2_k",     // 10
+    "q3_k",     // 11
+    "q4_k",     // 12
+    "q5_k",     // 13
+    "q6_k",     // 14
+    "q8_k",     // 15
+    "iq2_xxs",  // 16
+    "iq2_xs",   // 17
+    "iq3_xxs",  // 18
+    "iq1_s",    // 19
+    "iq4_nl",   // 20
+    "iq3_s",    // 21
+    "iq2_s",    // 22
+    "iq4_xs",   // 23
+    "i8",       // 24
+    "i16",      // 25
+    "i32",      // 26
+    "i64",      // 27
+    "f64",      // 28
+    "iq1_m",    // 29
+    "bf16",     // 30
+    nullptr, nullptr, nullptr, nullptr,  // 31-34
+    "tq1_0",    // 35
+    "tq2_0",    // 36
+    nullptr, nullptr,           // 37-38
+    "mxfp4"     // 39
+};
+static_assert(std::size(sd_type_str) == SD_TYPE_COUNT, "sd type mismatch");
+
+sd_ctx_params_t ctx_params;
 sd_ctx_t* sd_c;
 // Moved from the context (load time) to generation time params
 scheduler_t scheduler = SCHEDULER_COUNT;
@@ -99,7 +168,7 @@ int load_model(const char *model, char *model_path, char* options[], int threads

    const char *stableDiffusionModel = "";
    if (diff == 1 ) {
-        stableDiffusionModel = model;
+        stableDiffusionModel = strdup(model);
        model = "";
    }

@@ -110,8 +179,38 @@ int load_model(const char *model, char *model_path, char* options[], int threads
    const char *vae_path  = "";
    const char *scheduler_str = "";
    const char *sampler = "";
+    const char *clip_vision_path = "";
+    const char *llm_path = "";
+    const char *llm_vision_path = "";
+    const char *diffusion_model_path = stableDiffusionModel;
+    const char *high_noise_diffusion_model_path = "";
+    const char *taesd_path  = "";
+    const char *control_net_path = "";
+    const char *embedding_dir = "";
+    const char *photo_maker_path = "";
+    const char *tensor_type_rules = "";
    char *lora_dir = model_path;
-    bool lora_dir_allocated = false;
+
+    bool vae_decode_only = true;
+    int n_threads = threads;
+    enum sd_type_t wtype = SD_TYPE_COUNT;
+    enum rng_type_t rng_type = CUDA_RNG;
+    enum rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
+    enum prediction_t prediction = DEFAULT_PRED;
+    enum lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO;
+    bool offload_params_to_cpu = false;
+    bool keep_clip_on_cpu = false;
+    bool keep_control_net_on_cpu = false;
+    bool keep_vae_on_cpu = false;
+    bool diffusion_flash_attn = false;
+    bool tae_preview_only = false;
+    bool diffusion_conv_direct = false;
+    bool vae_conv_direct = false;
+    bool force_sdxl_vae_conv_scale = false;
+    bool chroma_use_dit_mask = true;
+    bool chroma_use_t5_mask = false;
+    int chroma_t5_mask_pad = 1;
+    float flow_shift = INFINITY;

    fprintf(stderr, "parsing options: %p\n", options);

@@ -124,16 +223,16 @@ int load_model(const char *model, char *model_path, char* options[], int threads
        }

        if (!strcmp(optname, "clip_l_path")) {
-            clip_l_path = optval;
+            clip_l_path = strdup(optval);
        }
        if (!strcmp(optname, "clip_g_path")) {
-            clip_g_path = optval;
+            clip_g_path = strdup(optval);
        }
        if (!strcmp(optname, "t5xxl_path")) {
-            t5xxl_path = optval;
+            t5xxl_path = strdup(optval);
        }
        if (!strcmp(optname, "vae_path")) {
-            vae_path = optval;
+            vae_path = strdup(optval);
        }
        if (!strcmp(optname, "scheduler")) {
            scheduler_str = optval;
@@ -148,43 +247,167 @@ int load_model(const char *model, char *model_path, char* options[], int threads
                std::filesystem::path lora_path(optval);
                std::filesystem::path full_lora_path = model_path_str / lora_path;
                lora_dir = strdup(full_lora_path.string().c_str());
-                lora_dir_allocated = true;
                fprintf(stderr, "Lora dir resolved to: %s\n", lora_dir);
            } else {
                lora_dir = strdup(optval);
-                lora_dir_allocated = true;
                fprintf(stderr, "No model path provided, using lora dir as-is: %s\n", lora_dir);
            }
        }
+
+        // New parsing
+        if (!strcmp(optname, "clip_vision_path")) clip_vision_path = strdup(optval);
+        if (!strcmp(optname, "llm_path")) llm_path = strdup(optval);
+        if (!strcmp(optname, "llm_vision_path")) llm_vision_path = strdup(optval);
+        if (!strcmp(optname, "diffusion_model_path")) diffusion_model_path = strdup(optval);
+        if (!strcmp(optname, "high_noise_diffusion_model_path")) high_noise_diffusion_model_path = strdup(optval);
+        if (!strcmp(optname, "taesd_path")) taesd_path = strdup(optval);
+        if (!strcmp(optname, "control_net_path")) control_net_path = strdup(optval);
+        if (!strcmp(optname, "embedding_dir")) embedding_dir = strdup(optval);
+        if (!strcmp(optname, "photo_maker_path")) photo_maker_path = strdup(optval);
+        if (!strcmp(optname, "tensor_type_rules")) tensor_type_rules = strdup(optval);
+
+        if (!strcmp(optname, "vae_decode_only")) vae_decode_only = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "offload_params_to_cpu")) offload_params_to_cpu = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "keep_clip_on_cpu")) keep_clip_on_cpu = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "keep_control_net_on_cpu")) keep_control_net_on_cpu = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "keep_vae_on_cpu")) keep_vae_on_cpu = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "diffusion_flash_attn")) diffusion_flash_attn = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "tae_preview_only")) tae_preview_only = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "diffusion_conv_direct")) diffusion_conv_direct = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "vae_conv_direct")) vae_conv_direct = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "force_sdxl_vae_conv_scale")) force_sdxl_vae_conv_scale = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "chroma_use_dit_mask")) chroma_use_dit_mask = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "chroma_use_t5_mask")) chroma_use_t5_mask = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+
+        if (!strcmp(optname, "n_threads")) n_threads = atoi(optval);
+        if (!strcmp(optname, "chroma_t5_mask_pad")) chroma_t5_mask_pad = atoi(optval);
+
+        if (!strcmp(optname, "flow_shift")) flow_shift = atof(optval);
+
+        if (!strcmp(optname, "rng_type")) {
+            int found = -1;
+            for (int m = 0; m < RNG_TYPE_COUNT; m++) {
+                if (!strcmp(optval, rng_type_str[m])) {
+                    found = m;
+                    break;
+                }
+            }
+            if (found != -1) {
+                rng_type = (rng_type_t)found;
+                fprintf(stderr, "Found rng_type: %s\n", optval);
+            } else {
+                fprintf(stderr, "Invalid rng_type: %s, using default\n", optval);
+            }
+        }
+        if (!strcmp(optname, "sampler_rng_type")) {
+            int found = -1;
+            for (int m = 0; m < RNG_TYPE_COUNT; m++) {
+                if (!strcmp(optval, rng_type_str[m])) {
+                    found = m;
+                    break;
+                }
+            }
+            if (found != -1) {
+                sampler_rng_type = (rng_type_t)found;
+                fprintf(stderr, "Found sampler_rng_type: %s\n", optval);
+            } else {
+                fprintf(stderr, "Invalid sampler_rng_type: %s, using default\n", optval);
+            }
+        }
+        if (!strcmp(optname, "prediction")) {
+            int found = -1;
+            for (int m = 0; m < PREDICTION_COUNT; m++) {
+                if (!strcmp(optval, prediction_str[m])) {
+                    found = m;
+                    break;
+                }
+            }
+            if (found != -1) {
+                prediction = (prediction_t)found;
+                fprintf(stderr, "Found prediction: %s\n", optval);
+            } else {
+                fprintf(stderr, "Invalid prediction: %s, using default\n", optval);
+            }
+        }
+        if (!strcmp(optname, "lora_apply_mode")) {
+            int found = -1;
+            for (int m = 0; m < LORA_APPLY_MODE_COUNT; m++) {
+                if (!strcmp(optval, lora_apply_mode_str[m])) {
+                    found = m;
+                    break;
+                }
+            }
+            if (found != -1) {
+                lora_apply_mode = (lora_apply_mode_t)found;
+                fprintf(stderr, "Found lora_apply_mode: %s\n", optval);
+            } else {
+                fprintf(stderr, "Invalid lora_apply_mode: %s, using default\n", optval);
+            }
+        }
+        if (!strcmp(optname, "wtype")) {
+            int found = -1;
+            for (int m = 0; m < SD_TYPE_COUNT; m++) {
+                if (sd_type_str[m] && !strcmp(optval, sd_type_str[m])) {
+                    found = m;
+                    break;
+                }
+            }
+            if (found != -1) {
+                wtype = (sd_type_t)found;
+                fprintf(stderr, "Found wtype: %s\n", optval);
+            } else {
+                fprintf(stderr, "Invalid wtype: %s, using default\n", optval);
+            }
+        }
    }

    fprintf(stderr, "parsed options\n");

    fprintf (stderr, "Creating context\n");
-    sd_ctx_params_t ctx_params;
    sd_ctx_params_init(&ctx_params);
    ctx_params.model_path = model;
    ctx_params.clip_l_path = clip_l_path;
    ctx_params.clip_g_path = clip_g_path;
+    ctx_params.clip_vision_path = clip_vision_path;
    ctx_params.t5xxl_path = t5xxl_path;
-    ctx_params.diffusion_model_path = stableDiffusionModel;
+    ctx_params.llm_path = llm_path;
+    ctx_params.llm_vision_path = llm_vision_path;
+    ctx_params.diffusion_model_path = diffusion_model_path;
+    ctx_params.high_noise_diffusion_model_path = high_noise_diffusion_model_path;
    ctx_params.vae_path = vae_path;
-    ctx_params.taesd_path = "";
-    ctx_params.control_net_path = "";
+    ctx_params.taesd_path = taesd_path;
+    ctx_params.control_net_path = control_net_path;
    ctx_params.lora_model_dir = lora_dir;
-    ctx_params.embedding_dir = "";
-    ctx_params.vae_decode_only = false;
+    ctx_params.embedding_dir = embedding_dir;
+    ctx_params.photo_maker_path = photo_maker_path;
+    ctx_params.tensor_type_rules = tensor_type_rules;
+    ctx_params.vae_decode_only = vae_decode_only;
+    // XXX: Setting to true causes a segfault on the second run
    ctx_params.free_params_immediately = false;
-    ctx_params.n_threads = threads;
-    ctx_params.rng_type = STD_DEFAULT_RNG;
+    ctx_params.n_threads = n_threads;
+    ctx_params.rng_type = rng_type;
+    ctx_params.keep_clip_on_cpu = keep_clip_on_cpu;
+    if (wtype != SD_TYPE_COUNT) ctx_params.wtype = wtype;
+    if (sampler_rng_type != RNG_TYPE_COUNT) ctx_params.sampler_rng_type = sampler_rng_type;
+    if (prediction != PREDICTION_COUNT) ctx_params.prediction = prediction;
+    if (lora_apply_mode != LORA_APPLY_MODE_COUNT) ctx_params.lora_apply_mode = lora_apply_mode;
+    ctx_params.offload_params_to_cpu = offload_params_to_cpu;
+    ctx_params.keep_control_net_on_cpu = keep_control_net_on_cpu;
+    ctx_params.keep_vae_on_cpu = keep_vae_on_cpu;
+    ctx_params.diffusion_flash_attn = diffusion_flash_attn;
+    ctx_params.tae_preview_only = tae_preview_only;
+    ctx_params.diffusion_conv_direct = diffusion_conv_direct;
+    ctx_params.vae_conv_direct = vae_conv_direct;
+    ctx_params.force_sdxl_vae_conv_scale = force_sdxl_vae_conv_scale;
+    ctx_params.chroma_use_dit_mask = chroma_use_dit_mask;
+    ctx_params.chroma_use_t5_mask = chroma_use_t5_mask;
+    ctx_params.chroma_t5_mask_pad = chroma_t5_mask_pad;
+    ctx_params.flow_shift = flow_shift;
    sd_ctx_t* sd_ctx = new_sd_ctx(&ctx_params);

    if (sd_ctx == NULL) {
        fprintf (stderr, "failed loading model (generic error)\n");
-        // Clean up allocated memory
-        if (lora_dir_allocated && lora_dir) {
-            free(lora_dir);
-        }
+        // TODO: Clean up allocated memory
        return 1;
    }
    fprintf (stderr, "Created context: OK\n");
@@ -215,11 +438,6 @@ int load_model(const char *model, char *model_path, char* options[], int threads

    sd_c = sd_ctx;

-    // Clean up allocated memory
-    if (lora_dir_allocated && lora_dir) {
-        free(lora_dir);
-    }
-
    return 0;
 }

@@ -248,6 +466,9 @@ sd_tiling_params_t* sd_img_gen_params_get_vae_tiling_params(sd_img_gen_params_t
 sd_img_gen_params_t* sd_img_gen_params_new(void) {
    sd_img_gen_params_t *params = (sd_img_gen_params_t *)std::malloc(sizeof(sd_img_gen_params_t));
    sd_img_gen_params_init(params);
+    sd_sample_params_init(&params->sample_params);
+    sd_easycache_params_init(&params->easycache);
+    params->control_strength = 0.9f;
    return params;
 }

@@ -265,7 +486,7 @@ void sd_img_gen_params_set_seed(sd_img_gen_params_t *params, int64_t seed) {
    params->seed = seed;
 }

-int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char **ref_images, int ref_images_count) {
+int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char* ref_images[], int ref_images_count) {

    sd_image_t* results;

@@ -445,6 +666,10 @@ int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, cha
        }
    }

+    fprintf(stderr, "Generating image with params: \nctx\n---\n%s\ngen\n---\n%s\n",
+            sd_ctx_params_to_str(&ctx_params),
+            sd_img_gen_params_to_str(p));
+
    results = generate_image(sd_c, p);

    std::free(p);
@@ -477,9 +702,12 @@ int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, cha
    fprintf (stderr, "Channel: %d\n", results[0].channel);
    fprintf (stderr, "Data: %p\n", results[0].data);

-    stbi_write_png(dst, results[0].width, results[0].height, results[0].channel,
-                       results[0].data, 0, NULL);
-    fprintf (stderr, "Saved resulting image to '%s'\n", dst);
+    int ret = stbi_write_png(dst, results[0].width, results[0].height, results[0].channel,
+                             results[0].data, 0, NULL);
+    if (ret)
+      fprintf (stderr, "Saved resulting image to '%s'\n", dst);
+    else
+      fprintf(stderr, "Failed to write image to '%s'\n", dst);

    // Clean up
    free(results[0].data);
@@ -490,9 +718,10 @@ int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, cha
    for (auto buffer : ref_image_buffers) {
        if (buffer) free(buffer);
    }
-    fprintf (stderr, "gen_image is done: %s", dst);
+    fprintf (stderr, "gen_image is done: %s\n", dst);
+    fflush(stderr);

-    return 0;
+    return !ret;
 }

 int unload() {
--- a/backend/go/stablediffusion-ggml/gosd.go
+++ b/backend/go/stablediffusion-ggml/gosd.go
@@ -22,7 +22,7 @@ type SDGGML struct {

 var (
 	LoadModel func(model, model_apth string, options []uintptr, threads int32, diff int) int
-	GenImage  func(params uintptr, steps int, dst string, cfgScale float32, srcImage string, strength float32, maskImage string, refImages []string, refImagesCount int) int
+	GenImage  func(params uintptr, steps int, dst string, cfgScale float32, srcImage string, strength float32, maskImage string, refImages []uintptr, refImagesCount int) int

 	TilingParamsSetEnabled       func(params uintptr, enabled bool)
 	TilingParamsSetTileSizes     func(params uintptr, tileSizeX int, tileSizeY int)
@@ -95,12 +95,12 @@ func (sd *SDGGML) Load(opts *pb.ModelOptions) error {
 	sd.cfgScale = opts.CFGScale

 	ret := LoadModel(modelFile, modelPathC, options, opts.Threads, diffusionModel)
+	runtime.KeepAlive(keepAlive)
+	fmt.Fprintf(os.Stderr, "LoadModel: %d\n", ret)
 	if ret != 0 {
 		return fmt.Errorf("could not load model")
 	}

-	runtime.KeepAlive(keepAlive)
-
 	return nil
 }

@@ -123,10 +123,15 @@ func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error {
 		}
 	}

+	// At the time of writing, purego does not recurse into slices to convert Go strings into C pointers, so we do the conversion ourselves.
+	var keepAlive []any
 	refImagesCount := len(opts.RefImages)
-	refImages := make([]string, refImagesCount, refImagesCount+1)
-	copy(refImages, opts.RefImages)
-	*(*uintptr)(unsafe.Add(unsafe.Pointer(&refImages), refImagesCount)) = 0
+	refImages := make([]uintptr, refImagesCount, refImagesCount+1)
+	for i, ri := range opts.RefImages {
+		bytep := CString(ri)
+		refImages[i] = uintptr(unsafe.Pointer(bytep))
+		keepAlive = append(keepAlive, bytep)
+	}

 	// Default strength for img2img (0.75 is a good default)
 	strength := float32(0.75)
@@ -140,6 +145,8 @@ func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error {
 	TilingParamsSetEnabled(vaep, false)

 	ret := GenImage(p, int(opts.Step), dst, sd.cfgScale, srcImage, strength, maskImage, refImages, refImagesCount)
+	runtime.KeepAlive(keepAlive)
+	fmt.Fprintf(os.Stderr, "GenImage: %d\n", ret)
 	if ret != 0 {
 		return fmt.Errorf("inference failed")
 	}
--- a/backend/go/stablediffusion-ggml/gosd.h
+++ b/backend/go/stablediffusion-ggml/gosd.h
@@ -17,7 +17,7 @@ void sd_img_gen_params_set_dimensions(sd_img_gen_params_t *params, int width, in
 void sd_img_gen_params_set_seed(sd_img_gen_params_t *params, int64_t seed);

 int load_model(const char *model, char *model_path, char* options[], int threads, int diffusionModel);
-int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char **ref_images, int ref_images_count);
+int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char* ref_images[], int ref_images_count);
 #ifdef __cplusplus
 }
 #endif
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -20911,6 +20911,9 @@
  overrides:
    parameters:
      model: flux1-dev-Q2_K.gguf
+    options:
+      - scheduler:simple
+      - keep_clip_on_cpu:true
  files:
    - filename: "flux1-dev-Q2_K.gguf"
      sha256: "b8c464bc0f10076ef8f00ba040d220d90c7993f7c4245ae80227d857f65df105"
@@ -21078,6 +21081,32 @@
    - filename: t5xxl_fp16.safetensors
      sha256: 6e480b09fae049a72d2a8c5fbccb8d3e92febeb233bbe9dfe7256958a9167635
      uri: https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/t5xxl_fp16.safetensors
+- &zimage
+  name: Z-Image-Turbo
+  icon: https://z-image.ai/logo.png
+  license: apache-2.0
+  description: |
+    Z-Image is a powerful and highly efficient image generation model with 6B parameters. Currently there are three variants of which this is the Turbo edition.
+
+    🚀 Z-Image-Turbo – A distilled version of Z-Image that matches or exceeds leading competitors with only 8 NFEs (Number of Function Evaluations). It offers ⚡️sub-second inference latency⚡️ on enterprise-grade H800 GPUs and fits comfortably within 16G VRAM consumer devices. It excels in photorealistic image generation, bilingual text rendering (English & Chinese), and robust instruction adherence.
+  urls:
+    - https://github.com/Tongyi-MAI/Z-Image
+  tags:
+    - text-to-image
+    - z-image
+    - gpu
+  url: "github:mudler/LocalAI/gallery/z-image-ggml.yaml@master"
+  files:
+    - filename: Qwen3-4B.Q4_K_M.gguf
+      sha256: a37931937683a723ae737a0c6fc67dab7782fd8a1b9dea2ca445b7a1dbd5ca3a
+      uri: huggingface://MaziyarPanahi/Qwen3-4B-GGUF/Qwen3-4B.Q4_K_M.gguf
+    - filename: z_image_turbo-Q4_0.gguf
+      sha256: 14b375ab4f226bc5378f68f37e899ef3c2242b8541e61e2bc1aff40976086fbd
+      uri: https://huggingface.co/leejet/Z-Image-Turbo-GGUF/resolve/main/z_image_turbo-Q4_0.gguf
+    - filename: ae.safetensors
+      sha256: afc8e28272cd15db3919bacdb6918ce9c1ed22e96cb12c4d5ed0fba823529e38
+      uri: https://huggingface.co/ChuckMcSneed/FLUX.1-dev/resolve/main/ae.safetensors
+
 - &whisper
  url: "github:mudler/LocalAI/gallery/whisper-base.yaml@master" ## Whisper
  name: "whisper-1"
--- a/gallery/z-image-ggml.yaml
+++ b/gallery/z-image-ggml.yaml
@@ -0,0 +1,15 @@
+---
+name: "Z-Image-GGML"
+# `parameters.model` must name a file the gallery entry downloads (z_image_turbo-Q4_0.gguf).
+config_file: |
+  backend: stablediffusion-ggml
+  cfg_scale: 1
+  name: z-image-test
+  options:
+      - diffusion_model
+      - llm_path:Qwen3-4B.Q4_K_M.gguf
+      - vae_path:ae.safetensors
+      - offload_params_to_cpu:true
+  parameters:
+      model: z_image_turbo-Q4_0.gguf
+  step: 25