diff --git a/backend/backend.proto b/backend/backend.proto index 4acd8504d..77bf3fefa 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -305,6 +305,9 @@ message GenerateImageRequest { // Diffusers string EnableParameters = 10; int32 CLIPSkip = 11; + + // Reference images for models that support them (e.g., Flux Kontext) + repeated string ref_images = 12; } message GenerateVideoRequest { diff --git a/backend/go/stablediffusion-ggml/gosd.cpp b/backend/go/stablediffusion-ggml/gosd.cpp index e3e665d7a..12e355a42 100644 --- a/backend/go/stablediffusion-ggml/gosd.cpp +++ b/backend/go/stablediffusion-ggml/gosd.cpp @@ -198,7 +198,7 @@ int load_model(char *model, char* options[], int threads, int diff) { return 0; } -int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed , char *dst, float cfg_scale) { +int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed , char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char **ref_images, int ref_images_count) { sd_image_t* results; @@ -221,15 +221,187 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps, p.seed = seed; p.input_id_images_path = ""; + // Handle input image for img2img + bool has_input_image = (src_image != NULL && strlen(src_image) > 0); + bool has_mask_image = (mask_image != NULL && strlen(mask_image) > 0); + + uint8_t* input_image_buffer = NULL; + uint8_t* mask_image_buffer = NULL; + std::vector default_mask_image_vec; + + if (has_input_image) { + fprintf(stderr, "Loading input image: %s\n", src_image); + + int c = 0; + int img_width = 0; + int img_height = 0; + input_image_buffer = stbi_load(src_image, &img_width, &img_height, &c, 3); + if (input_image_buffer == NULL) { + fprintf(stderr, "Failed to load input image from '%s'\n", src_image); + return 1; + } + if (c < 3) { + fprintf(stderr, "Input image must have at least 3 channels, got %d\n", c); + free(input_image_buffer); + 
return 1; + } + + // Resize input image if dimensions don't match + if (img_width != width || img_height != height) { + fprintf(stderr, "Resizing input image from %dx%d to %dx%d\n", img_width, img_height, width, height); + + uint8_t* resized_image_buffer = (uint8_t*)malloc(height * width * 3); + if (resized_image_buffer == NULL) { + fprintf(stderr, "Failed to allocate memory for resized image\n"); + free(input_image_buffer); + return 1; + } + + stbir_resize(input_image_buffer, img_width, img_height, 0, + resized_image_buffer, width, height, 0, STBIR_TYPE_UINT8, + 3, STBIR_ALPHA_CHANNEL_NONE, 0, + STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, + STBIR_FILTER_BOX, STBIR_FILTER_BOX, + STBIR_COLORSPACE_SRGB, nullptr); + + free(input_image_buffer); + input_image_buffer = resized_image_buffer; + } + + p.init_image = {(uint32_t)width, (uint32_t)height, 3, input_image_buffer}; + p.strength = strength; + fprintf(stderr, "Using img2img with strength: %.2f\n", strength); + } else { + // No input image, use empty image for text-to-image + p.init_image = {(uint32_t)width, (uint32_t)height, 3, NULL}; + p.strength = 0.0f; + } + + // Handle mask image for inpainting + if (has_mask_image) { + fprintf(stderr, "Loading mask image: %s\n", mask_image); + + int c = 0; + int mask_width = 0; + int mask_height = 0; + mask_image_buffer = stbi_load(mask_image, &mask_width, &mask_height, &c, 1); + if (mask_image_buffer == NULL) { + fprintf(stderr, "Failed to load mask image from '%s'\n", mask_image); + if (input_image_buffer) free(input_image_buffer); + return 1; + } + + // Resize mask if dimensions don't match + if (mask_width != width || mask_height != height) { + fprintf(stderr, "Resizing mask image from %dx%d to %dx%d\n", mask_width, mask_height, width, height); + + uint8_t* resized_mask_buffer = (uint8_t*)malloc(height * width); + if (resized_mask_buffer == NULL) { + fprintf(stderr, "Failed to allocate memory for resized mask\n"); + free(mask_image_buffer); + if (input_image_buffer) 
free(input_image_buffer); + return 1; + } + + stbir_resize(mask_image_buffer, mask_width, mask_height, 0, + resized_mask_buffer, width, height, 0, STBIR_TYPE_UINT8, + 1, STBIR_ALPHA_CHANNEL_NONE, 0, + STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, + STBIR_FILTER_BOX, STBIR_FILTER_BOX, + STBIR_COLORSPACE_SRGB, nullptr); + + free(mask_image_buffer); + mask_image_buffer = resized_mask_buffer; + } + + p.mask_image = {(uint32_t)width, (uint32_t)height, 1, mask_image_buffer}; + fprintf(stderr, "Using inpainting with mask\n"); + } else { + // No mask image, create default full mask + default_mask_image_vec.resize(width * height, 255); + p.mask_image = {(uint32_t)width, (uint32_t)height, 1, default_mask_image_vec.data()}; + } + + // Handle reference images + std::vector ref_images_vec; + std::vector ref_image_buffers; + + if (ref_images_count > 0 && ref_images != NULL) { + fprintf(stderr, "Loading %d reference images\n", ref_images_count); + + for (int i = 0; i < ref_images_count; i++) { + if (ref_images[i] == NULL || strlen(ref_images[i]) == 0) { + continue; + } + + fprintf(stderr, "Loading reference image %d: %s\n", i + 1, ref_images[i]); + + int c = 0; + int ref_width = 0; + int ref_height = 0; + uint8_t* ref_image_buffer = stbi_load(ref_images[i], &ref_width, &ref_height, &c, 3); + if (ref_image_buffer == NULL) { + fprintf(stderr, "Failed to load reference image from '%s'\n", ref_images[i]); + continue; + } + if (c < 3) { + fprintf(stderr, "Reference image must have at least 3 channels, got %d\n", c); + free(ref_image_buffer); + continue; + } + + // Resize reference image if dimensions don't match + if (ref_width != width || ref_height != height) { + fprintf(stderr, "Resizing reference image from %dx%d to %dx%d\n", ref_width, ref_height, width, height); + + uint8_t* resized_ref_buffer = (uint8_t*)malloc(height * width * 3); + if (resized_ref_buffer == NULL) { + fprintf(stderr, "Failed to allocate memory for resized reference image\n"); + free(ref_image_buffer); + continue; + } + 
+ stbir_resize(ref_image_buffer, ref_width, ref_height, 0, + resized_ref_buffer, width, height, 0, STBIR_TYPE_UINT8, + 3, STBIR_ALPHA_CHANNEL_NONE, 0, + STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, + STBIR_FILTER_BOX, STBIR_FILTER_BOX, + STBIR_COLORSPACE_SRGB, nullptr); + + free(ref_image_buffer); + ref_image_buffer = resized_ref_buffer; + } + + ref_image_buffers.push_back(ref_image_buffer); + ref_images_vec.push_back({(uint32_t)width, (uint32_t)height, 3, ref_image_buffer}); + } + + if (!ref_images_vec.empty()) { + p.ref_images = ref_images_vec.data(); + p.ref_images_count = ref_images_vec.size(); + fprintf(stderr, "Using %zu reference images\n", ref_images_vec.size()); + } + } + results = generate_image(sd_c, &p); if (results == NULL) { fprintf (stderr, "NO results\n"); + if (input_image_buffer) free(input_image_buffer); + if (mask_image_buffer) free(mask_image_buffer); + for (auto buffer : ref_image_buffers) { + if (buffer) free(buffer); + } return 1; } if (results[0].data == NULL) { fprintf (stderr, "Results with no data\n"); + if (input_image_buffer) free(input_image_buffer); + if (mask_image_buffer) free(mask_image_buffer); + for (auto buffer : ref_image_buffers) { + if (buffer) free(buffer); + } return 1; } @@ -245,11 +417,15 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps, results[0].data, 0, NULL); fprintf (stderr, "Saved resulting image to '%s'\n", dst); - // TODO: free results. Why does it crash? 
- + // Clean up free(results[0].data); results[0].data = NULL; free(results); + if (input_image_buffer) free(input_image_buffer); + if (mask_image_buffer) free(mask_image_buffer); + for (auto buffer : ref_image_buffers) { + if (buffer) free(buffer); + } fprintf (stderr, "gen_image is done", dst); return 0; diff --git a/backend/go/stablediffusion-ggml/gosd.go b/backend/go/stablediffusion-ggml/gosd.go index fa4d0e72e..43f01fee2 100644 --- a/backend/go/stablediffusion-ggml/gosd.go +++ b/backend/go/stablediffusion-ggml/gosd.go @@ -38,7 +38,7 @@ func (sd *SDGGML) Load(opts *pb.ModelOptions) error { size := C.size_t(unsafe.Sizeof((*C.char)(nil))) length := C.size_t(len(opts.Options)) options = (**C.char)(C.malloc((length + 1) * size)) - view := (*[1 << 30]*C.char)(unsafe.Pointer(options))[0:len(opts.Options) + 1:len(opts.Options) + 1] + view := (*[1 << 30]*C.char)(unsafe.Pointer(options))[0 : len(opts.Options)+1 : len(opts.Options)+1] var diffusionModel int @@ -88,7 +88,56 @@ func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error { negative := C.CString(opts.NegativePrompt) defer C.free(unsafe.Pointer(negative)) - ret := C.gen_image(t, negative, C.int(opts.Width), C.int(opts.Height), C.int(opts.Step), C.int(opts.Seed), dst, C.float(sd.cfgScale)) + // Handle source image path + var srcImage *C.char + if opts.Src != "" { + srcImage = C.CString(opts.Src) + defer C.free(unsafe.Pointer(srcImage)) + } + + // Handle mask image path + var maskImage *C.char + if opts.EnableParameters != "" { + // Parse EnableParameters for mask path if provided + // This is a simple approach - in a real implementation you might want to parse JSON + if strings.Contains(opts.EnableParameters, "mask:") { + parts := strings.Split(opts.EnableParameters, "mask:") + if len(parts) > 1 { + maskPath := strings.TrimSpace(parts[1]) + if maskPath != "" { + maskImage = C.CString(maskPath) + defer C.free(unsafe.Pointer(maskImage)) + } + } + } + } + + // Handle reference images + var refImages 
**C.char + var refImagesCount C.int + if len(opts.RefImages) > 0 { + refImagesCount = C.int(len(opts.RefImages)) + // Allocate array of C strings + size := C.size_t(unsafe.Sizeof((*C.char)(nil))) + refImages = (**C.char)(C.malloc((C.size_t(len(opts.RefImages)) + 1) * size)) + view := (*[1 << 30]*C.char)(unsafe.Pointer(refImages))[0 : len(opts.RefImages)+1 : len(opts.RefImages)+1] + + for i, refImagePath := range opts.RefImages { + view[i] = C.CString(refImagePath) + defer C.free(unsafe.Pointer(view[i])) + } + view[len(opts.RefImages)] = nil + } + + // Default strength for img2img (0.75 is a good default) + strength := C.float(0.75) + if opts.Src != "" { + // If we have a source image, use img2img mode + // You could also parse strength from EnableParameters if needed + strength = C.float(0.75) + } + + ret := C.gen_image(t, negative, C.int(opts.Width), C.int(opts.Height), C.int(opts.Step), C.int(opts.Seed), dst, C.float(sd.cfgScale), srcImage, strength, maskImage, refImages, refImagesCount) if ret != 0 { return fmt.Errorf("inference failed") } diff --git a/backend/go/stablediffusion-ggml/gosd.h b/backend/go/stablediffusion-ggml/gosd.h index 5297e8711..8208bd27a 100644 --- a/backend/go/stablediffusion-ggml/gosd.h +++ b/backend/go/stablediffusion-ggml/gosd.h @@ -2,7 +2,7 @@ extern "C" { #endif int load_model(char *model, char* options[], int threads, int diffusionModel); -int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed, char *dst, float cfg_scale); +int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char **ref_images, int ref_images_count); #ifdef __cplusplus } #endif \ No newline at end of file diff --git a/core/backend/image.go b/core/backend/image.go index 4b34f2cf4..9f838a373 100644 --- a/core/backend/image.go +++ b/core/backend/image.go @@ -7,7 +7,7 @@ import ( model "github.com/mudler/LocalAI/pkg/model" ) 
-func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) { +func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig, refImages []string) (func() error, error) { opts := ModelOptions(backendConfig, appConfig) inferenceModel, err := loader.Load( @@ -33,6 +33,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat Dst: dst, Src: src, EnableParameters: backendConfig.Diffusers.EnableParameters, + RefImages: refImages, }) return err } diff --git a/core/http/endpoints/openai/image.go b/core/http/endpoints/openai/image.go index 3ac07cdc5..fa641d8fb 100644 --- a/core/http/endpoints/openai/image.go +++ b/core/http/endpoints/openai/image.go @@ -79,49 +79,37 @@ func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appCon return fiber.ErrBadRequest } + // Process input images (for img2img/inpainting) src := "" if input.File != "" { + src = processImageFile(input.File, appConfig.GeneratedContentDir) + if src != "" { + defer os.RemoveAll(src) + } + } - fileData := []byte{} - var err error - // check if input.File is an URL, if so download it and save it - // to a temporary file - if strings.HasPrefix(input.File, "http://") || strings.HasPrefix(input.File, "https://") { - out, err := downloadFile(input.File) - if err != nil { - return fmt.Errorf("failed downloading file:%w", err) - } - defer os.RemoveAll(out) - - fileData, err = os.ReadFile(out) - if err != nil { - return fmt.Errorf("failed reading file:%w", err) - } - - } else { - // base 64 decode the file and write it somewhere - // that we will cleanup - fileData, err = base64.StdEncoding.DecodeString(input.File) - if err != nil { - return err + // Process 
multiple input images + var inputImages []string + if len(input.Files) > 0 { + for _, file := range input.Files { + processedFile := processImageFile(file, appConfig.GeneratedContentDir) + if processedFile != "" { + inputImages = append(inputImages, processedFile) + defer os.RemoveAll(processedFile) } } + } - // Create a temporary file - outputFile, err := os.CreateTemp(appConfig.GeneratedContentDir, "b64") - if err != nil { - return err + // Process reference images + var refImages []string + if len(input.RefImages) > 0 { + for _, file := range input.RefImages { + processedFile := processImageFile(file, appConfig.GeneratedContentDir) + if processedFile != "" { + refImages = append(refImages, processedFile) + defer os.RemoveAll(processedFile) + } } - // write the base64 result - writer := bufio.NewWriter(outputFile) - _, err = writer.Write(fileData) - if err != nil { - outputFile.Close() - return err - } - outputFile.Close() - src = outputFile.Name() - defer os.RemoveAll(src) } log.Debug().Msgf("Parameter Config: %+v", config) @@ -202,7 +190,13 @@ func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appCon baseURL := c.BaseURL() - fn, err := backend.ImageGeneration(height, width, mode, step, *config.Seed, positive_prompt, negative_prompt, src, output, ml, *config, appConfig) + // Use the first input image as src if available, otherwise use the original src + inputSrc := src + if len(inputImages) > 0 { + inputSrc = inputImages[0] + } + + fn, err := backend.ImageGeneration(height, width, mode, step, *config.Seed, positive_prompt, negative_prompt, inputSrc, output, ml, *config, appConfig, refImages) if err != nil { return err } @@ -243,3 +237,51 @@ func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appCon return c.JSON(resp) } } + +// processImageFile handles a single image file (URL or base64) and returns the path to the temporary file +func processImageFile(file string, generatedContentDir string) string { + fileData := 
[]byte{} + var err error + + // check if file is an URL, if so download it and save it to a temporary file + if strings.HasPrefix(file, "http://") || strings.HasPrefix(file, "https://") { + out, err := downloadFile(file) + if err != nil { + log.Error().Err(err).Msgf("Failed downloading file: %s", file) + return "" + } + defer os.RemoveAll(out) + + fileData, err = os.ReadFile(out) + if err != nil { + log.Error().Err(err).Msgf("Failed reading downloaded file: %s", out) + return "" + } + } else { + // base 64 decode the file and write it somewhere that we will cleanup + fileData, err = base64.StdEncoding.DecodeString(file) + if err != nil { + log.Error().Err(err).Msgf("Failed decoding base64 file") + return "" + } + } + + // Create a temporary file + outputFile, err := os.CreateTemp(generatedContentDir, "b64") + if err != nil { + log.Error().Err(err).Msg("Failed creating temporary file") + return "" + } + + // write the base64 result + writer := bufio.NewWriter(outputFile) + _, err = writer.Write(fileData) + if err != nil { + outputFile.Close() + log.Error().Err(err).Msg("Failed writing to temporary file") + return "" + } + outputFile.Close() + + return outputFile.Name() +} diff --git a/core/schema/openai.go b/core/schema/openai.go index c54b52eb8..44b54d188 100644 --- a/core/schema/openai.go +++ b/core/schema/openai.go @@ -141,6 +141,10 @@ type OpenAIRequest struct { // whisper File string `json:"file" validate:"required"` + // Multiple input images for img2img or inpainting + Files []string `json:"files,omitempty"` + // Reference images for models that support them (e.g., Flux Kontext) + RefImages []string `json:"ref_images,omitempty"` //whisper/image ResponseFormat interface{} `json:"response_format,omitempty"` // image diff --git a/gallery/index.yaml b/gallery/index.yaml index 07e0405e4..36638cd4e 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -19144,6 +19144,43 @@ overrides: parameters: model: SicariusSicariiStuff/flux.1dev-abliteratedv2 +- name: 
flux.1-kontext-dev + license: flux-1-dev-non-commercial-license + url: "github:mudler/LocalAI/gallery/flux-ggml.yaml@master" + icon: https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev/media/main/teaser.png + description: | + FLUX.1 Kontext [dev] is a 12-billion-parameter rectified flow transformer capable of editing images based on text instructions. For more information, see the blog post and technical report linked from the model card. Information about the [pro] version is linked there as well. + Key Features + Change existing images based on an edit instruction. + Use character, style, and object references without any fine-tuning. + Robust consistency allows users to refine an image through multiple successive edits with minimal visual drift. + Trained using guidance distillation, making FLUX.1 Kontext [dev] more efficient. + Open weights to drive new scientific research, and empower artists to develop innovative workflows. + Generated outputs can be used for personal, scientific, and commercial purposes, as described in the FLUX.1 [dev] Non-Commercial License. 
+ urls: + - https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev + - https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF + tags: + - image-to-image + - flux + - gpu + - cpu + overrides: + parameters: + model: flux1-kontext-dev-Q8_0.gguf + files: + - filename: "flux1-kontext-dev-Q8_0.gguf" + sha256: "ff2ff71c3755c8ab394398a412252c23382a83138b65190b16e736d457b80f73" + uri: "huggingface://QuantStack/FLUX.1-Kontext-dev-GGUF/flux1-kontext-dev-Q8_0.gguf" + - filename: ae.safetensors + sha256: afc8e28272cd15db3919bacdb6918ce9c1ed22e96cb12c4d5ed0fba823529e38 + uri: https://huggingface.co/ChuckMcSneed/FLUX.1-dev/resolve/main/ae.safetensors + - filename: clip_l.safetensors + sha256: 660c6f5b1abae9dc498ac2d21e1347d2abdb0cf6c0c0c8576cd796491d9a6cdd + uri: https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/clip_l.safetensors + - filename: t5xxl_fp16.safetensors + sha256: 6e480b09fae049a72d2a8c5fbccb8d3e92febeb233bbe9dfe7256958a9167635 + uri: https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/t5xxl_fp16.safetensors - &whisper url: "github:mudler/LocalAI/gallery/whisper-base.yaml@master" ## Whisper name: "whisper-1"