feat: import models via URI (#7245)

* feat: initial hook to install elements directly Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * WIP: ui changes Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Move HF api client to pkg Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Add simple importer for gguf files Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Add opcache Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * wire importers to CLI Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Add omitempty to config fields Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Fix tests Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Add MLX importer Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Small refactors to star to use HF for discovery Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Add tests Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Common preferences Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Add support to bare HF repos Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(importer/llama.cpp): add support for mmproj files Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * add mmproj quants to common preferences Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Fix vlm usage in tokenizer mode with llama.cpp Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-01-01 07:01:09 -06:00 · 2025-11-12 20:48:56 +01:00
parent 87d0020c10
commit 3728552e94
40 changed files with 1970 additions and 694 deletions
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -590,17 +590,71 @@ public:
                // Convert proto Messages to JSON format compatible with oaicompat_chat_params_parse
                json body_json;
                json messages_json = json::array();
+                
+                // Find the last user message index to attach images/audio to
+                int last_user_msg_idx = -1;
+                for (int i = request->messages_size() - 1; i >= 0; i--) {
+                    if (request->messages(i).role() == "user") {
+                        last_user_msg_idx = i;
+                        break;
+                    }
+                }
+                
                for (int i = 0; i < request->messages_size(); i++) {
                    const auto& msg = request->messages(i);
                    json msg_json;
                    msg_json["role"] = msg.role();
                    
+                    bool is_last_user_msg = (i == last_user_msg_idx);
+                    bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0);
+                    
                    // Handle content - can be string, null, or array
                    // For multimodal content, we'll embed images/audio from separate fields
                    if (!msg.content().empty()) {
-                        msg_json["content"] = msg.content();
-                    } else if (request->images_size() > 0 || request->audios_size() > 0) {
-                        // If no content but has images/audio, create content array
+                        // Try to parse content as JSON to see if it's already an array
+                        json content_val;
+                        try {
+                            content_val = json::parse(msg.content());
+                        } catch (const json::parse_error&) {
+                            // Not JSON, treat as plain string
+                            content_val = msg.content();
+                        }
+                        
+                        // If content is a string and this is the last user message with images/audio, combine them
+                        if (content_val.is_string() && is_last_user_msg && has_images_or_audio) {
+                            json content_array = json::array();
+                            // Add text first
+                            content_array.push_back({{"type", "text"}, {"text", content_val.get<std::string>()}});
+                            // Add images
+                            if (request->images_size() > 0) {
+                                for (int j = 0; j < request->images_size(); j++) {
+                                    json image_chunk;
+                                    image_chunk["type"] = "image_url";
+                                    json image_url;
+                                    image_url["url"] = "data:image/jpeg;base64," + request->images(j);
+                                    image_chunk["image_url"] = image_url;
+                                    content_array.push_back(image_chunk);
+                                }
+                            }
+                            // Add audios
+                            if (request->audios_size() > 0) {
+                                for (int j = 0; j < request->audios_size(); j++) {
+                                    json audio_chunk;
+                                    audio_chunk["type"] = "input_audio";
+                                    json input_audio;
+                                    input_audio["data"] = request->audios(j);
+                                    input_audio["format"] = "wav"; // default, could be made configurable
+                                    audio_chunk["input_audio"] = input_audio;
+                                    content_array.push_back(audio_chunk);
+                                }
+                            }
+                            msg_json["content"] = content_array;
+                        } else {
+                            // Use content as-is (already array or not last user message)
+                            msg_json["content"] = content_val;
+                        }
+                    } else if (is_last_user_msg && has_images_or_audio) {
+                        // If no content but this is the last user message with images/audio, create content array
                        json content_array = json::array();
                        if (request->images_size() > 0) {
                            for (int j = 0; j < request->images_size(); j++) {
@@ -718,6 +772,9 @@ public:
                // Create parser options with current chat_templates to ensure tmpls is not null
                oaicompat_parser_options parser_opt = ctx_server.oai_parser_opt;
                parser_opt.tmpls = ctx_server.chat_templates.get(); // Ensure tmpls is set to current chat_templates
+                // Update allow_image and allow_audio based on current mctx state
+                parser_opt.allow_image = ctx_server.mctx ? mtmd_support_vision(ctx_server.mctx) : false;
+                parser_opt.allow_audio = ctx_server.mctx ? mtmd_support_audio(ctx_server.mctx) : false;
                json parsed_data = oaicompat_chat_params_parse(body_json, parser_opt, files);
                
                // Extract the prompt from parsed data
@@ -758,7 +815,7 @@ public:

            // If not using chat templates, extract files from image_data/audio_data fields
            // (If using chat templates, files were already extracted by oaicompat_chat_params_parse)
-            //if (!request->usetokenizertemplate() || request->messages_size() == 0 || ctx_server.chat_templates == nullptr) {
+            if (!request->usetokenizertemplate() || request->messages_size() == 0 || ctx_server.chat_templates == nullptr) {
                const auto &images_data = data.find("image_data");
                if (images_data != data.end() && images_data->is_array())
                {
@@ -778,7 +835,7 @@ public:
                        files.push_back(decoded_data);
                    }
                }
-           // }
+            }

            const bool has_mtmd = ctx_server.mctx != nullptr;

@@ -917,17 +974,71 @@ public:
                // Convert proto Messages to JSON format compatible with oaicompat_chat_params_parse
                json body_json;
                json messages_json = json::array();
+                
+                // Find the last user message index to attach images/audio to
+                int last_user_msg_idx = -1;
+                for (int i = request->messages_size() - 1; i >= 0; i--) {
+                    if (request->messages(i).role() == "user") {
+                        last_user_msg_idx = i;
+                        break;
+                    }
+                }
+                
                for (int i = 0; i < request->messages_size(); i++) {
                    const auto& msg = request->messages(i);
                    json msg_json;
                    msg_json["role"] = msg.role();
                    
+                    bool is_last_user_msg = (i == last_user_msg_idx);
+                    bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0);
+                    
                    // Handle content - can be string, null, or array
                    // For multimodal content, we'll embed images/audio from separate fields
                    if (!msg.content().empty()) {
-                        msg_json["content"] = msg.content();
-                    } else if (request->images_size() > 0 || request->audios_size() > 0) {
-                        // If no content but has images/audio, create content array
+                        // Try to parse content as JSON to see if it's already an array
+                        json content_val;
+                        try {
+                            content_val = json::parse(msg.content());
+                        } catch (const json::parse_error&) {
+                            // Not JSON, treat as plain string
+                            content_val = msg.content();
+                        }
+                        
+                        // If content is a string and this is the last user message with images/audio, combine them
+                        if (content_val.is_string() && is_last_user_msg && has_images_or_audio) {
+                            json content_array = json::array();
+                            // Add text first
+                            content_array.push_back({{"type", "text"}, {"text", content_val.get<std::string>()}});
+                            // Add images
+                            if (request->images_size() > 0) {
+                                for (int j = 0; j < request->images_size(); j++) {
+                                    json image_chunk;
+                                    image_chunk["type"] = "image_url";
+                                    json image_url;
+                                    image_url["url"] = "data:image/jpeg;base64," + request->images(j);
+                                    image_chunk["image_url"] = image_url;
+                                    content_array.push_back(image_chunk);
+                                }
+                            }
+                            // Add audios
+                            if (request->audios_size() > 0) {
+                                for (int j = 0; j < request->audios_size(); j++) {
+                                    json audio_chunk;
+                                    audio_chunk["type"] = "input_audio";
+                                    json input_audio;
+                                    input_audio["data"] = request->audios(j);
+                                    input_audio["format"] = "wav"; // default, could be made configurable
+                                    audio_chunk["input_audio"] = input_audio;
+                                    content_array.push_back(audio_chunk);
+                                }
+                            }
+                            msg_json["content"] = content_array;
+                        } else {
+                            // Use content as-is (already array or not last user message)
+                            msg_json["content"] = content_val;
+                        }
+                    } else if (is_last_user_msg && has_images_or_audio) {
+                        // If no content but this is the last user message with images/audio, create content array
                        json content_array = json::array();
                        if (request->images_size() > 0) {
                            for (int j = 0; j < request->images_size(); j++) {
@@ -1048,6 +1159,9 @@ public:
                // Create parser options with current chat_templates to ensure tmpls is not null
                oaicompat_parser_options parser_opt = ctx_server.oai_parser_opt;
                parser_opt.tmpls = ctx_server.chat_templates.get(); // Ensure tmpls is set to current chat_templates
+                // Update allow_image and allow_audio based on current mctx state
+                parser_opt.allow_image = ctx_server.mctx ? mtmd_support_vision(ctx_server.mctx) : false;
+                parser_opt.allow_audio = ctx_server.mctx ? mtmd_support_audio(ctx_server.mctx) : false;
                json parsed_data = oaicompat_chat_params_parse(body_json, parser_opt, files);
                
                // Extract the prompt from parsed data
@@ -1088,7 +1202,7 @@ public:

            // If not using chat templates, extract files from image_data/audio_data fields
            // (If using chat templates, files were already extracted by oaicompat_chat_params_parse)
-           // if (!request->usetokenizertemplate() || request->messages_size() == 0 || ctx_server.chat_templates == nullptr) {
+            if (!request->usetokenizertemplate() || request->messages_size() == 0 || ctx_server.chat_templates == nullptr) {
                const auto &images_data = data.find("image_data");
                if (images_data != data.end() && images_data->is_array())
                {
@@ -1110,7 +1224,7 @@ public:
                        files.push_back(decoded_data);
                    }
                }
-           // }
+            }

            // process files
            const bool has_mtmd = ctx_server.mctx != nullptr;