fix: Diffusers and XPU fixes (#5737)

* fix(README): Add device flags for Intel/XPU

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(diffusers/xpu): Set device to XPU and ignore CUDA request when on Intel

Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
Richard Palethorpe committed 2025-07-01 11:36:17 +01:00 (committed by GitHub)
parent 9f957d547d
commit b37cef3718
3 changed files with 16 additions and 11 deletions

README.md

@@ -141,10 +141,10 @@ docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri
 ```bash
 # Intel GPU with FP16 support
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f16
+docker run -ti --name local-ai -p 8080:8080 --device=/dev/dri/card1 --device=/dev/dri/renderD128 localai/localai:latest-gpu-intel-f16
 # Intel GPU with FP32 support
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f32
+docker run -ti --name local-ai -p 8080:8080 --device=/dev/dri/card1 --device=/dev/dri/renderD128 localai/localai:latest-gpu-intel-f32
 ```
 ### Vulkan GPU Images:
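
A note on the Intel `--device` flags added above: the DRI node names (`card1`, `renderD128`) are host-specific, so it is worth checking which nodes actually exist before copying the command. A quick way to list them (plain Python, nothing LocalAI-specific assumed):

```python
# List the host's DRI nodes so the right --device flags can be passed to
# docker run; names like card0/card1 and renderD128/renderD129 vary per machine.
import os

print(os.listdir("/dev/dri"))  # e.g. ['card1', 'renderD128']
```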

backend/python/diffusers/backend.py

@@ -38,9 +38,7 @@ DISABLE_CPU_OFFLOAD = os.environ.get("DISABLE_CPU_OFFLOAD", "0") == "1"
 FRAMES = os.environ.get("FRAMES", "64")
 if XPU:
-    import intel_extension_for_pytorch as ipex
-    print(ipex.xpu.get_device_name(0))
+    print(torch.xpu.get_device_name(0))
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
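
The probe now goes through PyTorch's native `torch.xpu` namespace rather than `intel_extension_for_pytorch`, since recent PyTorch builds expose XPU devices directly. A minimal standalone check along the same lines (the `hasattr` guard is an addition for builds without XPU support):

```python
# Standalone XPU probe using the native torch.xpu API (assumes a PyTorch
# build with XPU support, e.g. 2.4+; the hasattr guard keeps it safe on
# builds without the xpu namespace).
import torch

if hasattr(torch, "xpu") and torch.xpu.is_available():
    print(torch.xpu.get_device_name(0))
    print(torch.xpu.device_count())
```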
@@ -336,6 +334,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 request.LoraAdapter = os.path.join(request.ModelPath, request.LoraAdapter)
             device = "cpu" if not request.CUDA else "cuda"
+            if XPU:
+                device = "xpu"
             self.device = device
             if request.LoraAdapter:
                 # Check if its a local file and not a directory ( we load lora differently for a safetensor file )
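
The hunk encodes a simple precedence: an XPU host overrides a CUDA request, and CPU remains the fallback. A sketch of that rule in isolation (`pick_device` is illustrative, not a function in the backend):

```python
# Illustrative sketch of the device precedence introduced above:
# XPU beats a CUDA request, CPU is the default.
def pick_device(cuda_requested: bool, xpu: bool) -> str:
    device = "cpu" if not cuda_requested else "cuda"
    if xpu:
        device = "xpu"  # on Intel hardware the CUDA request is ignored
    return device

assert pick_device(cuda_requested=True, xpu=True) == "xpu"
assert pick_device(cuda_requested=True, xpu=False) == "cuda"
assert pick_device(cuda_requested=False, xpu=False) == "cpu"
```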
@@ -359,12 +359,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 self.pipe.set_adapters(adapters_name, adapter_weights=adapters_weights)
-            if request.CUDA:
-                self.pipe.to('cuda')
+            if device != "cpu":
+                self.pipe.to(device)
                 if self.controlnet:
-                    self.controlnet.to('cuda')
-            if XPU:
-                self.pipe = self.pipe.to("xpu")
+                    self.controlnet.to(device)
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
         # Implement your logic here for the LoadModel service
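
Folding the CUDA and XPU branches into one `device != "cpu"` check also fixes the ControlNet placement: the net now follows the pipeline to whichever device was chosen instead of being hard-wired to `'cuda'`. A hedged sketch of the pattern with diffusers (the model id is an example, not what LocalAI loads):

```python
# Moving a diffusers pipeline with a single device string; the same .to()
# call covers "cuda", "xpu", and "cpu" (model id is illustrative only).
import torch
from diffusers import StableDiffusionPipeline

device = "xpu" if hasattr(torch, "xpu") and torch.xpu.is_available() else "cpu"
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe = pipe.to(device)
```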

backend/python/diffusers/run.sh

@@ -6,4 +6,10 @@ else
     source $backend_dir/../common/libbackend.sh
 fi
 
-startBackend $@
+if [ -d "/opt/intel" ]; then
+    # Assumes we are using the Intel oneAPI container image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    export XPU=1
+fi
+
+startBackend $@
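
With `/opt/intel` present (as in the oneAPI container images), `run.sh` now exports `XPU=1` before starting the backend. Presumably the Python side consumes it the same way it reads the other flags in the first hunk; a sketch of that pattern (an assumption, not a verbatim line from the backend):

```python
# Assumed consumption of the XPU flag exported by run.sh, mirroring the
# DISABLE_CPU_OFFLOAD pattern visible in backend.py's first hunk.
import os

XPU = os.environ.get("XPU", "0") == "1"
```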