From b37cef371880a8fdbb7d6a41926bf242b0b51eb2 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe
Date: Tue, 1 Jul 2025 11:36:17 +0100
Subject: [PATCH] fix: Diffusers and XPU fixes (#5737)

* fix(README): Add device flags for Intel/XPU

Signed-off-by: Richard Palethorpe

* fix(diffusers/xpu): Set device to XPU and ignore CUDA request when on Intel

Signed-off-by: Richard Palethorpe

---------

Signed-off-by: Richard Palethorpe
---
 README.md                           |  4 ++--
 backend/python/diffusers/backend.py | 15 +++++++--------
 backend/python/diffusers/run.sh     |  8 +++++++-
 3 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index ab322d255..d3497df56 100644
--- a/README.md
+++ b/README.md
@@ -141,10 +141,10 @@ docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri
 
 ```bash
 # Intel GPU with FP16 support
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f16
+docker run -ti --name local-ai -p 8080:8080 --device=/dev/dri/card1 --device=/dev/dri/renderD128 localai/localai:latest-gpu-intel-f16
 
 # Intel GPU with FP32 support
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f32
+docker run -ti --name local-ai -p 8080:8080 --device=/dev/dri/card1 --device=/dev/dri/renderD128 localai/localai:latest-gpu-intel-f32
 ```
 
 ### Vulkan GPU Images:
diff --git a/backend/python/diffusers/backend.py b/backend/python/diffusers/backend.py
index 2d8db5338..1a5f1785a 100755
--- a/backend/python/diffusers/backend.py
+++ b/backend/python/diffusers/backend.py
@@ -38,9 +38,7 @@ DISABLE_CPU_OFFLOAD = os.environ.get("DISABLE_CPU_OFFLOAD", "0") == "1"
 FRAMES = os.environ.get("FRAMES", "64")
 
 if XPU:
-    import intel_extension_for_pytorch as ipex
-
-    print(ipex.xpu.get_device_name(0))
+    print(torch.xpu.get_device_name(0))
 
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
@@ -336,6 +334,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             request.LoraAdapter = os.path.join(request.ModelPath, request.LoraAdapter)
 
         device = "cpu" if not request.CUDA else "cuda"
+        if XPU:
+            device = "xpu"
         self.device = device
         if request.LoraAdapter:
             # Check if its a local file and not a directory ( we load lora differently for a safetensor file )
@@ -359,12 +359,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 
                 self.pipe.set_adapters(adapters_name, adapter_weights=adapters_weights)
 
-            if request.CUDA:
-                self.pipe.to('cuda')
+            if device != "cpu":
+                self.pipe.to(device)
                 if self.controlnet:
-                    self.controlnet.to('cuda')
-            if XPU:
-                self.pipe = self.pipe.to("xpu")
+                    self.controlnet.to(device)
+
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
         # Implement your logic here for the LoadModel service
diff --git a/backend/python/diffusers/run.sh b/backend/python/diffusers/run.sh
index 82b7b09ec..ee730f21f 100755
--- a/backend/python/diffusers/run.sh
+++ b/backend/python/diffusers/run.sh
@@ -6,4 +6,10 @@ else
    source $backend_dir/../common/libbackend.sh
 fi
 
-startBackend $@
\ No newline at end of file
+if [ -d "/opt/intel" ]; then
+    # Assumes we are using the Intel oneAPI container image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    export XPU=1
+fi
+
+startBackend $@
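
For reference, the device-selection behaviour this patch introduces in backend.py can be summarised with the minimal sketch below. It is illustrative only: the `pick_device` helper and the hard-coded `request_cuda=True` call are not part of the patch (the real code lives inside the gRPC `LoadModel` handler and reads the CUDA flag from the request), and the `XPU` environment variable is the one exported by run.sh when /opt/intel exists.

```python
# Minimal sketch of the device selection added in backend.py.
# Assumption: a PyTorch build with XPU support is available in the
# Intel oneAPI container image.
import os

import torch

# run.sh exports XPU=1 when /opt/intel is present.
XPU = os.environ.get("XPU", "0") == "1"


def pick_device(request_cuda: bool) -> str:
    """Choose the torch device string for the diffusers pipeline."""
    device = "cpu" if not request_cuda else "cuda"
    if XPU:
        # On Intel GPUs the CUDA request is ignored and XPU is used instead.
        device = "xpu"
    return device


if XPU:
    print(torch.xpu.get_device_name(0))

device = pick_device(request_cuda=True)
# The pipeline (and optional controlnet) is only moved off the CPU when a
# GPU device was selected, mirroring the `if device != "cpu"` branch in the
# patch.
```

The patch also drops the explicit `intel_extension_for_pytorch` import and calls `torch.xpu.get_device_name(0)` directly, relying on the `torch.xpu` namespace that recent PyTorch builds expose natively.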