From fea95229827258364abf34360a8f614a3e271a59 Mon Sep 17 00:00:00 2001
From: fakezeta
Date: Tue, 7 May 2024 08:38:58 +0200
Subject: [PATCH] fix: OpenVINO winograd always disabled (#2252)

Winograd convolutions were always disabled, which caused an error when
the inference device was CPU. This commit implements logic that sets the
Winograd-disabling option only when neither CPU nor NPU is declared as
an inference device.
---
 .../transformers/transformers_server.py      | 30 +++++++++++++++++-----
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/backend/python/transformers/transformers_server.py b/backend/python/transformers/transformers_server.py
index f40b8951a..b1e0d5599 100755
--- a/backend/python/transformers/transformers_server.py
+++ b/backend/python/transformers/transformers_server.py
@@ -150,11 +150,20 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 devices = Core().available_devices
                 if "GPU" in " ".join(devices):
                     device_map="AUTO:GPU"
-
+                # When working with a fine-tuned model, GPU inference may show inaccuracy and a performance drop if Winograd convolutions are selected.
+                # https://docs.openvino.ai/2024/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html
+                # GPU_DISABLE_WINOGRAD_CONVOLUTION is a GPU-only option: passing it while compiling for CPU or NPU raises an error.
+                if "CPU" in device_map or "NPU" in device_map:
+                    if "-CPU" not in device_map and "-NPU" not in device_map:
+                        ovconfig={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT"}
+                    else:
+                        ovconfig={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT","GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"}
+                else:
+                    ovconfig={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT","GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"}
                 self.model = OVModelForCausalLM.from_pretrained(model_name,
                                                                 compile=True,
                                                                 trust_remote_code=request.TrustRemoteCode,
-                                                                ov_config={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT","GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"},
+                                                                ov_config=ovconfig,
                                                                 device=device_map)
                 self.OV = True
             elif request.Type == "OVModelForFeatureExtraction":
@@ -168,11 +177,20 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 devices = Core().available_devices
                 if "GPU" in " ".join(devices):
                     device_map="AUTO:GPU"
-
+                # When working with a fine-tuned model, GPU inference may show inaccuracy and a performance drop if Winograd convolutions are selected.
+                # https://docs.openvino.ai/2024/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html
+                # GPU_DISABLE_WINOGRAD_CONVOLUTION is a GPU-only option: passing it while compiling for CPU or NPU raises an error.
+                if "CPU" in device_map or "NPU" in device_map:
+                    if "-CPU" not in device_map and "-NPU" not in device_map:
+                        ovconfig={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT"}
+                    else:
+                        ovconfig={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT","GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"}
+                else:
+                    ovconfig={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT","GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"}
                 self.model = OVModelForFeatureExtraction.from_pretrained(model_name,
                                                                          compile=True,
                                                                          trust_remote_code=request.TrustRemoteCode,
-                                                                         ov_config={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT", "GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"},
+                                                                         ov_config=ovconfig,
                                                                          export=True,
                                                                          device=device_map)
                 self.OV = True
@@ -234,8 +252,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             # Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
             sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
 
-        print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
-        print("Embeddings:", sentence_embeddings, file=sys.stderr)
+#        print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
+#        print("Embeddings:", sentence_embeddings, file=sys.stderr)
         return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings[0])
 
     async def _predict(self, request, context, streaming=False):
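
Reviewer note: below is a minimal, standalone sketch of the device-selection
logic this patch implements, restated outside the server for easier review.
The helper name build_ov_config and the example device strings are
hypothetical and not part of the patch; it assumes device_map is an OpenVINO
device string such as "CPU" or "AUTO:GPU".

    # Minimal sketch (hypothetical helper, not part of the patch).
    # GPU_DISABLE_WINOGRAD_CONVOLUTION is a GPU-only property; passing it
    # while compiling for CPU or NPU is what raised the original error.
    def build_ov_config(device_map: str) -> dict:
        config = {"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT"}
        cpu_or_npu = "CPU" in device_map or "NPU" in device_map
        excluded = "-CPU" in device_map or "-NPU" in device_map
        if not cpu_or_npu or excluded:
            # Either no CPU/NPU target, or an explicit -CPU/-NPU exclusion
            # is present, so the GPU-only Winograd switch is safe to pass.
            config["GPU_DISABLE_WINOGRAD_CONVOLUTION"] = "YES"
        return config

    # Quick checks of the intended behavior:
    assert "GPU_DISABLE_WINOGRAD_CONVOLUTION" not in build_ov_config("CPU")
    assert "GPU_DISABLE_WINOGRAD_CONVOLUTION" in build_ov_config("AUTO:GPU")
    assert "GPU_DISABLE_WINOGRAD_CONVOLUTION" in build_ov_config("AUTO:GPU,-CPU")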