From fea95229827258364abf34360a8f614a3e271a59 Mon Sep 17 00:00:00 2001
From: fakezeta
Date: Tue, 7 May 2024 08:38:58 +0200
Subject: [PATCH] fix: OpenVINO winograd always disabled (#2252)

Winograd convolutions were always disabled, which caused an error when
the inference device was CPU. This commit implements logic that sets the
Winograd-disabling option only when neither CPU nor NPU is declared as
an inference device.
---
 .../transformers/transformers_server.py      | 30 +++++++++++++++++-----
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/backend/python/transformers/transformers_server.py b/backend/python/transformers/transformers_server.py
index f40b8951a..b1e0d5599 100755
--- a/backend/python/transformers/transformers_server.py
+++ b/backend/python/transformers/transformers_server.py
@@ -150,11 +150,20 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 devices = Core().available_devices
                 if "GPU" in " ".join(devices):
                     device_map="AUTO:GPU"
-
+                # When working with a fine-tuned model, GPU inference may show inaccuracy and a performance drop if Winograd convolutions are selected.
+                # https://docs.openvino.ai/2024/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html
+                # GPU_DISABLE_WINOGRAD_CONVOLUTION is a GPU-only option: passing it while compiling for CPU or NPU raises an error.
+                if "CPU" in device_map or "NPU" in device_map:
+                    if "-CPU" not in device_map and "-NPU" not in device_map:
+                        ovconfig={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT"}
+                    else:
+                        ovconfig={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT","GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"}
+                else:
+                    ovconfig={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT","GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"}
                 self.model = OVModelForCausalLM.from_pretrained(model_name,
                                                                 compile=True,
                                                                 trust_remote_code=request.TrustRemoteCode,
-                                                                ov_config={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT","GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"},
+                                                                ov_config=ovconfig,
                                                                 device=device_map)
                 self.OV = True
             elif request.Type == "OVModelForFeatureExtraction":
@@ -168,11 +177,20 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                 devices = Core().available_devices
                 if "GPU" in " ".join(devices):
                     device_map="AUTO:GPU"
-
+                # When working with a fine-tuned model, GPU inference may show inaccuracy and a performance drop if Winograd convolutions are selected.
+                # https://docs.openvino.ai/2024/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html
+                # GPU_DISABLE_WINOGRAD_CONVOLUTION is a GPU-only option: passing it while compiling for CPU or NPU raises an error.
+                if "CPU" in device_map or "NPU" in device_map:
+                    if "-CPU" not in device_map and "-NPU" not in device_map:
+                        ovconfig={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT"}
+                    else:
+                        ovconfig={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT","GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"}
+                else:
+                    ovconfig={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT","GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"}
                 self.model = OVModelForFeatureExtraction.from_pretrained(model_name,
                                                                          compile=True,
                                                                          trust_remote_code=request.TrustRemoteCode,
-                                                                         ov_config={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT", "GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"},
+                                                                         ov_config=ovconfig,
                                                                          export=True,
                                                                          device=device_map)
                 self.OV = True
@@ -234,8 +252,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             # Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
             sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
 
-        print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
-        print("Embeddings:", sentence_embeddings, file=sys.stderr)
+#        print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
+#        print("Embeddings:", sentence_embeddings, file=sys.stderr)
         return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings[0])
 
     async def _predict(self, request, context, streaming=False):
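
Reviewer note: below is a minimal, standalone sketch of the device-selection
logic this patch implements, restated outside the server for easier review.
The helper name build_ov_config and the example device strings are
hypothetical and not part of the patch; it assumes device_map is an OpenVINO
device string such as "CPU" or "AUTO:GPU".

    # Minimal sketch (hypothetical helper, not part of the patch).
    # GPU_DISABLE_WINOGRAD_CONVOLUTION is a GPU-only property; passing it
    # while compiling for CPU or NPU is what raised the original error.
    def build_ov_config(device_map: str) -> dict:
        config = {"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT"}
        cpu_or_npu = "CPU" in device_map or "NPU" in device_map
        excluded = "-CPU" in device_map or "-NPU" in device_map
        if not cpu_or_npu or excluded:
            # Either no CPU/NPU target, or an explicit -CPU/-NPU exclusion
            # is present, so the GPU-only Winograd switch is safe to pass.
            config["GPU_DISABLE_WINOGRAD_CONVOLUTION"] = "YES"
        return config

    # Quick checks of the intended behavior:
    assert "GPU_DISABLE_WINOGRAD_CONVOLUTION" not in build_ov_config("CPU")
    assert "GPU_DISABLE_WINOGRAD_CONVOLUTION" in build_ov_config("AUTO:GPU")
    assert "GPU_DISABLE_WINOGRAD_CONVOLUTION" in build_ov_config("AUTO:GPU,-CPU")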