diff --git a/libs/python/agent/agent/adapters/models/__init__.py b/libs/python/agent/agent/adapters/models/__init__.py index 6811c142..8a5fb00b 100644 --- a/libs/python/agent/agent/adapters/models/__init__.py +++ b/libs/python/agent/agent/adapters/models/__init__.py @@ -22,7 +22,7 @@ def load_model(model_name: str, device: str = "auto"): ) cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True) cls = cfg.__class__.__name__ - print(f"cls: {cls}") + # print(f"cls: {cls}") if "OpenCUA" in cls: return OpenCUAModel(model_name=model_name, device=device) return GenericHFModel(model_name=model_name, device=device) diff --git a/libs/python/agent/agent/adapters/models/opencua.py b/libs/python/agent/agent/adapters/models/opencua.py index f24dfa6b..f8abf4a6 100644 --- a/libs/python/agent/agent/adapters/models/opencua.py +++ b/libs/python/agent/agent/adapters/models/opencua.py @@ -37,6 +37,7 @@ class OpenCUAModel: torch_dtype="auto", device_map=self.device, trust_remote_code=True, + attn_implementation="sdpa", ) self.image_processor = AutoImageProcessor.from_pretrained( self.model_name, trust_remote_code=True diff --git a/libs/python/agent/agent/loops/opencua.py b/libs/python/agent/agent/loops/opencua.py index c13875b2..a494377b 100644 --- a/libs/python/agent/agent/loops/opencua.py +++ b/libs/python/agent/agent/loops/opencua.py @@ -97,7 +97,7 @@ class OpenCUAConfig(AsyncAgentConfig): }, { "type": "text", - "text": instruction + "text": f"Click on {instruction}" } ] } @@ -116,8 +116,7 @@ class OpenCUAConfig(AsyncAgentConfig): # Extract response text output_text = response.choices[0].message.content - - print(output_text) + # print(output_text) # Extract coordinates from pyautogui format coordinates = extract_coordinates_from_pyautogui(output_text) diff --git a/libs/python/agent/pyproject.toml b/libs/python/agent/pyproject.toml index 811c3a9c..0d382fdf 100644 --- a/libs/python/agent/pyproject.toml +++ b/libs/python/agent/pyproject.toml @@ -50,7 +50,7 @@ glm45v-hf = [ opencua-hf = [ "accelerate", "torch", - "transformers>=4.54.0", + "transformers==4.53.0", "tiktoken>=0.11.0", "blobfile>=3.0.0" ] @@ -75,7 +75,7 @@ all = [ "transformers>=4.54.0", # opencua requirements "tiktoken>=0.11.0", - "blobfile>=3.0.0" + "blobfile>=3.0.0", # ui requirements "gradio>=5.23.3", "python-dotenv>=1.0.1",