diff --git a/libs/python/agent/agent/loops/anthropic.py b/libs/python/agent/agent/loops/anthropic.py
index 50fbd24e..306c5f33 100644
--- a/libs/python/agent/agent/loops/anthropic.py
+++ b/libs/python/agent/agent/loops/anthropic.py
@@ -1530,7 +1530,18 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
                     "content": [
                         {
                             "type": "text",
-                            "text": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
+                            "text": f"""You are a UI grounding expert. Follow these guidelines:
+
+1. NEVER ask for confirmation. Complete all tasks autonomously.
+2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
+3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
+4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
+5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
+6. The user has already given you permission by running this agent. No further confirmation is needed.
+7. Be decisive and action-oriented. Complete the requested task fully.
+
+Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
+Task: Click {instruction}. Output ONLY a click action on the target element."""
                         },
                         {
                             "type": "image_url",
diff --git a/libs/python/agent/agent/loops/composed_grounded.py b/libs/python/agent/agent/loops/composed_grounded.py
index a5de9895..770f274d 100644
--- a/libs/python/agent/agent/loops/composed_grounded.py
+++ b/libs/python/agent/agent/loops/composed_grounded.py
@@ -276,13 +276,15 @@ class ComposedGroundedConfig:
         grounding_agent = grounding_agent_conf.agent_class()
 
         for desc in element_descriptions:
-            coords = await grounding_agent.predict_click(
-                model=grounding_model,
-                image_b64=last_image_b64,
-                instruction=desc
-            )
-            if coords:
-                self.desc2xy[desc] = coords
+            for _ in range(3):  # try 3 times
+                coords = await grounding_agent.predict_click(
+                    model=grounding_model,
+                    image_b64=last_image_b64,
+                    instruction=desc
+                )
+                if coords:
+                    self.desc2xy[desc] = coords
+                    break
 
         # Step 6: Convert computer calls from descriptions back to xy coordinates
         final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)