Merge pull request #350 from trycua/fix/uitars-error

[Agent] Fix UI-TARS predict_click always returning None
2026-01-08 14:30:25 -06:00 · 2025-08-18 12:36:51 -04:00
parent efe17de80d 86c0da81fa
commit 3a9b5be660
1 changed file with 9 additions and 1 deletion
--- a/libs/python/agent/agent/loops/uitars.py
+++ b/libs/python/agent/agent/loops/uitars.py
@@ -782,11 +782,19 @@ class UITARSConfig:
            # Extract response content
            response_content = response.choices[0].message.content.strip() # type: ignore
            
+            print(response_content)
+
            # Parse the response to extract click coordinates
-            # Look for click action with coordinates
+            # Look for click action with coordinates (with special tokens)
            click_pattern = r"click\(point='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)"
            match = re.search(click_pattern, response_content)
            
+            # Fallback: Look for simpler format without special tokens
+            if not match:
+                # Pattern for: click(start_box='(x,y)') or click(point='(x,y)')
+                fallback_pattern = r"click\((?:start_box|point)='\((\d+),(\d+)\)'\)"
+                match = re.search(fallback_pattern, response_content)
+            
            if match:
                x, y = int(match.group(1)), int(match.group(2))
                # Scale coordinates back to original image dimensions