workaround for large overlapping bboxes

2026-01-03 03:49:58 -06:00 · 2025-03-27 13:22:55 -04:00
parent 85404cccab
commit 0836bb7e8f
2 changed files with 35 additions and 12 deletions
--- a/libs/som/som/detect.py
+++ b/libs/som/som/detect.py
@@ -232,17 +232,31 @@ class OmniParser:
                    ],
                )
                
-                # Merge detections using NMS
                if elements and text_elements:
-                    # Get all bounding boxes and scores
+                    # Drop non-OCR elements whose bbox contains the center point of any OCR text element
+                    filtered_elements = []
+                    for elem in elements:  # elements at this point contains only non-OCR elements
+                        should_keep = True
+                        for text_elem in text_elements:
+                            # Calculate center point of the text element
+                            center_x = (text_elem.bbox.x1 + text_elem.bbox.x2) / 2
+                            center_y = (text_elem.bbox.y1 + text_elem.bbox.y2) / 2
+                            
+                            # Check if this center point is inside the non-OCR element
+                            if (center_x >= elem.bbox.x1 and center_x <= elem.bbox.x2 and 
+                                center_y >= elem.bbox.y1 and center_y <= elem.bbox.y2):
+                                should_keep = False
+                                break
+                        
+                        if should_keep:
+                            filtered_elements.append(elem)
+                    elements = filtered_elements
+                    
+                    # Merge detections using NMS
                    all_elements = elements + text_elements
                    boxes = torch.tensor([elem.bbox.coordinates for elem in all_elements])
                    scores = torch.tensor([elem.confidence for elem in all_elements])
-                    
-                    # Apply NMS with iou_threshold
                    keep_indices = torchvision.ops.nms(boxes, scores, iou_threshold)
-                    
-                    # Keep only the elements that passed NMS
                    elements = [all_elements[i] for i in keep_indices]
                else:
                    # Just add text elements to the list if IOU doesn't need to be applied
--- a/libs/som/som/visualization.py
+++ b/libs/som/som/visualization.py
@@ -174,22 +174,31 @@ class BoxAnnotator:
                lambda: (x1 - box_width - spacing, y2 + spacing),
            ]

-            def check_collision(x, y):
-                """Check if a label box collides with any existing ones or is inside bbox."""
+            def check_occlusion(x, y):
+                """Check if a label box occludes any existing ones or is inside bbox."""
                # First check if it's inside the bounding box
                if is_inside_bbox(x, y):
                    return True

                # Then check collision with other labels
                new_box = (x, y, x + box_width, y + box_height)
+                label_width = new_box[2] - new_box[0]
+                label_height = new_box[3] - new_box[1]
+                
                for used_box in used_areas:
                    if not (
                        new_box[2] < used_box[0]  # new box is left of used box
                        or new_box[0] > used_box[2]  # new box is right of used box
                        or new_box[3] < used_box[1]  # new box is above used box
-                        or new_box[1] > used_box[3]
-                    ):  # new box is below used box
-                        return True
+                        or new_box[1] > used_box[3]  # new box is below used box
+                    ):
+                        # Calculate dimensions of the used box
+                        used_box_width = used_box[2] - used_box[0]
+                        used_box_height = used_box[3] - used_box[1]
+                        
+                        # Treat the overlap as a collision unless the used box is more than 5x the label's size in both width and height
+                        if not (used_box_width > 5 * label_width and used_box_height > 5 * label_height):
+                            return True
                return False

            # Try each position until we find one without collision
@@ -201,7 +210,7 @@ class BoxAnnotator:
                # Ensure position is within image bounds
                if x < 0 or y < 0 or x + box_width > image.width or y + box_height > image.height:
                    continue
-                if not check_collision(x, y):
+                if not check_occlusion(x, y):
                    label_x = x
                    label_y = y
                    break