Files
computer/libs/python/som/som/models.py

120 lines
4.0 KiB
Python

from typing import List, Tuple, Optional, Literal, Dict, Any, Union
from pydantic import BaseModel, Field, validator
class BoundingBox(BaseModel):
"""Normalized bounding box coordinates."""
x1: float = Field(..., description="Normalized left coordinate")
y1: float = Field(..., description="Normalized top coordinate")
x2: float = Field(..., description="Normalized right coordinate")
y2: float = Field(..., description="Normalized bottom coordinate")
@property
def coordinates(self) -> List[float]:
"""Get coordinates as a list [x1, y1, x2, y2]."""
return [self.x1, self.y1, self.x2, self.y2]
class UIElement(BaseModel):
"""Base class for UI elements."""
id: Optional[int] = Field(None, description="Unique identifier for the element (1-indexed)")
type: Literal["icon", "text"]
bbox: BoundingBox
interactivity: bool = Field(default=False, description="Whether the element is interactive")
confidence: float = Field(default=1.0, description="Detection confidence score")
class IconElement(UIElement):
"""An interactive icon element."""
type: Literal["icon"] = "icon"
interactivity: bool = True
scale: Optional[int] = Field(None, description="Detection scale used")
class TextElement(UIElement):
"""A text element."""
type: Literal["text"] = "text"
content: str = Field(..., description="The text content")
interactivity: bool = False
class ImageData(BaseModel):
"""Image data with dimensions."""
base64: str = Field(..., description="Base64 encoded image data")
width: int = Field(..., description="Image width in pixels")
height: int = Field(..., description="Image height in pixels")
@validator("width", "height")
def dimensions_must_be_positive(cls, v):
if v <= 0:
raise ValueError("Dimensions must be positive")
return v
class ParserMetadata(BaseModel):
"""Metadata about the parsing process."""
image_size: Tuple[int, int] = Field(
..., description="Original image dimensions (width, height)"
)
num_icons: int = Field(..., description="Number of icons detected")
num_text: int = Field(..., description="Number of text elements detected")
device: str = Field(..., description="Device used for detection (cpu/cuda/mps)")
ocr_enabled: bool = Field(..., description="Whether OCR was enabled")
latency: float = Field(..., description="Total processing time in seconds")
@property
def width(self) -> int:
"""Get image width from image_size."""
return self.image_size[0]
@property
def height(self) -> int:
"""Get image height from image_size."""
return self.image_size[1]
class ParseResult(BaseModel):
"""Result of parsing a UI screenshot."""
elements: List[UIElement] = Field(..., description="Detected UI elements")
annotated_image_base64: str = Field(..., description="Base64 encoded annotated image")
metadata: ParserMetadata = Field(..., description="Processing metadata")
screen_info: Optional[List[str]] = Field(
None, description="Human-readable descriptions of elements"
)
parsed_content_list: Optional[List[Dict[str, Any]]] = Field(
None, description="Parsed elements as dictionaries"
)
@property
def image(self) -> ImageData:
"""Get image data as a convenience property."""
return ImageData(
base64=self.annotated_image_base64,
width=self.metadata.width,
height=self.metadata.height,
)
@property
def width(self) -> int:
"""Get image width from metadata."""
return self.metadata.width
@property
def height(self) -> int:
"""Get image height from metadata."""
return self.metadata.height
def model_dump(self) -> Dict[str, Any]:
"""Convert model to dict for compatibility with older code."""
result = super().model_dump()
# Add image data dict for backward compatibility
result["image"] = self.image.model_dump()
return result