From 68aa2fe0b8d889dfde5ca8793a71d59479029fb7 Mon Sep 17 00:00:00 2001 From: Bavisetti Narayan <72156168+NarayanBavisetti@users.noreply.github.com> Date: Thu, 23 Oct 2025 00:29:05 +0530 Subject: [PATCH] [WIKI-553] chore: improved pages components tracking (#7966) * chore: page components tracking * chore: changed the transaction task * chore: added logger for description html --- apps/api/plane/app/views/page/base.py | 25 ++- .../plane/bgtasks/page_transaction_task.py | 148 +++++++++++++----- apps/api/plane/utils/content_validator.py | 9 +- 3 files changed, 129 insertions(+), 53 deletions(-) diff --git a/apps/api/plane/app/views/page/base.py b/apps/api/plane/app/views/page/base.py index 72fb4ef8ee..b8946d22a6 100644 --- a/apps/api/plane/app/views/page/base.py +++ b/apps/api/plane/app/views/page/base.py @@ -137,7 +137,11 @@ class PageViewSet(BaseViewSet): if serializer.is_valid(): serializer.save() # capture the page transaction - page_transaction.delay(request.data, None, serializer.data["id"]) + page_transaction.delay( + new_description_html=request.data.get("description_html", "
<p></p>"),
+                old_description_html=None,
+                page_id=serializer.data["id"],
+            )
             page = self.get_queryset().get(pk=serializer.data["id"])
             serializer = PageDetailSerializer(page)
             return Response(serializer.data, status=status.HTTP_201_CREATED)
@@ -168,11 +172,8 @@ class PageViewSet(BaseViewSet):
             # capture the page transaction
             if request.data.get("description_html"):
                 page_transaction.delay(
-                    new_value=request.data,
-                    old_value=json.dumps(
-                        {"description_html": page_description},
-                        cls=DjangoJSONEncoder,
-                    ),
+                    new_description_html=request.data.get("description_html", "<p></p>"),
+                    old_description_html=page_description,
                     page_id=page_id,
                 )
 
@@ -504,7 +505,11 @@ class PagesDescriptionViewSet(BaseViewSet):
         if serializer.is_valid():
             # Capture the page transaction
             if request.data.get("description_html"):
-                page_transaction.delay(new_value=request.data, old_value=existing_instance, page_id=page_id)
+                page_transaction.delay(
+                    new_description_html=request.data.get("description_html", "<p></p>
"), + old_description_html=page.description_html, + page_id=page_id, + ) # Update the page using serializer updated_page = serializer.save() @@ -550,7 +555,11 @@ class PageDuplicateEndpoint(BaseAPIView): updated_by_id=page.updated_by_id, ) - page_transaction.delay({"description_html": page.description_html}, None, page.id) + page_transaction.delay( + new_description_html=page.description_html, + old_description_html=None, + page_id=page.id, + ) # Copy the s3 objects uploaded in the page copy_s3_objects_of_description_and_assets.delay( diff --git a/apps/api/plane/bgtasks/page_transaction_task.py b/apps/api/plane/bgtasks/page_transaction_task.py index 09e2cb2add..402d0a3ee0 100644 --- a/apps/api/plane/bgtasks/page_transaction_task.py +++ b/apps/api/plane/bgtasks/page_transaction_task.py @@ -1,5 +1,5 @@ # Python imports -import json +import logging # Django imports from django.utils import timezone @@ -7,72 +7,134 @@ from django.utils import timezone # Third-party imports from bs4 import BeautifulSoup -# Module imports -from plane.db.models import Page, PageLog +# App imports from celery import shared_task +from plane.db.models import Page, PageLog from plane.utils.exception_logger import log_exception +logger = logging.getLogger("plane.worker") -def extract_components(value, tag): +COMPONENT_MAP = { + "mention-component": { + "attributes": ["id", "entity_identifier", "entity_name", "entity_type"], + "extract": lambda m: { + "entity_name": m.get("entity_name"), + "entity_type": None, + "entity_identifier": m.get("entity_identifier"), + }, + }, + "image-component": { + "attributes": ["id", "src"], + "extract": lambda m: { + "entity_name": "image", + "entity_type": None, + "entity_identifier": m.get("src"), + }, + }, +} + +component_map = { + **COMPONENT_MAP, +} + + +def extract_all_components(description_html): + """ + Extracts all component types from the HTML value in a single pass. + Returns a dict mapping component_type -> list of extracted entities. + """ try: - mentions = [] - html = value.get("description_html") - soup = BeautifulSoup(html, "html.parser") - mention_tags = soup.find_all(tag) + if not description_html: + return {component: [] for component in component_map.keys()} - for mention_tag in mention_tags: - mention = { - "id": mention_tag.get("id"), - "entity_identifier": mention_tag.get("entity_identifier"), - "entity_name": mention_tag.get("entity_name"), - } - mentions.append(mention) + soup = BeautifulSoup(description_html, "html.parser") + results = {} + + for component, config in component_map.items(): + attributes = config.get("attributes", ["id"]) + component_tags = soup.find_all(component) + + entities = [] + for tag in component_tags: + entity = {attr: tag.get(attr) for attr in attributes} + entities.append(entity) + + results[component] = entities + + return results - return mentions except Exception: - return [] + return {component: [] for component in component_map.keys()} + + +def get_entity_details(component: str, mention: dict): + """ + Normalizes mention attributes into entity_name, entity_type, entity_identifier. + """ + config = component_map.get(component) + if not config: + return {"entity_name": None, "entity_type": None, "entity_identifier": None} + return config["extract"](mention) @shared_task -def page_transaction(new_value, old_value, page_id): +def page_transaction(new_description_html, old_description_html, page_id): + """ + Tracks changes in page content (mentions, embeds, etc.) + and logs them in PageLog for audit and reference. 
+ """ try: page = Page.objects.get(pk=page_id) - new_page_mention = PageLog.objects.filter(page_id=page_id).exists() - old_value = json.loads(old_value) if old_value else {} + has_existing_logs = PageLog.objects.filter(page_id=page_id).exists() + + + # Extract all components in a single pass (optimized) + old_components = extract_all_components(old_description_html) + new_components = extract_all_components(new_description_html) new_transactions = [] deleted_transaction_ids = set() - # TODO - Add "issue-embed-component", "img", "todo" components - components = ["mention-component"] - for component in components: - old_mentions = extract_components(old_value, component) - new_mentions = extract_components(new_value, component) + for component in component_map.keys(): + old_entities = old_components[component] + new_entities = new_components[component] - new_mentions_ids = {mention["id"] for mention in new_mentions} - old_mention_ids = {mention["id"] for mention in old_mentions} - deleted_transaction_ids.update(old_mention_ids - new_mentions_ids) + old_ids = {m.get("id") for m in old_entities if m.get("id")} + new_ids = {m.get("id") for m in new_entities if m.get("id")} + deleted_transaction_ids.update(old_ids - new_ids) - new_transactions.extend( - PageLog( - transaction=mention["id"], - page_id=page_id, - entity_identifier=mention["entity_identifier"], - entity_name=mention["entity_name"], - workspace_id=page.workspace_id, - created_at=timezone.now(), - updated_at=timezone.now(), + for mention in new_entities: + mention_id = mention.get("id") + if not mention_id or (mention_id in old_ids and has_existing_logs): + continue + + details = get_entity_details(component, mention) + current_time = timezone.now() + + new_transactions.append( + PageLog( + transaction=mention_id, + page_id=page_id, + entity_identifier=details["entity_identifier"], + entity_name=details["entity_name"], + entity_type=details["entity_type"], + workspace_id=page.workspace_id, + created_at=current_time, + updated_at=current_time, + ) ) - for mention in new_mentions - if mention["id"] not in old_mention_ids or not new_page_mention + + + # Bulk insert and cleanup + if new_transactions: + PageLog.objects.bulk_create( + new_transactions, batch_size=50, ignore_conflicts=True ) - # Create new PageLog objects for new transactions - PageLog.objects.bulk_create(new_transactions, batch_size=10, ignore_conflicts=True) + if deleted_transaction_ids: + PageLog.objects.filter(transaction__in=deleted_transaction_ids).delete() - # Delete the removed transactions - PageLog.objects.filter(transaction__in=deleted_transaction_ids).delete() except Page.DoesNotExist: return except Exception as e: diff --git a/apps/api/plane/utils/content_validator.py b/apps/api/plane/utils/content_validator.py index 5163fad7dd..ff06a562fa 100644 --- a/apps/api/plane/utils/content_validator.py +++ b/apps/api/plane/utils/content_validator.py @@ -4,7 +4,9 @@ import nh3 from plane.utils.exception_logger import log_exception from bs4 import BeautifulSoup from collections import defaultdict +import logging +logger = logging.getLogger("plane.api") # Maximum allowed size for binary data (10MB) MAX_SIZE = 10 * 1024 * 1024 @@ -54,7 +56,9 @@ def validate_binary_data(data): # Check for suspicious text patterns (HTML/JS) try: decoded_text = binary_data.decode("utf-8", errors="ignore")[:200] - if any(pattern in decoded_text.lower() for pattern in SUSPICIOUS_BINARY_PATTERNS): + if any( + pattern in decoded_text.lower() for pattern in SUSPICIOUS_BINARY_PATTERNS + ): return 
False, "Binary data contains suspicious content patterns" except Exception: pass # Binary data might not be decodable as text, which is fine @@ -232,8 +236,9 @@ def validate_html_content(html_content: str): summary = json.dumps(diff) except Exception: summary = str(diff) + logger.warning(f"HTML sanitization removals: {summary}") log_exception( - f"HTML sanitization removals: {summary}", + ValueError(f"HTML sanitization removals: {summary}"), warning=True, ) return True, None, clean_html
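
For reviewers, a minimal sketch of how the single-pass extractor feeds the create/delete diffing in page_transaction. It assumes a configured Django environment where plane.bgtasks.page_transaction_task is importable; the sample HTML, IDs, and entity_name values are made up for illustration and are not taken from real page content.

# Sketch only: runs extract_all_components() on hypothetical before/after HTML to show
# how new vs. deleted transaction IDs are derived, mirroring the logic in page_transaction.
from plane.bgtasks.page_transaction_task import component_map, extract_all_components

old_html = (
    '<p><mention-component id="t1" entity_identifier="u1" '
    'entity_name="user_mention"></mention-component></p>'
)
new_html = (
    '<p><mention-component id="t2" entity_identifier="u2" '
    'entity_name="user_mention"></mention-component>'
    '<image-component id="t3" src="https://example.com/a.png"></image-component></p>'
)

old_components = extract_all_components(old_html)
new_components = extract_all_components(new_html)

for component in component_map:
    old_ids = {m.get("id") for m in old_components[component] if m.get("id")}
    new_ids = {m.get("id") for m in new_components[component] if m.get("id")}
    # In the task, new_ids - old_ids become PageLog rows and old_ids - new_ids are deleted.
    print(component, "created:", new_ids - old_ids, "deleted:", old_ids - new_ids)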
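The removed TODO listed "issue-embed-component", "img", and "todo" as component types still to track; with the new registry, supporting one is a matter of adding an entry to COMPONENT_MAP. A sketch of what such an entry could look like, noting that the attribute names and the "issue" entity_name used here are assumptions rather than the editor's actual markup:

# Hypothetical registry entry for the embed named in the old TODO; the attributes list
# and the fixed entity_name are assumed, not confirmed against the editor schema.
COMPONENT_MAP = {
    # ... existing "mention-component" and "image-component" entries ...
    "issue-embed-component": {
        "attributes": ["id", "entity_identifier"],
        "extract": lambda m: {
            "entity_name": "issue",
            "entity_type": None,
            "entity_identifier": m.get("entity_identifier"),
        },
    },
}

Because component_map is built as {**COMPONENT_MAP}, both extract_all_components and get_entity_details would pick such a new type up without further changes.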