From 68aa2fe0b8d889dfde5ca8793a71d59479029fb7 Mon Sep 17 00:00:00 2001 From: Bavisetti Narayan <72156168+NarayanBavisetti@users.noreply.github.com> Date: Thu, 23 Oct 2025 00:29:05 +0530 Subject: [PATCH] [WIKI-553] chore: improved pages components tracking (#7966) * chore: page components tracking * chore: changed the transaction task * chore: added logger for description html --- apps/api/plane/app/views/page/base.py | 25 ++- .../plane/bgtasks/page_transaction_task.py | 148 +++++++++++++----- apps/api/plane/utils/content_validator.py | 9 +- 3 files changed, 129 insertions(+), 53 deletions(-) diff --git a/apps/api/plane/app/views/page/base.py b/apps/api/plane/app/views/page/base.py index 72fb4ef8ee..b8946d22a6 100644 --- a/apps/api/plane/app/views/page/base.py +++ b/apps/api/plane/app/views/page/base.py @@ -137,7 +137,11 @@ class PageViewSet(BaseViewSet): if serializer.is_valid(): serializer.save() # capture the page transaction - page_transaction.delay(request.data, None, serializer.data["id"]) + page_transaction.delay( + new_description_html=request.data.get("description_html", "
<p></p>"),
+                old_description_html=None,
+                page_id=serializer.data["id"],
+            )
             page = self.get_queryset().get(pk=serializer.data["id"])
             serializer = PageDetailSerializer(page)
             return Response(serializer.data, status=status.HTTP_201_CREATED)
@@ -168,11 +172,8 @@ class PageViewSet(BaseViewSet):
             # capture the page transaction
             if request.data.get("description_html"):
                 page_transaction.delay(
-                    new_value=request.data,
-                    old_value=json.dumps(
-                        {"description_html": page_description},
-                        cls=DjangoJSONEncoder,
-                    ),
+                    new_description_html=request.data.get("description_html", "<p></p>"),
+                    old_description_html=page_description,
                     page_id=page_id,
                 )
 
@@ -504,7 +505,11 @@ class PagesDescriptionViewSet(BaseViewSet):
         if serializer.is_valid():
             # Capture the page transaction
             if request.data.get("description_html"):
-                page_transaction.delay(new_value=request.data, old_value=existing_instance, page_id=page_id)
+                page_transaction.delay(
+                    new_description_html=request.data.get("description_html", "<p></p>
"), + old_description_html=page.description_html, + page_id=page_id, + ) # Update the page using serializer updated_page = serializer.save() @@ -550,7 +555,11 @@ class PageDuplicateEndpoint(BaseAPIView): updated_by_id=page.updated_by_id, ) - page_transaction.delay({"description_html": page.description_html}, None, page.id) + page_transaction.delay( + new_description_html=page.description_html, + old_description_html=None, + page_id=page.id, + ) # Copy the s3 objects uploaded in the page copy_s3_objects_of_description_and_assets.delay( diff --git a/apps/api/plane/bgtasks/page_transaction_task.py b/apps/api/plane/bgtasks/page_transaction_task.py index 09e2cb2add..402d0a3ee0 100644 --- a/apps/api/plane/bgtasks/page_transaction_task.py +++ b/apps/api/plane/bgtasks/page_transaction_task.py @@ -1,5 +1,5 @@ # Python imports -import json +import logging # Django imports from django.utils import timezone @@ -7,72 +7,134 @@ from django.utils import timezone # Third-party imports from bs4 import BeautifulSoup -# Module imports -from plane.db.models import Page, PageLog +# App imports from celery import shared_task +from plane.db.models import Page, PageLog from plane.utils.exception_logger import log_exception +logger = logging.getLogger("plane.worker") -def extract_components(value, tag): +COMPONENT_MAP = { + "mention-component": { + "attributes": ["id", "entity_identifier", "entity_name", "entity_type"], + "extract": lambda m: { + "entity_name": m.get("entity_name"), + "entity_type": None, + "entity_identifier": m.get("entity_identifier"), + }, + }, + "image-component": { + "attributes": ["id", "src"], + "extract": lambda m: { + "entity_name": "image", + "entity_type": None, + "entity_identifier": m.get("src"), + }, + }, +} + +component_map = { + **COMPONENT_MAP, +} + + +def extract_all_components(description_html): + """ + Extracts all component types from the HTML value in a single pass. + Returns a dict mapping component_type -> list of extracted entities. + """ try: - mentions = [] - html = value.get("description_html") - soup = BeautifulSoup(html, "html.parser") - mention_tags = soup.find_all(tag) + if not description_html: + return {component: [] for component in component_map.keys()} - for mention_tag in mention_tags: - mention = { - "id": mention_tag.get("id"), - "entity_identifier": mention_tag.get("entity_identifier"), - "entity_name": mention_tag.get("entity_name"), - } - mentions.append(mention) + soup = BeautifulSoup(description_html, "html.parser") + results = {} + + for component, config in component_map.items(): + attributes = config.get("attributes", ["id"]) + component_tags = soup.find_all(component) + + entities = [] + for tag in component_tags: + entity = {attr: tag.get(attr) for attr in attributes} + entities.append(entity) + + results[component] = entities + + return results - return mentions except Exception: - return [] + return {component: [] for component in component_map.keys()} + + +def get_entity_details(component: str, mention: dict): + """ + Normalizes mention attributes into entity_name, entity_type, entity_identifier. + """ + config = component_map.get(component) + if not config: + return {"entity_name": None, "entity_type": None, "entity_identifier": None} + return config["extract"](mention) @shared_task -def page_transaction(new_value, old_value, page_id): +def page_transaction(new_description_html, old_description_html, page_id): + """ + Tracks changes in page content (mentions, embeds, etc.) + and logs them in PageLog for audit and reference. 
+ """ try: page = Page.objects.get(pk=page_id) - new_page_mention = PageLog.objects.filter(page_id=page_id).exists() - old_value = json.loads(old_value) if old_value else {} + has_existing_logs = PageLog.objects.filter(page_id=page_id).exists() + + + # Extract all components in a single pass (optimized) + old_components = extract_all_components(old_description_html) + new_components = extract_all_components(new_description_html) new_transactions = [] deleted_transaction_ids = set() - # TODO - Add "issue-embed-component", "img", "todo" components - components = ["mention-component"] - for component in components: - old_mentions = extract_components(old_value, component) - new_mentions = extract_components(new_value, component) + for component in component_map.keys(): + old_entities = old_components[component] + new_entities = new_components[component] - new_mentions_ids = {mention["id"] for mention in new_mentions} - old_mention_ids = {mention["id"] for mention in old_mentions} - deleted_transaction_ids.update(old_mention_ids - new_mentions_ids) + old_ids = {m.get("id") for m in old_entities if m.get("id")} + new_ids = {m.get("id") for m in new_entities if m.get("id")} + deleted_transaction_ids.update(old_ids - new_ids) - new_transactions.extend( - PageLog( - transaction=mention["id"], - page_id=page_id, - entity_identifier=mention["entity_identifier"], - entity_name=mention["entity_name"], - workspace_id=page.workspace_id, - created_at=timezone.now(), - updated_at=timezone.now(), + for mention in new_entities: + mention_id = mention.get("id") + if not mention_id or (mention_id in old_ids and has_existing_logs): + continue + + details = get_entity_details(component, mention) + current_time = timezone.now() + + new_transactions.append( + PageLog( + transaction=mention_id, + page_id=page_id, + entity_identifier=details["entity_identifier"], + entity_name=details["entity_name"], + entity_type=details["entity_type"], + workspace_id=page.workspace_id, + created_at=current_time, + updated_at=current_time, + ) ) - for mention in new_mentions - if mention["id"] not in old_mention_ids or not new_page_mention + + + # Bulk insert and cleanup + if new_transactions: + PageLog.objects.bulk_create( + new_transactions, batch_size=50, ignore_conflicts=True ) - # Create new PageLog objects for new transactions - PageLog.objects.bulk_create(new_transactions, batch_size=10, ignore_conflicts=True) + if deleted_transaction_ids: + PageLog.objects.filter(transaction__in=deleted_transaction_ids).delete() - # Delete the removed transactions - PageLog.objects.filter(transaction__in=deleted_transaction_ids).delete() except Page.DoesNotExist: return except Exception as e: diff --git a/apps/api/plane/utils/content_validator.py b/apps/api/plane/utils/content_validator.py index 5163fad7dd..ff06a562fa 100644 --- a/apps/api/plane/utils/content_validator.py +++ b/apps/api/plane/utils/content_validator.py @@ -4,7 +4,9 @@ import nh3 from plane.utils.exception_logger import log_exception from bs4 import BeautifulSoup from collections import defaultdict +import logging +logger = logging.getLogger("plane.api") # Maximum allowed size for binary data (10MB) MAX_SIZE = 10 * 1024 * 1024 @@ -54,7 +56,9 @@ def validate_binary_data(data): # Check for suspicious text patterns (HTML/JS) try: decoded_text = binary_data.decode("utf-8", errors="ignore")[:200] - if any(pattern in decoded_text.lower() for pattern in SUSPICIOUS_BINARY_PATTERNS): + if any( + pattern in decoded_text.lower() for pattern in SUSPICIOUS_BINARY_PATTERNS + ): return 
False, "Binary data contains suspicious content patterns" except Exception: pass # Binary data might not be decodable as text, which is fine @@ -232,8 +236,9 @@ def validate_html_content(html_content: str): summary = json.dumps(diff) except Exception: summary = str(diff) + logger.warning(f"HTML sanitization removals: {summary}") log_exception( - f"HTML sanitization removals: {summary}", + ValueError(f"HTML sanitization removals: {summary}"), warning=True, ) return True, None, clean_html
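
For reviewers, a minimal sketch of how the single-pass extractor feeds the create/delete diffing in page_transaction. It assumes a configured Django environment where plane.bgtasks.page_transaction_task is importable; the sample HTML, IDs, and entity_name values are made up for illustration and are not taken from real page content.

# Sketch only: runs extract_all_components() on hypothetical before/after HTML to show
# how new vs. deleted transaction IDs are derived, mirroring the logic in page_transaction.
from plane.bgtasks.page_transaction_task import component_map, extract_all_components

old_html = (
    '<p><mention-component id="t1" entity_identifier="u1" '
    'entity_name="user_mention"></mention-component></p>'
)
new_html = (
    '<p><mention-component id="t2" entity_identifier="u2" '
    'entity_name="user_mention"></mention-component>'
    '<image-component id="t3" src="https://example.com/a.png"></image-component></p>'
)

old_components = extract_all_components(old_html)
new_components = extract_all_components(new_html)

for component in component_map:
    old_ids = {m.get("id") for m in old_components[component] if m.get("id")}
    new_ids = {m.get("id") for m in new_components[component] if m.get("id")}
    # In the task, new_ids - old_ids become PageLog rows and old_ids - new_ids are deleted.
    print(component, "created:", new_ids - old_ids, "deleted:", old_ids - new_ids)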
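The removed TODO listed "issue-embed-component", "img", and "todo" as component types still to track; with the new registry, supporting one is a matter of adding an entry to COMPONENT_MAP. A sketch of what such an entry could look like, noting that the attribute names and the "issue" entity_name used here are assumptions rather than the editor's actual markup:

# Hypothetical registry entry for the embed named in the old TODO; the attributes list
# and the fixed entity_name are assumed, not confirmed against the editor schema.
COMPONENT_MAP = {
    # ... existing "mention-component" and "image-component" entries ...
    "issue-embed-component": {
        "attributes": ["id", "entity_identifier"],
        "extract": lambda m: {
            "entity_name": "issue",
            "entity_type": None,
            "entity_identifier": m.get("entity_identifier"),
        },
    },
}

Because component_map is built as {**COMPONENT_MAP}, both extract_all_components and get_entity_details would pick such a new type up without further changes.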