From 60e4fe80376b513984f6ffe8be91cb19aff4cac7 Mon Sep 17 00:00:00 2001
From: FrederikBaerentsen <frederik+gitea@baerentsen.net>
Date: Fri, 5 Dec 2025 23:51:09 +0100
Subject: [PATCH] fix(inst): removed cloudscraper as it caused issues with
 rebrickable instructions

---
 CHANGELOG.md                 |  8 +++++
 bricktracker/instructions.py | 62 ++++++++++++++++++++++++++----------
 bricktracker/peeron_pdf.py   | 13 +++++---
 3 files changed, 62 insertions(+), 21 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b17bab8..94f279b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -141,6 +141,14 @@ See [Migration Guide](docs/migration_guide.md) for detailed instructions
   - Automatic fallback to nil.png from parts folder for set previews
   - Copy of nil placeholder saved as set image for consistent display across all routes
   - Prevents errors when downloading sets that have no set_img_url in API response
+- Fix instructions download from Rebrickable
+  - Replaced cloudscraper with standard requests library
+  - Resolves 403 Forbidden errors when downloading instruction PDFs
+- Fix instructions display and URL generation
+  - Fixed "Open PDF" button links to use correct data route
+  - Corrected path resolution for data/instructions folder
+  - Fixed instruction listing page to scan correct folder location
+  - Fixed Peeron PDF creation to use correct data folder path
     
 ### 1.2.4
 
diff --git a/bricktracker/instructions.py b/bricktracker/instructions.py
index 3f9ff4e..a2ffb53 100644
--- a/bricktracker/instructions.py
+++ b/bricktracker/instructions.py
@@ -13,7 +13,6 @@ import requests
 from werkzeug.datastructures import FileStorage
 from werkzeug.utils import secure_filename
 import re
-import cloudscraper
 
 from .exceptions import ErrorException, DownloadException
 if TYPE_CHECKING:
@@ -106,20 +105,34 @@ class BrickInstructions(object):
                     message=f'File {self.filename} already exists, skipped - <a href="{pdf_url}" target="_blank" class="btn btn-sm btn-primary ms-2"><i class="ri-external-link-line"></i> Open PDF</a>'
                 )
 
-            # Fetch PDF via cloudscraper (to bypass Cloudflare)
-            scraper = cloudscraper.create_scraper()
-            scraper.headers.update({
-                "User-Agent": current_app.config['REBRICKABLE_USER_AGENT']
+            # Plain requests session with browser-like headers (cloudscraper caused 403 Forbidden on PDF downloads)
+            session = requests.Session()
+            session.headers.update({
+                'User-Agent': current_app.config['REBRICKABLE_USER_AGENT'],
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+                'Accept-Language': 'en-US,en;q=0.5',
+                'DNT': '1',
+                'Connection': 'keep-alive',
+                'Upgrade-Insecure-Requests': '1',
+                'Sec-Fetch-Dest': 'document',
+                'Sec-Fetch-Mode': 'navigate',
+                'Sec-Fetch-Site': 'same-origin',
+                'Cache-Control': 'max-age=0'
             })
 
-            # Visit the instructions page first to establish session cookies
+            # Visit the set's instructions listing page first to establish session cookies
+            set_number = None
             if self.rebrickable:
-                instructions_page = f"https://rebrickable.com/instructions/{self.rebrickable.fields.set}/"
-                scraper.get(instructions_page)
-                # Set referer to the instructions page we just visited
-                scraper.headers.update({"Referer": instructions_page})
+                set_number = self.rebrickable.fields.set
+            elif self.set:
+                set_number = self.set
 
-            resp = scraper.get(path, stream=True, allow_redirects=True)
+            if set_number:
+                instructions_page = f"https://rebrickable.com/instructions/{set_number}/"
+                session.get(instructions_page)
+                session.headers.update({"Referer": instructions_page})
+
+            resp = session.get(path, stream=True, allow_redirects=True)
             if not resp.ok:
                 raise DownloadException(f"Failed to download: HTTP {resp.status_code}")
 
@@ -256,20 +269,33 @@ class BrickInstructions(object):
     @staticmethod
     def find_instructions(set: str, /) -> list[Tuple[str, str]]:
         """
-        Scrape Rebrickable’s HTML and return a list of
+        Scrape Rebrickable's HTML and return a list of
         (filename_slug, download_url). Duplicate slugs get _1, _2, …
         """
         page_url = f"https://rebrickable.com/instructions/{set}/"
         logger.debug(f"[find_instructions] fetching HTML from {page_url!r}")
 
-        # Solve Cloudflare’s challenge
-        scraper = cloudscraper.create_scraper()
-        scraper.headers.update({'User-Agent': current_app.config['REBRICKABLE_USER_AGENT']})
-        resp = scraper.get(page_url)
+        # Plain requests session with browser-like headers (cloudscraper is no longer used)
+        session = requests.Session()
+        session.headers.update({
+            'User-Agent': current_app.config['REBRICKABLE_USER_AGENT'],
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'DNT': '1',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
+            'Sec-Fetch-Dest': 'document',
+            'Sec-Fetch-Mode': 'navigate',
+            'Sec-Fetch-Site': 'none',
+            'Cache-Control': 'max-age=0'
+        })
+
+        resp = session.get(page_url)
         if not resp.ok:
             raise ErrorException(f'Failed to load instructions page for {set}. HTTP {resp.status_code}')
 
         soup = BeautifulSoup(resp.content, 'html.parser')
+        # Match download links with or without query parameters (e.g., ?cfe=timestamp&cfk=key)
         link_re = re.compile(r'^/instructions/\d+/.+/download/')
 
         raw: list[tuple[str, str]] = []
@@ -282,8 +308,10 @@ class BrickInstructions(object):
             alt_text = img['alt'].removeprefix('LEGO Building Instructions for ') # type: ignore
             slug = re.sub(r'[^A-Za-z0-9]+', '-', alt_text).strip('-')
 
-            # Build the absolute download URL
+            # Build the absolute download URL; urljoin keeps the href's query string,
+            # so parameters such as ?cfe=...&cfk=... present in a['href'] are preserved
             download_url = urljoin('https://rebrickable.com', a['href']) # type: ignore
+            logger.debug(f"[find_instructions] Found download link: {download_url}")
             raw.append((slug, download_url))
 
         if not raw:
diff --git a/bricktracker/peeron_pdf.py b/bricktracker/peeron_pdf.py
index 5db9070..bc577c1 100644
--- a/bricktracker/peeron_pdf.py
+++ b/bricktracker/peeron_pdf.py
@@ -188,10 +188,15 @@ class PeeronPDF(object):
     # Get target file path
     def _get_target_path(self, /) -> str:
         """Get the full path where the PDF should be saved"""
-        instructions_folder = os.path.join(
-            current_app.static_folder,  # type: ignore
-            current_app.config['INSTRUCTIONS_FOLDER']
-        )
+        folder = current_app.config['INSTRUCTIONS_FOLDER']
+
+        # If folder is absolute, use it directly
+        # Otherwise, make it relative to app root (not static folder)
+        if os.path.isabs(folder):
+            instructions_folder = folder
+        else:
+            instructions_folder = os.path.join(current_app.root_path, folder)
+
         return os.path.join(instructions_folder, self.filename)
 
     # Create BrickInstructions instance for the generated PDF