fix(inst): removed cloudscraper as it caused issues with Rebrickable instructions

2025-12-30 13:19:59 -06:00 · 2025-12-05 23:51:09 +01:00
parent 85728e2d68
commit 60e4fe8037
3 changed files with 62 additions and 21 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -141,6 +141,14 @@ See [Migration Guide](docs/migration_guide.md) for detailed instructions
  - Automatic fallback to nil.png from parts folder for set previews
  - Copy of nil placeholder saved as set image for consistent display across all routes
  - Prevents errors when downloading sets that have no set_img_url in API response
+- Fix instructions download from Rebrickable
+  - Replaced cloudscraper with standard requests library
+  - Resolves 403 Forbidden errors when downloading instruction PDFs
+- Fix instructions display and URL generation
+  - Fixed "Open PDF" button links to use correct data route
+  - Corrected path resolution for data/instructions folder
+  - Fixed instruction listing page to scan correct folder location
+  - Fixed Peeron PDF creation to use correct data folder path
    
 ### 1.2.4

--- a/bricktracker/instructions.py
+++ b/bricktracker/instructions.py
@@ -13,7 +13,6 @@ import requests
 from werkzeug.datastructures import FileStorage
 from werkzeug.utils import secure_filename
 import re
-import cloudscraper

 from .exceptions import ErrorException, DownloadException
 if TYPE_CHECKING:
@@ -106,20 +105,34 @@ class BrickInstructions(object):
                    message=f'File {self.filename} already exists, skipped - <a href="{pdf_url}" target="_blank" class="btn btn-sm btn-primary ms-2"><i class="ri-external-link-line"></i> Open PDF</a>'
                )

-            # Fetch PDF via cloudscraper (to bypass Cloudflare)
-            scraper = cloudscraper.create_scraper()
-            scraper.headers.update({
-                "User-Agent": current_app.config['REBRICKABLE_USER_AGENT']
+            # Use plain requests instead of cloudscraper
+            session = requests.Session()
+            session.headers.update({
+                'User-Agent': current_app.config['REBRICKABLE_USER_AGENT'],
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+                'Accept-Language': 'en-US,en;q=0.5',
+                'DNT': '1',
+                'Connection': 'keep-alive',
+                'Upgrade-Insecure-Requests': '1',
+                'Sec-Fetch-Dest': 'document',
+                'Sec-Fetch-Mode': 'navigate',
+                'Sec-Fetch-Site': 'same-origin',
+                'Cache-Control': 'max-age=0'
            })

-            # Visit the instructions page first to establish session cookies
+            # Visit the set's instructions listing page first to establish session cookies
+            set_number = None
            if self.rebrickable:
-                instructions_page = f"https://rebrickable.com/instructions/{self.rebrickable.fields.set}/"
-                scraper.get(instructions_page)
-                # Set referer to the instructions page we just visited
-                scraper.headers.update({"Referer": instructions_page})
+                set_number = self.rebrickable.fields.set
+            elif self.set:
+                set_number = self.set

-            resp = scraper.get(path, stream=True, allow_redirects=True)
+            if set_number:
+                instructions_page = f"https://rebrickable.com/instructions/{set_number}/"
+                session.get(instructions_page)
+                session.headers.update({"Referer": instructions_page})
+
+            resp = session.get(path, stream=True, allow_redirects=True)
            if not resp.ok:
                raise DownloadException(f"Failed to download: HTTP {resp.status_code}")

@@ -256,20 +269,33 @@ class BrickInstructions(object):
    @staticmethod
    def find_instructions(set: str, /) -> list[Tuple[str, str]]:
        """
-        Scrape Rebrickable’s HTML and return a list of
+        Scrape Rebrickable's HTML and return a list of
        (filename_slug, download_url). Duplicate slugs get _1, _2, …
        """
        page_url = f"https://rebrickable.com/instructions/{set}/"
        logger.debug(f"[find_instructions] fetching HTML from {page_url!r}")

-        # Solve Cloudflare’s challenge
-        scraper = cloudscraper.create_scraper()
-        scraper.headers.update({'User-Agent': current_app.config['REBRICKABLE_USER_AGENT']})
-        resp = scraper.get(page_url)
+        # Use plain requests instead of cloudscraper
+        session = requests.Session()
+        session.headers.update({
+            'User-Agent': current_app.config['REBRICKABLE_USER_AGENT'],
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'DNT': '1',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
+            'Sec-Fetch-Dest': 'document',
+            'Sec-Fetch-Mode': 'navigate',
+            'Sec-Fetch-Site': 'none',
+            'Cache-Control': 'max-age=0'
+        })
+
+        resp = session.get(page_url)
        if not resp.ok:
            raise ErrorException(f'Failed to load instructions page for {set}. HTTP {resp.status_code}')

        soup = BeautifulSoup(resp.content, 'html.parser')
+        # Match download links with or without query parameters (e.g., ?cfe=timestamp&cfk=key)
        link_re = re.compile(r'^/instructions/\d+/.+/download/')

        raw: list[tuple[str, str]] = []
@@ -282,8 +308,10 @@ class BrickInstructions(object):
            alt_text = img['alt'].removeprefix('LEGO Building Instructions for ') # type: ignore
            slug = re.sub(r'[^A-Za-z0-9]+', '-', alt_text).strip('-')

-            # Build the absolute download URL
+            # Build the absolute download URL - this preserves query parameters
+            # BeautifulSoup's a['href'] includes the full href with ?cfe=...&cfk=... params
            download_url = urljoin('https://rebrickable.com', a['href']) # type: ignore
+            logger.debug(f"[find_instructions] Found download link: {download_url}")
            raw.append((slug, download_url))

        if not raw:
--- a/bricktracker/peeron_pdf.py
+++ b/bricktracker/peeron_pdf.py
@@ -188,10 +188,15 @@ class PeeronPDF(object):
    # Get target file path
    def _get_target_path(self, /) -> str:
        """Get the full path where the PDF should be saved"""
-        instructions_folder = os.path.join(
-            current_app.static_folder,  # type: ignore
-            current_app.config['INSTRUCTIONS_FOLDER']
-        )
+        folder = current_app.config['INSTRUCTIONS_FOLDER']
+
+        # If folder is absolute, use it directly
+        # Otherwise, make it relative to app root (not static folder)
+        if os.path.isabs(folder):
+            instructions_folder = folder
+        else:
+            instructions_folder = os.path.join(current_app.root_path, folder)
+
        return os.path.join(instructions_folder, self.filename)

    # Create BrickInstructions instance for the generated PDF