From 60e4fe80376b513984f6ffe8be91cb19aff4cac7 Mon Sep 17 00:00:00 2001 From: FrederikBaerentsen Date: Fri, 5 Dec 2025 23:51:09 +0100 Subject: [PATCH] fix(inst): removed cloudscraper as it caused issues with rebrickable instructions --- CHANGELOG.md | 8 +++++ bricktracker/instructions.py | 62 ++++++++++++++++++++++++++---------- bricktracker/peeron_pdf.py | 13 +++++--- 3 files changed, 62 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b17bab8..94f279b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -141,6 +141,14 @@ See [Migration Guide](docs/migration_guide.md) for detailed instructions - Automatic fallback to nil.png from parts folder for set previews - Copy of nil placeholder saved as set image for consistent display across all routes - Prevents errors when downloading sets that have no set_img_url in API response +- Fix instructions download from Rebrickable + - Replaced cloudscraper with standard requests library + - Resolves 403 Forbidden errors when downloading instruction PDFs +- Fix instructions display and URL generation + - Fixed "Open PDF" button links to use correct data route + - Corrected path resolution for data/instructions folder + - Fixed instruction listing page to scan correct folder location + - Fixed Peeron PDF creation to use correct data folder path ### 1.2.4 diff --git a/bricktracker/instructions.py b/bricktracker/instructions.py index 3f9ff4e..a2ffb53 100644 --- a/bricktracker/instructions.py +++ b/bricktracker/instructions.py @@ -13,7 +13,6 @@ import requests from werkzeug.datastructures import FileStorage from werkzeug.utils import secure_filename import re -import cloudscraper from .exceptions import ErrorException, DownloadException if TYPE_CHECKING: @@ -106,20 +105,34 @@ class BrickInstructions(object): message=f'File {self.filename} already exists, skipped - Open PDF' ) - # Fetch PDF via cloudscraper (to bypass Cloudflare) - scraper = cloudscraper.create_scraper() - scraper.headers.update({ - "User-Agent": current_app.config['REBRICKABLE_USER_AGENT'] + # Use plain requests instead of cloudscraper + session = requests.Session() + session.headers.update({ + 'User-Agent': current_app.config['REBRICKABLE_USER_AGENT'], + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'DNT': '1', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'same-origin', + 'Cache-Control': 'max-age=0' }) - # Visit the instructions page first to establish session cookies + # Visit the set's instructions listing page first to establish session cookies + set_number = None if self.rebrickable: - instructions_page = f"https://rebrickable.com/instructions/{self.rebrickable.fields.set}/" - scraper.get(instructions_page) - # Set referer to the instructions page we just visited - scraper.headers.update({"Referer": instructions_page}) + set_number = self.rebrickable.fields.set + elif self.set: + set_number = self.set - resp = scraper.get(path, stream=True, allow_redirects=True) + if set_number: + instructions_page = f"https://rebrickable.com/instructions/{set_number}/" + session.get(instructions_page) + session.headers.update({"Referer": instructions_page}) + + resp = session.get(path, stream=True, allow_redirects=True) if not resp.ok: raise DownloadException(f"Failed to download: HTTP {resp.status_code}") @@ -256,20 +269,33 @@ class BrickInstructions(object): @staticmethod def find_instructions(set: str, /) -> list[Tuple[str, str]]: """ - Scrape Rebrickable’s HTML and return a list of + Scrape Rebrickable's HTML and return a list of (filename_slug, download_url). Duplicate slugs get _1, _2, … """ page_url = f"https://rebrickable.com/instructions/{set}/" logger.debug(f"[find_instructions] fetching HTML from {page_url!r}") - # Solve Cloudflare’s challenge - scraper = cloudscraper.create_scraper() - scraper.headers.update({'User-Agent': current_app.config['REBRICKABLE_USER_AGENT']}) - resp = scraper.get(page_url) + # Use plain requests instead of cloudscraper + session = requests.Session() + session.headers.update({ + 'User-Agent': current_app.config['REBRICKABLE_USER_AGENT'], + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'DNT': '1', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'none', + 'Cache-Control': 'max-age=0' + }) + + resp = session.get(page_url) if not resp.ok: raise ErrorException(f'Failed to load instructions page for {set}. HTTP {resp.status_code}') soup = BeautifulSoup(resp.content, 'html.parser') + # Match download links with or without query parameters (e.g., ?cfe=timestamp&cfk=key) link_re = re.compile(r'^/instructions/\d+/.+/download/') raw: list[tuple[str, str]] = [] @@ -282,8 +308,10 @@ class BrickInstructions(object): alt_text = img['alt'].removeprefix('LEGO Building Instructions for ') # type: ignore slug = re.sub(r'[^A-Za-z0-9]+', '-', alt_text).strip('-') - # Build the absolute download URL + # Build the absolute download URL - this preserves query parameters + # BeautifulSoup's a['href'] includes the full href with ?cfe=...&cfk=... params download_url = urljoin('https://rebrickable.com', a['href']) # type: ignore + logger.debug(f"[find_instructions] Found download link: {download_url}") raw.append((slug, download_url)) if not raw: diff --git a/bricktracker/peeron_pdf.py b/bricktracker/peeron_pdf.py index 5db9070..bc577c1 100644 --- a/bricktracker/peeron_pdf.py +++ b/bricktracker/peeron_pdf.py @@ -188,10 +188,15 @@ class PeeronPDF(object): # Get target file path def _get_target_path(self, /) -> str: """Get the full path where the PDF should be saved""" - instructions_folder = os.path.join( - current_app.static_folder, # type: ignore - current_app.config['INSTRUCTIONS_FOLDER'] - ) + folder = current_app.config['INSTRUCTIONS_FOLDER'] + + # If folder is absolute, use it directly + # Otherwise, make it relative to app root (not static folder) + if os.path.isabs(folder): + instructions_folder = folder + else: + instructions_folder = os.path.join(current_app.root_path, folder) + return os.path.join(instructions_folder, self.filename) # Create BrickInstructions instance for the generated PDF