fix(inst): removed cloudscraper as it caused issues with rebrickable instructions

This commit is contained in:
FrederikBaerentsen
2025-12-05 23:51:09 +01:00
parent 85728e2d68
commit 60e4fe8037
3 changed files with 62 additions and 21 deletions

View File

@@ -141,6 +141,14 @@ See [Migration Guide](docs/migration_guide.md) for detailed instructions
- Automatic fallback to nil.png from parts folder for set previews
- Copy of nil placeholder saved as set image for consistent display across all routes
- Prevents errors when downloading sets that have no set_img_url in API response
- Fix instructions download from Rebrickable
- Replaced cloudscraper with standard requests library
- Resolves 403 Forbidden errors when downloading instruction PDFs
- Fix instructions display and URL generation
- Fixed "Open PDF" button links to use correct data route
- Corrected path resolution for data/instructions folder
- Fixed instruction listing page to scan correct folder location
- Fixed Peeron PDF creation to use correct data folder path
### 1.2.4

View File

@@ -13,7 +13,6 @@ import requests
from werkzeug.datastructures import FileStorage
from werkzeug.utils import secure_filename
import re
import cloudscraper
from .exceptions import ErrorException, DownloadException
if TYPE_CHECKING:
@@ -106,20 +105,34 @@ class BrickInstructions(object):
message=f'File {self.filename} already exists, skipped - <a href="{pdf_url}" target="_blank" class="btn btn-sm btn-primary ms-2"><i class="ri-external-link-line"></i> Open PDF</a>'
)
# Fetch PDF via cloudscraper (to bypass Cloudflare)
scraper = cloudscraper.create_scraper()
scraper.headers.update({
"User-Agent": current_app.config['REBRICKABLE_USER_AGENT']
# Use plain requests instead of cloudscraper
session = requests.Session()
session.headers.update({
'User-Agent': current_app.config['REBRICKABLE_USER_AGENT'],
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Cache-Control': 'max-age=0'
})
# Visit the instructions page first to establish session cookies
# Visit the set's instructions listing page first to establish session cookies
set_number = None
if self.rebrickable:
instructions_page = f"https://rebrickable.com/instructions/{self.rebrickable.fields.set}/"
scraper.get(instructions_page)
# Set referer to the instructions page we just visited
scraper.headers.update({"Referer": instructions_page})
set_number = self.rebrickable.fields.set
elif self.set:
set_number = self.set
resp = scraper.get(path, stream=True, allow_redirects=True)
if set_number:
instructions_page = f"https://rebrickable.com/instructions/{set_number}/"
session.get(instructions_page)
session.headers.update({"Referer": instructions_page})
resp = session.get(path, stream=True, allow_redirects=True)
if not resp.ok:
raise DownloadException(f"Failed to download: HTTP {resp.status_code}")
@@ -256,20 +269,33 @@ class BrickInstructions(object):
@staticmethod
def find_instructions(set: str, /) -> list[Tuple[str, str]]:
"""
Scrape Rebrickables HTML and return a list of
Scrape Rebrickable's HTML and return a list of
(filename_slug, download_url). Duplicate slugs get _1, _2, …
"""
page_url = f"https://rebrickable.com/instructions/{set}/"
logger.debug(f"[find_instructions] fetching HTML from {page_url!r}")
# Solve Cloudflares challenge
scraper = cloudscraper.create_scraper()
scraper.headers.update({'User-Agent': current_app.config['REBRICKABLE_USER_AGENT']})
resp = scraper.get(page_url)
# Use plain requests instead of cloudscraper
session = requests.Session()
session.headers.update({
'User-Agent': current_app.config['REBRICKABLE_USER_AGENT'],
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Cache-Control': 'max-age=0'
})
resp = session.get(page_url)
if not resp.ok:
raise ErrorException(f'Failed to load instructions page for {set}. HTTP {resp.status_code}')
soup = BeautifulSoup(resp.content, 'html.parser')
# Match download links with or without query parameters (e.g., ?cfe=timestamp&cfk=key)
link_re = re.compile(r'^/instructions/\d+/.+/download/')
raw: list[tuple[str, str]] = []
@@ -282,8 +308,10 @@ class BrickInstructions(object):
alt_text = img['alt'].removeprefix('LEGO Building Instructions for ') # type: ignore
slug = re.sub(r'[^A-Za-z0-9]+', '-', alt_text).strip('-')
# Build the absolute download URL
# Build the absolute download URL - this preserves query parameters
# BeautifulSoup's a['href'] includes the full href with ?cfe=...&cfk=... params
download_url = urljoin('https://rebrickable.com', a['href']) # type: ignore
logger.debug(f"[find_instructions] Found download link: {download_url}")
raw.append((slug, download_url))
if not raw:

View File

@@ -188,10 +188,15 @@ class PeeronPDF(object):
# Get target file path
def _get_target_path(self, /) -> str:
"""Get the full path where the PDF should be saved"""
instructions_folder = os.path.join(
current_app.static_folder, # type: ignore
current_app.config['INSTRUCTIONS_FOLDER']
)
folder = current_app.config['INSTRUCTIONS_FOLDER']
# If folder is absolute, use it directly
# Otherwise, make it relative to app root (not static folder)
if os.path.isabs(folder):
instructions_folder = folder
else:
instructions_folder = os.path.join(current_app.root_path, folder)
return os.path.join(instructions_folder, self.filename)
# Create BrickInstructions instance for the generated PDF