better heuristic ranking of sources

This commit is contained in:
Markbeep
2025-02-21 12:31:44 +01:00
parent ae771a2e14
commit 42f099374d
7 changed files with 196 additions and 102 deletions
+1 -1
View File
@@ -73,7 +73,7 @@ tailwindcss -i styles/globals.css -o static/globals.css --watch --m
3. Start browser-sync. This hot-reloads the website whenever the HTML template files or the application source files are modified:
```sh
browser-sync http://localhost:8000 --files templates/**
browser-sync http://localhost:8000 --files templates/** --files app/**
```
**NOTE**: Website has to be visited at http://localhost:3000 instead.
+1 -1
View File
@@ -133,7 +133,7 @@ async def query_prowlarr(
params: dict[str, Any] = {
"query": query,
"categories": 3000,
"categories": 3030, # Audio/Audiobook
"type": "search",
"limit": 100,
"offset": 0,
+115 -48
View File
@@ -1,11 +1,15 @@
import asyncio
from aiohttp import ClientSession
from functools import cmp_to_key
from typing import Callable
import pydantic
from aiohttp import ClientSession
from rapidfuzz import fuzz, utils
from sqlmodel import Session
from app.models import BookRequest, ProwlarrSource
from app.util.ranking.quality import QualityProfile
from app.util.ranking.quality_extract import Quality, extract_qualities
from functools import cmp_to_key
class RankSource(pydantic.BaseModel):
@@ -30,7 +34,6 @@ async def rank_sources(
rank_sources = [x for y in await asyncio.gather(*coros) for x in y]
compare = CompareSource(quality_profile, book)
# TODO: check if the ordering is working as expected
rank_sources.sort(key=cmp_to_key(compare))
return [rs.source for rs in rank_sources]
@@ -40,13 +43,33 @@ class CompareSource:
def __init__(self, quality_profile: QualityProfile, book: BookRequest):
    """Store the ranking inputs and set up the comparator chain.

    ``compare_order`` holds the comparator methods in the order in which
    they are listed here; entries are looked up by position.
    """
    self.quality_profile = quality_profile
    self.book = book
    # Position in this list decides how early a criterion is consulted.
    self.compare_order = [
        self._compare_valid,
        self._compare_title,
        self._compare_authors,
        self._compare_narrators,
        self._compare_format,
        self._compare_flags,
        self._compare_indexer,
        self._compare_subtitle,
        self._compare_seeders,
    ]
def __call__(self, a: RankSource, b: RankSource):
    """Let the instance be called like a two-argument comparison function.

    Simply forwards both sources to ``compare`` and returns its result.
    """
    verdict = self.compare(a, b)
    return verdict
def compare(self, a: RankSource, b: RankSource) -> int:
# TODO: allow customizing of order
return self._compare_quality(a, b)
return self._get_next_compare(0)(a, b, 1)
def _get_next_compare(
    self, index: int
) -> Callable[[RankSource, RankSource, int], int]:
    """Return the comparator stored at position ``index`` of ``compare_order``.

    If ``index`` is at or past the end of the list, a fallback comparator
    that always answers ``0`` (a tie) is returned instead, which ends the
    chain of comparisons.
    """
    if index >= len(self.compare_order):

        def tie(a: RankSource, b: RankSource, next_compare: int) -> int:
            # No criteria left: the two sources are considered equal.
            return 0

        return tie
    return self.compare_order[index]
def _is_valid_quality(self, a: RankSource) -> bool:
match a.quality.file_format:
@@ -56,71 +79,115 @@ class CompareSource:
quality_range = self.quality_profile.M4B
case "mp3":
quality_range = self.quality_profile.MP3
case "unknown-audio":
quality_range = self.quality_profile.UNKNOWN_AUDIO
case "unknown":
quality_range = self.quality_profile.UNKNOWN
return quality_range[0] < a.quality.kbits < quality_range[1]
def _compare_quality(self, a: RankSource, b: RankSource) -> int:
a_valid = self._is_valid_quality(a)
b_valid = self._is_valid_quality(b)
if a_valid and not b_valid:
return -1
if not a_valid and b_valid:
return 1
return self._compare_format(a, b)
def _compare_valid(self, a: RankSource, b: RankSource, next_compare: int) -> int:
    """Order sources by whether they are usable at all.

    A source counts as valid when ``_is_valid_quality`` accepts it and it has
    more than zero seeders. If exactly one of the two is valid, the result is
    negative when that one is ``a`` and positive when it is ``b``. Otherwise
    the decision is handed to the comparator at index ``next_compare``.
    """
    a_ok, b_ok = [
        self._is_valid_quality(candidate) and candidate.source.seeders > 0
        for candidate in (a, b)
    ]
    if a_ok != b_ok:
        return int(b_ok) - int(a_ok)
    return self._get_next_compare(next_compare)(a, b, next_compare + 1)
def _compare_format(self, a: RankSource, b: RankSource) -> int:
def _compare_format(self, a: RankSource, b: RankSource, next_compare: int) -> int:
if a.quality.file_format == b.quality.file_format:
return self._compare_flags(a, b)
a_index = self.quality_profile.get_quality_rank(a.quality.file_format)
b_index = self.quality_profile.get_quality_rank(b.quality.file_format)
return self._get_next_compare(next_compare)(a, b, next_compare + 1)
a_index = self.quality_profile.calculate_quality_rank(a.quality.file_format)
b_index = self.quality_profile.calculate_quality_rank(b.quality.file_format)
return a_index - b_index
def _compare_flags(self, a: RankSource, b: RankSource) -> int:
def _compare_flags(self, a: RankSource, b: RankSource, next_compare: int) -> int:
a_score = sum(
points
for flag, points in self.quality_profile.flags
for flag, points in self.quality_profile.indexer_flags
if flag.lower() in a.source.indexer_flags
)
b_score = sum(
points
for flag, points in self.quality_profile.flags
for flag, points in self.quality_profile.indexer_flags
if flag.lower() in b.source.indexer_flags
)
if a_score == b_score:
return self._compare_indexer(a, b)
return a_score - b_score
def _compare_indexer(self, a: RankSource, b: RankSource) -> int:
a_index = self.quality_profile.get_indexer_rank(a.source.indexer_id)
b_index = self.quality_profile.get_indexer_rank(b.source.indexer_id)
if a_index == b_index:
return self._compare_by_name(a, b)
return a_index - b_index
def _compare_by_name(self, a: RankSource, b: RankSource) -> int:
a_score = add_scores(self.book, a)
b_score = add_scores(self.book, b)
if a_score == b_score:
return self._compare_seeders(a, b)
return self._get_next_compare(next_compare)(a, b, next_compare + 1)
return b_score - a_score
def _compare_seeders(self, a: RankSource, b: RankSource) -> int:
def _compare_indexer(self, a: RankSource, b: RankSource, next_compare: int) -> int:
    """Order sources by the rank the quality profile assigns to their indexer.

    Returns the difference ``rank(a) - rank(b)``, so the result is negative
    when ``a``'s indexer has the smaller rank. Equal ranks defer to the
    comparator at index ``next_compare``.
    """
    rank_of = self.quality_profile.calculate_indexer_rank
    delta = rank_of(a.source.indexer_id) - rank_of(b.source.indexer_id)
    if delta != 0:
        return delta
    return self._get_next_compare(next_compare)(a, b, next_compare + 1)
def _compare_title(self, a: RankSource, b: RankSource, next_compare: int) -> int:
    """Order sources by whether the book title is found in the source title.

    Matching is delegated to ``exists_in_title`` using the profile's
    ``title_exists_ratio`` threshold. If only one source matches, the result
    is negative when that source is ``a``; otherwise the comparator at index
    ``next_compare`` decides.
    """
    threshold = self.quality_profile.title_exists_ratio
    wanted = self.book.title
    a_hit = exists_in_title(wanted, a.source.title, threshold)
    b_hit = exists_in_title(wanted, b.source.title, threshold)
    if a_hit != b_hit:
        return int(b_hit) - int(a_hit)
    return self._get_next_compare(next_compare)(a, b, next_compare + 1)
def _compare_subtitle(self, a: RankSource, b: RankSource, next_compare: int) -> int:
    """Order sources by whether the book subtitle is found in the source title.

    Books without a subtitle skip this criterion entirely. Otherwise matching
    uses ``exists_in_title`` with the profile's ``title_exists_ratio``; if
    only one source matches, the result is negative when that source is
    ``a``. Ties defer to the comparator at index ``next_compare``.
    """
    subtitle = self.book.subtitle
    if subtitle:
        threshold = self.quality_profile.title_exists_ratio
        a_hit = exists_in_title(subtitle, a.source.title, threshold)
        b_hit = exists_in_title(subtitle, b.source.title, threshold)
        if a_hit != b_hit:
            return int(b_hit) - int(a_hit)
    # Either there is no subtitle to look for, or both sources agree.
    return self._get_next_compare(next_compare)(a, b, next_compare + 1)
def _compare_authors(self, a: RankSource, b: RankSource, next_compare: int) -> int:
    """Order sources by how many of the book's authors appear in their title.

    Counts come from ``vaguely_exist_in_title`` with the profile's
    ``name_exists_ratio`` threshold. The result is ``count(b) - count(a)``,
    i.e. negative when ``a`` mentions more authors. Equal counts defer to the
    comparator at index ``next_compare``.
    """
    threshold = self.quality_profile.name_exists_ratio
    a_hits, b_hits = [
        vaguely_exist_in_title(self.book.authors, candidate.source.title, threshold)
        for candidate in (a, b)
    ]
    if a_hits != b_hits:
        return b_hits - a_hits
    return self._get_next_compare(next_compare)(a, b, next_compare + 1)
def _compare_narrators(
    self, a: RankSource, b: RankSource, next_compare: int
) -> int:
    """Order sources by how many of the book's narrators appear in their title.

    Counts come from ``vaguely_exist_in_title`` with the profile's
    ``name_exists_ratio`` threshold. The result is ``count(b) - count(a)``,
    i.e. negative when ``a`` mentions more narrators. Equal counts defer to
    the comparator at index ``next_compare``.
    """
    threshold = self.quality_profile.name_exists_ratio
    a_hits, b_hits = [
        vaguely_exist_in_title(
            self.book.narrators, candidate.source.title, threshold
        )
        for candidate in (a, b)
    ]
    if a_hits != b_hits:
        return b_hits - a_hits
    return self._get_next_compare(next_compare)(a, b, next_compare + 1)
def _compare_seeders(self, a: RankSource, b: RankSource, next_compare: int) -> int:
    """Order sources by seeder count.

    Returns ``b.seeders - a.seeders``, so the result is negative when ``a``
    has more seeders. Equal counts defer to the comparator at index
    ``next_compare``.
    """
    a_seeders = a.source.seeders
    b_seeders = b.source.seeders
    if a_seeders != b_seeders:
        return b_seeders - a_seeders
    return self._get_next_compare(next_compare)(a, b, next_compare + 1)
def add_scores(book: BookRequest, a: RankSource) -> int:
score = 0
if book.title not in a.source.title:
score -= 100
def vaguely_exist_in_title(words: list[str], title: str, name_exists_ratio: int) -> int:
    """Count how many entries of ``words`` fuzzily match ``title``.

    Each entry is scored against ``title`` with ``fuzz.token_set_ratio``
    (both sides preprocessed by ``utils.default_process``) and counted when
    the score is strictly greater than ``name_exists_ratio``.
    """
    matches = 0
    for word in words:
        similarity = fuzz.token_set_ratio(
            word, title, processor=utils.default_process
        )
        if similarity > name_exists_ratio:
            matches += 1
    return matches
for author in book.authors:
if author in a.source.title:
score += 10
for narrator in book.narrators:
if narrator not in book.authors and narrator in a.source.title:
score += 20
return score
def exists_in_title(word: str, title: str, title_exists_ratio: int) -> bool:
    """Tell whether ``word`` fuzzily occurs inside ``title``.

    The pair is scored with ``fuzz.partial_ratio`` (both sides preprocessed
    by ``utils.default_process``); the answer is ``True`` only when the score
    is strictly greater than ``title_exists_ratio``.
    """
    similarity = fuzz.partial_ratio(word, title, processor=utils.default_process)
    return similarity > title_exists_ratio
+13 -12
View File
@@ -1,31 +1,32 @@
import math
from typing import Literal
import pydantic
FileFormat = Literal["flac", "m4b", "mp3", "unknown"]
FileFormat = Literal["flac", "m4b", "mp3", "unknown-audio", "unknown"]
class QualityProfile(pydantic.BaseModel):
FLAC: tuple[float, float] = (0, math.inf)
M4B: tuple[float, float] = (0, math.inf)
MP3: tuple[float, float] = (0, math.inf)
UNKNOWN: tuple[float, float] = (0, math.inf)
FLAC: tuple[float, float] = (20.0, 400.0)
M4B: tuple[float, float] = (20.0, 400.0)
MP3: tuple[float, float] = (20.0, 400.0)
UNKNOWN_AUDIO: tuple[float, float] = (20.0, 400.0)
UNKNOWN: tuple[float, float] = (20.0, 400.0)
flags: list[tuple[str, int]] = []
format_order: list[FileFormat] = ["flac", "m4b", "mp3", "unknown"]
indexer_flags: list[tuple[str, int]] = []
format_order: list[FileFormat] = ["flac", "m4b", "mp3", "unknown-audio", "unknown"]
"""Order of file formats from highest to lowest quality"""
indexer_order: list[int] = []
"""Order of indexers from highest to lowest quality"""
name_exists_ratio: int = 75
title_exists_ratio: int = 90
def get_quality_rank(self, file_format: FileFormat) -> int:
def calculate_quality_rank(self, file_format: FileFormat) -> int:
try:
return self.format_order.index(file_format)
except ValueError:
return len(self.format_order)
def get_indexer_rank(self, indexer_id: int) -> int:
def calculate_indexer_rank(self, indexer_id: int) -> int:
try:
return self.indexer_order.index(indexer_id)
except ValueError:
+63 -39
View File
@@ -12,6 +12,10 @@ import os
from app.models import BookRequest, ProwlarrSource
from app.util.ranking.quality import FileFormat
# HACK: Disabled because it doesn't work well with ratelimiting
# We instead completely rely on the title and size of the complete torrent
ENABLE_TORRENT_INSPECTION = False
class Quality(pydantic.BaseModel):
kbits: float
@@ -77,9 +81,11 @@ async def extract_qualities(
raise ValueError("Prowlarr API key not set")
book_seconds = book.runtime_length_min * 60
if book_seconds == 0:
return []
data = None
if source.download_url:
if source.download_url and ENABLE_TORRENT_INSPECTION:
try:
for _ in range(3):
async with client_session.get(
@@ -93,45 +99,63 @@ async def extract_qualities(
else:
return []
except aiohttp.NonHttpUrlRedirectClientError as e:
print(e.args) # tuple. first element is a magnet link
return []
source.magnet_url = e.args[0]
source.download_url = None
if not data:
return []
# TODO: correctly fix wrong torrent parsing
parsed = tp.decode(data, hash_fields={"pieces": (1, False)})
actual_sizes: dict[FileFormat, int] = defaultdict(int)
file_formats = set()
if "info" not in parsed or "files" not in parsed["info"]:
return []
for f in parsed["info"]["files"]:
size: int = f["length"]
path: str = f["path"][-1]
_, ext = os.path.splitext(path)
ext = ext.lower()
if ext == ".flac":
file_formats.add("flac")
actual_sizes["flac"] += size
elif ext == ".m4b":
file_formats.add("m4b")
actual_sizes["m4b"] += size
elif ext == ".mp3":
file_formats.add("mp3")
actual_sizes["mp3"] += size
elif ext in audio_file_formats:
file_formats.add("unknown")
actual_sizes["unknown"] += size
qualities = []
for k, v in actual_sizes.items():
qualities.append(
Quality(
kbits=v / (book_seconds * 60) / 1024 * 8,
file_format=k,
)
)
return qualities
if data:
return get_torrent_info(data, book_seconds)
# TODO: use the magnet url to fetch the file information
return []
file_format: FileFormat = "unknown"
if "mp3" in source.title.lower():
file_format = "mp3"
elif "flac" in source.title.lower():
file_format = "flac"
elif "m4b" in source.title.lower():
file_format = "m4b"
elif "audiobook" in source.title.lower():
file_format = "unknown-audio"
return [
Quality(kbits=8 * source.size / book_seconds / 1000, file_format=file_format)
]
def get_torrent_info(data: bytes, book_seconds: int) -> list[Quality]:
    """Estimate the per-format bitrate of the audio files listed in a torrent.

    The torrent metadata in ``data`` is decoded, the sizes of all files are
    summed per audio format (decided by the extension of the last path
    component), and each sum is converted into an average bitrate over
    ``book_seconds``.

    Args:
        data: Raw contents of a ``.torrent`` file.
        book_seconds: Runtime of the book in seconds.

    Returns:
        One ``Quality`` per audio format found, in the order the formats were
        first encountered. An empty list is returned when ``book_seconds`` is
        not positive, when the data cannot be decoded, or when the decoded
        metadata has no ``info``/``files`` entries.
    """
    # A non-positive runtime cannot produce a meaningful bitrate and would
    # otherwise raise ZeroDivisionError below.
    if book_seconds <= 0:
        return []

    try:
        # TODO: correctly fix wrong torrent parsing
        parsed = tp.decode(data, hash_fields={"pieces": (1, False)})
    except tp.InvalidTorrentDataException:
        return []

    # NOTE(review): torrents without a "files" list (presumably single-file
    # torrents) are skipped here — confirm whether they should be handled.
    if "info" not in parsed or "files" not in parsed["info"]:
        return []

    # Extensions that map directly onto a dedicated format bucket.
    known_formats: dict[str, FileFormat] = {
        ".flac": "flac",
        ".m4b": "m4b",
        ".mp3": "mp3",
    }

    actual_sizes: dict[FileFormat, int] = defaultdict(int)
    for f in parsed["info"]["files"]:
        size: int = f["length"]
        path: str = f["path"][-1]
        ext = os.path.splitext(path)[1].lower()

        file_format = known_formats.get(ext)
        if file_format is None:
            if ext not in audio_file_formats:
                # Not an audio file (cover art, nfo, ...): ignore it.
                continue
            # NOTE(review): other audio extensions land in "unknown" rather
            # than "unknown-audio" — confirm this is the intended bucket.
            file_format = "unknown"
        actual_sizes[file_format] += size

    # bytes -> bits (x8), averaged per second, expressed in kilobits (/1000).
    return [
        Quality(kbits=8 * total / book_seconds / 1000, file_format=file_format)
        for file_format, total in actual_sizes.items()
    ]
+1
View File
@@ -45,6 +45,7 @@ pytailwindcss==0.2.0
python-dotenv==1.0.1
python-multipart==0.0.20
PyYAML==6.0.2
RapidFuzz==3.12.1
rich==13.9.4
rich-toolkit==0.13.2
shellingham==1.5.4
+2 -1
View File
@@ -38,6 +38,7 @@
<th></th>
<th>title</th>
<th>indexer</th>
<th>flags</th>
<th>seed / leech</th>
<th>size (MB)</th>
<th>publish date</th>
@@ -55,7 +56,7 @@
<a href="{{ source.info_url }}" class="link">{{ source.title }}</a>
</td>
<td>{{ indexers[source.indexer_id].name }}</td>
<td>{{ source.indexer_flags|join(', ') }}</td>
<td>{{ source.seeders }} / {{ source.leechers }}</td>
<td>{{ source.size_MB }}</td>
<td>{{ source.publish_date.strftime("%d. %b %Y") }}</td>