mirror of
https://github.com/markbeep/AudioBookRequest.git
synced 2026-05-12 12:08:25 -05:00
better heuristic ranking of sources
This commit is contained in:
@@ -73,7 +73,7 @@ tailwindcss -i styles/globals.css -o static/globals.css --watch --m
|
||||
3. Start browser-sync. This hot reloads the website when the html template files are modified:
|
||||
|
||||
```sh
|
||||
browser-sync http://localhost:8000 --files templates/**
|
||||
browser-sync http://localhost:8000 --files templates/** --files app/**
|
||||
```
|
||||
|
||||
**NOTE**: Website has to be visited at http://localhost:3000 instead.
|
||||
|
||||
@@ -133,7 +133,7 @@ async def query_prowlarr(
|
||||
|
||||
params: dict[str, Any] = {
|
||||
"query": query,
|
||||
"categories": 3000,
|
||||
"categories": 3030, # Audio/Audiobook
|
||||
"type": "search",
|
||||
"limit": 100,
|
||||
"offset": 0,
|
||||
|
||||
@@ -1,11 +1,15 @@
|
||||
import asyncio
|
||||
from aiohttp import ClientSession
|
||||
from functools import cmp_to_key
|
||||
from typing import Callable
|
||||
|
||||
import pydantic
|
||||
from aiohttp import ClientSession
|
||||
from rapidfuzz import fuzz, utils
|
||||
from sqlmodel import Session
|
||||
|
||||
from app.models import BookRequest, ProwlarrSource
|
||||
from app.util.ranking.quality import QualityProfile
|
||||
from app.util.ranking.quality_extract import Quality, extract_qualities
|
||||
from functools import cmp_to_key
|
||||
|
||||
|
||||
class RankSource(pydantic.BaseModel):
|
||||
@@ -30,7 +34,6 @@ async def rank_sources(
|
||||
rank_sources = [x for y in await asyncio.gather(*coros) for x in y]
|
||||
|
||||
compare = CompareSource(quality_profile, book)
|
||||
# TODO: check if the ordering is working as expected
|
||||
rank_sources.sort(key=cmp_to_key(compare))
|
||||
|
||||
return [rs.source for rs in rank_sources]
|
||||
@@ -40,13 +43,33 @@ class CompareSource:
|
||||
def __init__(self, quality_profile: QualityProfile, book: BookRequest):
    """Store the ranking inputs and the ordered chain of comparison steps.

    Args:
        quality_profile: Profile queried by the individual comparison steps.
        book: The requested book that candidate sources are matched against.
    """
    # Steps are tried in list order. Each step either decides the comparison
    # or hands over to the step at the following index.
    ordered_steps: list[Callable[[RankSource, RankSource, int], int]] = [
        self._compare_valid,
        self._compare_title,
        self._compare_authors,
        self._compare_narrators,
        self._compare_format,
        self._compare_flags,
        self._compare_indexer,
        self._compare_subtitle,
        self._compare_seeders,
    ]
    self.quality_profile = quality_profile
    self.book = book
    self.compare_order = ordered_steps
|
||||
|
||||
def __call__(self, a: RankSource, b: RankSource):
    """Make the instance usable as a two-argument comparison function.

    Simply forwards to :meth:`compare` and returns its result.
    """
    verdict = self.compare(a, b)
    return verdict
|
||||
|
||||
def compare(self, a: RankSource, b: RankSource) -> int:
    """Compare two sources by walking the ``compare_order`` chain.

    Starts at the first step in ``self.compare_order``; each step either
    returns a non-zero verdict or delegates to the step after it.

    Args:
        a: First candidate source.
        b: Second candidate source.

    Returns:
        A negative, zero or positive int in the usual cmp convention.
    """
    # TODO: allow customizing of order
    # The previous early return through the legacy quality-only comparison
    # made the chain below unreachable, so it is dropped here.
    first_step = self._get_next_compare(0)
    return first_step(a, b, 1)
|
||||
|
||||
def _get_next_compare(
    self, index: int
) -> Callable[[RankSource, RankSource, int], int]:
    """Return the comparison step stored at ``index`` in ``compare_order``.

    When ``index`` is at or past the end of the chain, a terminal step that
    always reports a tie (``0``) is returned instead.
    """
    if index >= len(self.compare_order):

        def exhausted(a: RankSource, b: RankSource, next_compare: int) -> int:
            # No steps left to break the tie.
            return 0

        return exhausted

    return self.compare_order[index]
|
||||
|
||||
def _is_valid_quality(self, a: RankSource) -> bool:
|
||||
match a.quality.file_format:
|
||||
@@ -56,71 +79,115 @@ class CompareSource:
|
||||
quality_range = self.quality_profile.M4B
|
||||
case "mp3":
|
||||
quality_range = self.quality_profile.MP3
|
||||
case "unknown-audio":
|
||||
quality_range = self.quality_profile.UNKNOWN_AUDIO
|
||||
case "unknown":
|
||||
quality_range = self.quality_profile.UNKNOWN
|
||||
|
||||
return quality_range[0] < a.quality.kbits < quality_range[1]
|
||||
|
||||
def _compare_quality(self, a: RankSource, b: RankSource) -> int:
|
||||
a_valid = self._is_valid_quality(a)
|
||||
b_valid = self._is_valid_quality(b)
|
||||
if a_valid and not b_valid:
|
||||
return -1
|
||||
if not a_valid and b_valid:
|
||||
return 1
|
||||
return self._compare_format(a, b)
|
||||
def _compare_valid(self, a: RankSource, b: RankSource, next_compare: int) -> int:
    """Prefer sources that pass the validity checks.

    A source counts as valid when ``_is_valid_quality`` accepts it and it
    has more than zero seeders. If both sources agree on validity, the
    decision is handed to the step at index ``next_compare``.
    """

    def usable(candidate: RankSource) -> bool:
        return self._is_valid_quality(candidate) and candidate.source.seeders > 0

    a_ok = usable(a)
    b_ok = usable(b)
    if a_ok != b_ok:
        # Negative when only `a` is valid, positive when only `b` is.
        return int(b_ok) - int(a_ok)

    following = self._get_next_compare(next_compare)
    return following(a, b, next_compare + 1)
|
||||
|
||||
def _compare_format(self, a: RankSource, b: RankSource, next_compare: int) -> int:
    """Compare two sources by the rank of their file format.

    Ranks come from ``quality_profile.calculate_quality_rank``; the source
    with the smaller rank yields a negative result. When both ranks are
    equal the decision is handed to the step at index ``next_compare``.
    Equal ranks include identical formats and formats that the profile
    ranks the same.

    Args:
        a: First candidate source.
        b: Second candidate source.
        next_compare: Index of the step to delegate to on a tie.

    Returns:
        The rank difference ``a - b``, or the next step's verdict on a tie.
    """
    a_index = self.quality_profile.calculate_quality_rank(a.quality.file_format)
    b_index = self.quality_profile.calculate_quality_rank(b.quality.file_format)
    if a_index == b_index:
        # Compare ranks rather than format names so that two different
        # formats sharing a rank still fall through instead of ending the
        # chain with a tie.
        return self._get_next_compare(next_compare)(a, b, next_compare + 1)
    return a_index - b_index
|
||||
|
||||
def _compare_flags(self, a: RankSource, b: RankSource) -> int:
|
||||
def _compare_flags(self, a: RankSource, b: RankSource, next_compare: int) -> int:
|
||||
a_score = sum(
|
||||
points
|
||||
for flag, points in self.quality_profile.flags
|
||||
for flag, points in self.quality_profile.indexer_flags
|
||||
if flag.lower() in a.source.indexer_flags
|
||||
)
|
||||
b_score = sum(
|
||||
points
|
||||
for flag, points in self.quality_profile.flags
|
||||
for flag, points in self.quality_profile.indexer_flags
|
||||
if flag.lower() in b.source.indexer_flags
|
||||
)
|
||||
if a_score == b_score:
|
||||
return self._compare_indexer(a, b)
|
||||
return a_score - b_score
|
||||
|
||||
def _compare_indexer(self, a: RankSource, b: RankSource) -> int:
|
||||
a_index = self.quality_profile.get_indexer_rank(a.source.indexer_id)
|
||||
b_index = self.quality_profile.get_indexer_rank(b.source.indexer_id)
|
||||
if a_index == b_index:
|
||||
return self._compare_by_name(a, b)
|
||||
return a_index - b_index
|
||||
|
||||
def _compare_by_name(self, a: RankSource, b: RankSource) -> int:
|
||||
a_score = add_scores(self.book, a)
|
||||
b_score = add_scores(self.book, b)
|
||||
if a_score == b_score:
|
||||
return self._compare_seeders(a, b)
|
||||
return self._get_next_compare(next_compare)(a, b, next_compare + 1)
|
||||
return b_score - a_score
|
||||
|
||||
def _compare_seeders(self, a: RankSource, b: RankSource) -> int:
|
||||
def _compare_indexer(self, a: RankSource, b: RankSource, next_compare: int) -> int:
    """Compare two sources by the rank of the indexer they came from.

    Ranks are looked up with ``quality_profile.calculate_indexer_rank``.
    Equal ranks delegate to the step at index ``next_compare``.
    """
    rank_of = self.quality_profile.calculate_indexer_rank
    a_rank = rank_of(a.source.indexer_id)
    b_rank = rank_of(b.source.indexer_id)
    if a_rank != b_rank:
        return a_rank - b_rank

    following = self._get_next_compare(next_compare)
    return following(a, b, next_compare + 1)
|
||||
|
||||
def _compare_title(self, a: RankSource, b: RankSource, next_compare: int) -> int:
    """Prefer the source whose title contains the requested book title.

    Matching is delegated to ``exists_in_title`` using the profile's
    ``title_exists_ratio``. If both sources match or both miss, the step at
    index ``next_compare`` decides.
    """
    wanted = self.book.title
    threshold = self.quality_profile.title_exists_ratio
    a_hit = exists_in_title(wanted, a.source.title, threshold)
    b_hit = exists_in_title(wanted, b.source.title, threshold)
    if a_hit != b_hit:
        # Negative when only `a` matches, positive when only `b` does.
        return int(b_hit) - int(a_hit)

    following = self._get_next_compare(next_compare)
    return following(a, b, next_compare + 1)
|
||||
|
||||
def _compare_subtitle(self, a: RankSource, b: RankSource, next_compare: int) -> int:
    """Prefer the source whose title contains the requested book's subtitle.

    Skipped entirely when the book has no subtitle. Otherwise matching uses
    ``exists_in_title`` with the profile's ``title_exists_ratio``. When the
    step cannot separate the sources, the step at index ``next_compare``
    decides.
    """
    wanted = self.book.subtitle
    if wanted:
        threshold = self.quality_profile.title_exists_ratio
        a_hit = exists_in_title(wanted, a.source.title, threshold)
        b_hit = exists_in_title(wanted, b.source.title, threshold)
        if a_hit != b_hit:
            # Negative when only `a` matches, positive when only `b` does.
            return int(b_hit) - int(a_hit)

    following = self._get_next_compare(next_compare)
    return following(a, b, next_compare + 1)
|
||||
|
||||
def _compare_authors(self, a: RankSource, b: RankSource, next_compare: int) -> int:
    """Prefer the source whose title mentions more of the book's authors.

    Counts come from ``vaguely_exist_in_title`` with the profile's
    ``name_exists_ratio``. Equal counts delegate to the step at index
    ``next_compare``.
    """
    authors = self.book.authors
    threshold = self.quality_profile.name_exists_ratio
    a_count = vaguely_exist_in_title(authors, a.source.title, threshold)
    b_count = vaguely_exist_in_title(authors, b.source.title, threshold)
    if a_count != b_count:
        # A higher count for `a` gives a negative result.
        return b_count - a_count

    following = self._get_next_compare(next_compare)
    return following(a, b, next_compare + 1)
|
||||
|
||||
def _compare_narrators(
    self, a: RankSource, b: RankSource, next_compare: int
) -> int:
    """Prefer the source whose title mentions more of the book's narrators.

    Counts come from ``vaguely_exist_in_title`` with the profile's
    ``name_exists_ratio``. Equal counts delegate to the step at index
    ``next_compare``.
    """
    narrators = self.book.narrators
    threshold = self.quality_profile.name_exists_ratio
    a_count = vaguely_exist_in_title(narrators, a.source.title, threshold)
    b_count = vaguely_exist_in_title(narrators, b.source.title, threshold)
    if a_count != b_count:
        # A higher count for `a` gives a negative result.
        return b_count - a_count

    following = self._get_next_compare(next_compare)
    return following(a, b, next_compare + 1)
|
||||
|
||||
def _compare_seeders(self, a: RankSource, b: RankSource, next_compare: int) -> int:
    """Prefer the source with more seeders.

    Equal seeder counts delegate to the step at index ``next_compare``.
    """
    a_seeders = a.source.seeders
    b_seeders = b.source.seeders
    if a_seeders != b_seeders:
        # More seeders on `a` gives a negative result.
        return b_seeders - a_seeders

    following = self._get_next_compare(next_compare)
    return following(a, b, next_compare + 1)
|
||||
|
||||
|
||||
def add_scores(book: BookRequest, a: RankSource) -> int:
|
||||
score = 0
|
||||
if book.title not in a.source.title:
|
||||
score -= 100
|
||||
def vaguely_exist_in_title(words: list[str], title: str, name_exists_ratio: int) -> int:
    """Count how many of ``words`` fuzzily appear in ``title``.

    A word counts when its ``fuzz.token_set_ratio`` against the title,
    after ``utils.default_process`` preprocessing, is strictly greater than
    ``name_exists_ratio``.
    """
    matches = 0
    for word in words:
        similarity = fuzz.token_set_ratio(
            word, title, processor=utils.default_process
        )
        if similarity > name_exists_ratio:
            matches += 1
    return matches
|
||||
|
||||
for author in book.authors:
|
||||
if author in a.source.title:
|
||||
score += 10
|
||||
|
||||
for narrator in book.narrators:
|
||||
if narrator not in book.authors and narrator in a.source.title:
|
||||
score += 20
|
||||
|
||||
return score
|
||||
def exists_in_title(word: str, title: str, title_exists_ratio: int) -> bool:
    """Report whether ``word`` fuzzily occurs inside ``title``.

    True when ``fuzz.partial_ratio`` of the two strings, after
    ``utils.default_process`` preprocessing, is strictly greater than
    ``title_exists_ratio``.
    """
    similarity = fuzz.partial_ratio(word, title, processor=utils.default_process)
    return similarity > title_exists_ratio
|
||||
|
||||
+13
-12
@@ -1,31 +1,32 @@
|
||||
import math
|
||||
from typing import Literal
|
||||
import pydantic
|
||||
|
||||
FileFormat = Literal["flac", "m4b", "mp3", "unknown"]
|
||||
|
||||
FileFormat = Literal["flac", "m4b", "mp3", "unknown-audio", "unknown"]
|
||||
|
||||
|
||||
class QualityProfile(pydantic.BaseModel):
|
||||
FLAC: tuple[float, float] = (0, math.inf)
|
||||
M4B: tuple[float, float] = (0, math.inf)
|
||||
MP3: tuple[float, float] = (0, math.inf)
|
||||
UNKNOWN: tuple[float, float] = (0, math.inf)
|
||||
FLAC: tuple[float, float] = (20.0, 400.0)
|
||||
M4B: tuple[float, float] = (20.0, 400.0)
|
||||
MP3: tuple[float, float] = (20.0, 400.0)
|
||||
UNKNOWN_AUDIO: tuple[float, float] = (20.0, 400.0)
|
||||
UNKNOWN: tuple[float, float] = (20.0, 400.0)
|
||||
|
||||
flags: list[tuple[str, int]] = []
|
||||
|
||||
format_order: list[FileFormat] = ["flac", "m4b", "mp3", "unknown"]
|
||||
indexer_flags: list[tuple[str, int]] = []
|
||||
format_order: list[FileFormat] = ["flac", "m4b", "mp3", "unknown-audio", "unknown"]
|
||||
"""Order of file formats from highest to lowest quality"""
|
||||
|
||||
indexer_order: list[int] = []
|
||||
"""Order of indexers from highest to lowest quality"""
|
||||
name_exists_ratio: int = 75
|
||||
title_exists_ratio: int = 90
|
||||
|
||||
def get_quality_rank(self, file_format: FileFormat) -> int:
|
||||
def calculate_quality_rank(self, file_format: FileFormat) -> int:
|
||||
try:
|
||||
return self.format_order.index(file_format)
|
||||
except ValueError:
|
||||
return len(self.format_order)
|
||||
|
||||
def get_indexer_rank(self, indexer_id: int) -> int:
|
||||
def calculate_indexer_rank(self, indexer_id: int) -> int:
|
||||
try:
|
||||
return self.indexer_order.index(indexer_id)
|
||||
except ValueError:
|
||||
|
||||
@@ -12,6 +12,10 @@ import os
|
||||
from app.models import BookRequest, ProwlarrSource
|
||||
from app.util.ranking.quality import FileFormat
|
||||
|
||||
# HACK: Disabled because it doesn't work well with rate limiting
|
||||
# We instead completely rely on the title and size of the complete torrent
|
||||
ENABLE_TORRENT_INSPECTION = False
|
||||
|
||||
|
||||
class Quality(pydantic.BaseModel):
|
||||
kbits: float
|
||||
@@ -77,9 +81,11 @@ async def extract_qualities(
|
||||
raise ValueError("Prowlarr API key not set")
|
||||
|
||||
book_seconds = book.runtime_length_min * 60
|
||||
if book_seconds == 0:
|
||||
return []
|
||||
|
||||
data = None
|
||||
if source.download_url:
|
||||
if source.download_url and ENABLE_TORRENT_INSPECTION:
|
||||
try:
|
||||
for _ in range(3):
|
||||
async with client_session.get(
|
||||
@@ -93,45 +99,63 @@ async def extract_qualities(
|
||||
else:
|
||||
return []
|
||||
except aiohttp.NonHttpUrlRedirectClientError as e:
|
||||
print(e.args) # tuple. first element is a magnet link
|
||||
return []
|
||||
source.magnet_url = e.args[0]
|
||||
source.download_url = None
|
||||
|
||||
if not data:
|
||||
return []
|
||||
# TODO: correctly fix wrong torrent parsing
|
||||
parsed = tp.decode(data, hash_fields={"pieces": (1, False)})
|
||||
actual_sizes: dict[FileFormat, int] = defaultdict(int)
|
||||
file_formats = set()
|
||||
if "info" not in parsed or "files" not in parsed["info"]:
|
||||
return []
|
||||
for f in parsed["info"]["files"]:
|
||||
size: int = f["length"]
|
||||
path: str = f["path"][-1]
|
||||
_, ext = os.path.splitext(path)
|
||||
ext = ext.lower()
|
||||
if ext == ".flac":
|
||||
file_formats.add("flac")
|
||||
actual_sizes["flac"] += size
|
||||
elif ext == ".m4b":
|
||||
file_formats.add("m4b")
|
||||
actual_sizes["m4b"] += size
|
||||
elif ext == ".mp3":
|
||||
file_formats.add("mp3")
|
||||
actual_sizes["mp3"] += size
|
||||
elif ext in audio_file_formats:
|
||||
file_formats.add("unknown")
|
||||
actual_sizes["unknown"] += size
|
||||
|
||||
qualities = []
|
||||
for k, v in actual_sizes.items():
|
||||
qualities.append(
|
||||
Quality(
|
||||
kbits=v / (book_seconds * 60) / 1024 * 8,
|
||||
file_format=k,
|
||||
)
|
||||
)
|
||||
return qualities
|
||||
if data:
|
||||
return get_torrent_info(data, book_seconds)
|
||||
|
||||
# TODO: use the magnet url to fetch the file information
|
||||
|
||||
return []
|
||||
file_format: FileFormat = "unknown"
|
||||
if "mp3" in source.title.lower():
|
||||
file_format = "mp3"
|
||||
elif "flac" in source.title.lower():
|
||||
file_format = "flac"
|
||||
elif "m4b" in source.title.lower():
|
||||
file_format = "m4b"
|
||||
elif "audiobook" in source.title.lower():
|
||||
file_format = "unknown-audio"
|
||||
|
||||
return [
|
||||
Quality(kbits=8 * source.size / book_seconds / 1000, file_format=file_format)
|
||||
]
|
||||
|
||||
|
||||
def get_torrent_info(data: bytes, book_seconds: int) -> list[Quality]:
    """Derive one ``Quality`` per audio format from a torrent's file list.

    The torrent is decoded with ``tp.decode``; file sizes are summed per
    format based on the extension of each file's last path component, and
    each total is converted to a rate as ``8 * size / book_seconds / 1000``.

    Args:
        data: Raw contents of the torrent file.
        book_seconds: Runtime of the book in seconds.

    Returns:
        One ``Quality`` per format found, in first-seen order. Empty when
        the runtime is not positive, the data cannot be decoded, or the
        decoded torrent has no ``info``/``files`` entry.
    """
    if book_seconds <= 0:
        # A rate cannot be computed without a runtime; avoid dividing by zero.
        return []

    try:
        # TODO: correctly fix wrong torrent parsing
        parsed = tp.decode(data, hash_fields={"pieces": (1, False)})
    except tp.InvalidTorrentDataException:
        return []

    if "info" not in parsed or "files" not in parsed["info"]:
        return []

    # Extensions that map to a dedicated format. Any other extension listed
    # in `audio_file_formats` is accounted under "unknown".
    known_formats: dict[str, FileFormat] = {
        ".flac": "flac",
        ".m4b": "m4b",
        ".mp3": "mp3",
    }

    actual_sizes: dict[FileFormat, int] = defaultdict(int)
    for f in parsed["info"]["files"]:
        size: int = f["length"]
        path: str = f["path"][-1]
        ext = os.path.splitext(path)[1].lower()
        if ext in known_formats:
            actual_sizes[known_formats[ext]] += size
        elif ext in audio_file_formats:
            actual_sizes["unknown"] += size

    return [
        Quality(kbits=8 * total / book_seconds / 1000, file_format=file_format)
        for file_format, total in actual_sizes.items()
    ]
|
||||
|
||||
@@ -45,6 +45,7 @@ pytailwindcss==0.2.0
|
||||
python-dotenv==1.0.1
|
||||
python-multipart==0.0.20
|
||||
PyYAML==6.0.2
|
||||
RapidFuzz==3.12.1
|
||||
rich==13.9.4
|
||||
rich-toolkit==0.13.2
|
||||
shellingham==1.5.4
|
||||
|
||||
@@ -38,6 +38,7 @@
|
||||
<th></th>
|
||||
<th>title</th>
|
||||
<th>indexer</th>
|
||||
<th>flags</th>
|
||||
<th>seed / leech</th>
|
||||
<th>size (MB)</th>
|
||||
<th>publish date</th>
|
||||
@@ -55,7 +56,7 @@
|
||||
<a href="{{ source.info_url }}" class="link">{{ source.title }}</a>
|
||||
</td>
|
||||
<td>{{ indexers[source.indexer_id].name }}</td>
|
||||
|
||||
<td>{{ source.indexer_flags|join(', ') }}</td>
|
||||
<td>{{ source.seeders }} / {{ source.leechers }}</td>
|
||||
<td>{{ source.size_MB }}</td>
|
||||
<td>{{ source.publish_date.strftime("%d. %b %Y") }}</td>
|
||||
|
||||
Reference in New Issue
Block a user