Merge pull request #75 from markbeep/indexer-abstraction

Indexer abstraction
This commit is contained in:
Mark
2025-03-21 18:09:23 +01:00
committed by GitHub
18 changed files with 669 additions and 34 deletions

View File

@@ -0,0 +1,59 @@
from abc import ABC, abstractmethod
from typing import Any, Generic, TypeVar
from aiohttp import ClientSession
from pydantic import BaseModel
from sqlmodel import Session
from app.internal.indexers.configuration import Configurations
from app.internal.models import BookRequest, ProwlarrSource
class SessionContainer(BaseModel, arbitrary_types_allowed=True):
    """Bundle of the two session objects that are handed to every indexer call."""

    # NOTE(review): `arbitrary_types_allowed` is presumably required because
    # neither field type is itself a pydantic model -- confirm.
    # Database session; passed on to the configuration lookup.
    session: Session
    # aiohttp client session used by indexers for outgoing HTTP requests.
    client_session: ClientSession
T = TypeVar("T", bound=Configurations)
class AbstractIndexer(ABC, Generic[T]):
    """
    Interface every indexer integration has to implement.

    `T` is the `Configurations` subclass describing the options
    of the concrete indexer.
    """

    # Display name of the indexer.
    name: str

    @abstractmethod
    async def setup(
        self,
        request: BookRequest,
        container: SessionContainer,
        configurations: Any,
    ) -> None:
        """
        Called initially when a book request is made.

        Use it to prepare whatever the indexer needs for the
        request. An indexer that only supports a general search
        feature can already execute that search in this step.
        """

    @staticmethod
    @abstractmethod
    async def get_configurations(
        container: SessionContainer,
    ) -> T:
        """
        Return the configuration object whose options will be
        configurable on the frontend.
        """

    @abstractmethod
    async def is_matching_source(
        self, source: ProwlarrSource, container: SessionContainer
    ) -> bool:
        """
        Tell whether `source` belongs to this indexer, i.e. whether
        `edit_source_metadata` should be called for it.
        """

    @abstractmethod
    async def edit_source_metadata(
        self, source: ProwlarrSource, container: SessionContainer
    ) -> None:
        """
        Add indexer specific metadata to `source`.
        The source is modified in place; nothing is returned.
        """

View File

@@ -0,0 +1,116 @@
import logging
from typing import Any, Generic, Optional, TypeVar
from pydantic import BaseModel
from sqlmodel import Session
from app.util.cache import StringConfigCache
logger = logging.getLogger(__name__)
T = TypeVar("T", str, int, bool, float, None)
class IndexerConfiguration(BaseModel, Generic[T]):
    """Description of a single configurable option of an indexer."""

    # Label shown for the option.
    display_name: str
    # Optional explanatory text for the option.
    description: Optional[str] = None
    # Value used when nothing has been stored for the option.
    default: Optional[T] = None
    # Whether a value has to be present for the indexer to be usable.
    required: bool = False
    # Python type the stored string value is converted to.
    type: type[T]

    # The `is_*` helpers compare by identity, so `bool` is not treated as `int`.

    def is_str(self) -> bool:
        """Return True if the option holds a string."""
        return self.type is str

    def is_float(self) -> bool:
        """Return True if the option holds a float."""
        return self.type is float

    def is_int(self) -> bool:
        """Return True if the option holds an integer."""
        return self.type is int

    def is_bool(self) -> bool:
        """Return True if the option holds a boolean."""
        return self.type is bool
class Configurations(BaseModel):
    """
    Base class for the set of options an indexer exposes.

    Every field of type `IndexerConfiguration` gets an input
    field generated on the frontend, and its value is handed to
    the indexer's setup method as part of a
    `ValuedConfigurations` object.
    """
class ValuedConfigurations:
    """
    Plain attribute container holding the resolved configuration values.

    The field names have to match the fields of the corresponding
    `Configurations` object and need to be unique across all indexers.
    """
class ConfigurationException(ValueError):
    """Base class for all errors raised while resolving indexer configurations."""


class MissingRequiredException(ConfigurationException):
    """A required configuration has neither a stored value nor a default."""


class InvalidTypeException(ConfigurationException):
    """A stored configuration value cannot be converted to its declared type."""
# Module-level cache through which indexer configuration values are looked up
# by their field name (see `create_valued_configuration`).
indexer_configuration_cache = StringConfigCache[str]()
def create_valued_configuration(
    config: Configurations,
    session: Session,
    *,
    check_required: bool = True,
) -> ValuedConfigurations:
    """
    Resolve the values of all `IndexerConfiguration` fields of `config`.

    For every such field the stored value is fetched from the
    cache/db, the default is used if nothing is stored, and the
    result is converted to the declared type and set as an
    attribute of the same name on the returned object.
    Fields that are not an `IndexerConfiguration` are skipped.

    Args:
        config: The configuration description of an indexer.
        session: Database session used for the value lookup.
        check_required: If True, a required field without a value
            raises instead of being set to `None`.

    Returns:
        A `ValuedConfigurations` object with one attribute per
        resolved field.

    Raises:
        MissingRequiredException: A required field has no value
            and `check_required` is True.
        InvalidTypeException: A value can't be converted to an
            int or float.
    """
    valued = ValuedConfigurations()

    for key, field in vars(config).items():
        if not isinstance(field, IndexerConfiguration):
            logger.debug("Skipping %s", key)
            continue
        spec: IndexerConfiguration[Any] = field

        raw_value = indexer_configuration_cache.get(session, key)
        # A cleared number input is stored as an empty string. Treat it as
        # "not set" instead of failing the conversion, otherwise the indexer
        # could never be loaded again to correct the value.
        if raw_value == "" and (spec.type is int or spec.type is float):
            raw_value = None
        if raw_value is None:
            raw_value = spec.default

        if check_required and spec.required and raw_value is None:
            raise MissingRequiredException(f"Configuration {key} is required")

        if raw_value is None:
            setattr(valued, key, None)
        elif spec.type is str:
            setattr(valued, key, raw_value)
        elif spec.type is int:
            try:
                setattr(valued, key, int(raw_value))
            except (TypeError, ValueError) as err:
                raise InvalidTypeException(
                    f"Configuration {key} must be an integer"
                ) from err
        elif spec.type is float:
            try:
                setattr(valued, key, float(raw_value))
            except (TypeError, ValueError) as err:
                raise InvalidTypeException(
                    f"Configuration {key} must be a float"
                ) from err
        elif spec.type is bool:
            # Stored booleans are strings; any non-empty string is truthy.
            setattr(valued, key, bool(raw_value))
        else:
            # No attribute is set for types that are not handled above.
            logger.warning(
                "Unsupported configuration type %s for %s", spec.type, key
            )

    return valued

View File

@@ -0,0 +1,50 @@
import logging
from typing import Any, cast
from pydantic import BaseModel
from app.internal.indexers.indexers import indexers
from app.internal.indexers.abstract import AbstractIndexer, SessionContainer
from app.internal.indexers.configuration import (
ConfigurationException,
IndexerConfiguration,
ValuedConfigurations,
create_valued_configuration,
)
logger = logging.getLogger(__name__)
class IndexerContext(BaseModel, arbitrary_types_allowed=True):
    """An indexer instance together with its configuration description and values."""

    # Instance of the indexer class.
    indexer: AbstractIndexer[Any]
    # The `IndexerConfiguration` fields of the indexer, keyed by field name.
    configuration: dict[str, IndexerConfiguration[Any]]
    # Resolved values for the fields in `configuration`.
    valued: ValuedConfigurations
async def get_indexer_contexts(
    container: SessionContainer, *, check_required: bool = True
) -> list[IndexerContext]:
    """
    Build an `IndexerContext` for every registered indexer.

    An indexer whose configuration can't be resolved (a
    `ConfigurationException` is raised) is logged and left out
    of the result.

    Args:
        container: Sessions passed on to the indexers and used
            for the configuration lookup.
        check_required: Forwarded to `create_valued_configuration`.
    """
    collected: list[IndexerContext] = []

    for indexer_cls in indexers:
        try:
            declared = await indexer_cls.get_configurations(container)

            # Only the `IndexerConfiguration` fields are of interest.
            declared_fields = cast(dict[str, Any], vars(declared))
            config_fields: dict[str, IndexerConfiguration[Any]] = {
                field_name: field
                for field_name, field in declared_fields.items()
                if isinstance(field, IndexerConfiguration)
            }

            resolved = create_valued_configuration(
                declared,
                container.session,
                check_required=check_required,
            )

            context = IndexerContext(
                indexer=indexer_cls(),
                configuration=config_fields,
                valued=resolved,
            )
            collected.append(context)
        except ConfigurationException as e:
            logger.error(
                "Failed to get configurations for %s: %s", indexer_cls.name, e
            )

    return collected

View File

@@ -0,0 +1,8 @@
from typing import Any
from app.internal.indexers.abstract import AbstractIndexer
from app.internal.indexers.mam import MamIndexer
# Registry of all available indexer classes. Classes (not instances) are
# listed here; an indexer has to be added to this list to be picked up.
indexers: list[type[AbstractIndexer[Any]]] = [
    MamIndexer,
]

View File

@@ -0,0 +1,137 @@
import json
import logging
from typing import Any
from urllib.parse import urlencode, urljoin
from app.internal.indexers.abstract import (
AbstractIndexer,
SessionContainer,
)
from app.internal.indexers.configuration import (
Configurations,
IndexerConfiguration,
ValuedConfigurations,
)
from app.internal.models import (
BookRequest,
ProwlarrSource,
)
logger = logging.getLogger(__name__)
class MamConfigurations(Configurations):
    """Configuration options of the MyAnonamouse (MAM) indexer."""

    # Sent as the `mam_id` cookie when querying MAM. Required, no default.
    mam_session_id: IndexerConfiguration[str] = IndexerConfiguration(
        type=str,
        display_name="MAM Session ID",
        required=True,
    )
    # Allows turning the indexer off; it is active by default.
    mam_active: IndexerConfiguration[bool] = IndexerConfiguration(
        type=bool,
        display_name="MAM Active",
        default=True,
    )
class ValuedMamConfigurations(ValuedConfigurations):
    """Typed view of the resolved values for `MamConfigurations`."""

    # Field names mirror the fields of `MamConfigurations`.
    mam_session_id: str
    mam_active: bool
class MamIndexer(AbstractIndexer[MamConfigurations]):
    """
    Indexer for MyAnonamouse (MAM).

    `setup` runs a search on MAM for the requested book and
    remembers the results. `edit_source_metadata` then uses
    those results to add authors, narrators, freeleech/vip
    flags and the filetype to matching sources.
    """

    name = "MyAnonamouse"
    # Search results keyed by the MAM torrent id (as string).
    # The class-level dict only serves as an empty fallback and is never
    # mutated: `setup` binds a fresh dict on the instance, so results are
    # not shared or accumulated across instances.
    results: dict[str, dict[str, Any]] = dict()

    @staticmethod
    async def get_configurations(
        container: SessionContainer,
    ) -> MamConfigurations:
        """Return the configuration options of the MAM indexer."""
        return MamConfigurations()

    async def setup(
        self,
        request: BookRequest,
        container: SessionContainer,
        configurations: ValuedMamConfigurations,
    ) -> None:
        """
        Search MAM for the requested book and store the results.

        Does nothing if the indexer is deactivated. A failed
        query is logged and leaves the results empty.
        """
        # Start with an empty, instance-local result set.
        self.results = {}

        if not configurations.mam_active:
            return

        query = request.title + " " + " ".join(request.authors)
        params: dict[str, Any] = {
            "tor[text]": query,  # book title + author(s)
            "tor[main_cat]": [13],  # MAM audiobook category
            "tor[searchIn]": "torrents",
            "tor[srchIn][author]": "true",
            "tor[srchIn][title]": "true",
            "tor[searchType]": "active",  # only search for torrents with at least 1 seeder.
            "startNumber": 0,
            "perpage": 100,
        }
        url = urljoin(
            "https://www.myanonamouse.net",
            f"/tor/js/loadSearchJSONbasic.php?{urlencode(params, doseq=True)}",
        )
        session_id = configurations.mam_session_id

        logger.info("Mam: Querying: %s", url)
        async with container.client_session.get(
            url, cookies={"mam_id": session_id}
        ) as response:
            if not response.ok:
                # `text` is a coroutine method, it has to be awaited to get the body
                logger.error("Mam: Failed to query: %s", await response.text())
                return
            search_results = await response.json()

        # A payload without a "data" list is treated as "no results".
        data = search_results.get("data") if isinstance(search_results, dict) else None
        for result in data or []:
            self.results[str(result["id"])] = result
        logger.info("Mam: Retrieved %d results", len(self.results))

    async def is_matching_source(
        self,
        source: ProwlarrSource,
        container: SessionContainer,
    ) -> bool:
        """Return True if the info url of the source points to a MAM torrent page."""
        return source.info_url is not None and source.info_url.startswith(
            "https://www.myanonamouse.net/t/"
        )

    async def edit_source_metadata(
        self,
        source: ProwlarrSource,
        container: SessionContainer,
    ) -> None:
        """
        Add authors, narrators, flags and filetype to `source`.

        The last path segment of the source guid is used as the
        MAM id to find the search result from `setup`. The source
        is left untouched if there is no result for it.
        """
        mam_id = source.guid.split("/")[-1]
        result = self.results.get(mam_id)
        if result is None:
            return

        # response type of authors and narrators is a stringified json object
        source.book_metadata.authors = self._parse_people(result.get("author_info"))
        source.book_metadata.narrators = self._parse_people(
            result.get("narrator_info")
        )

        # dict keys are used as an ordered set: existing flags keep their order
        indexer_flags: dict[str, None] = dict.fromkeys(source.indexer_flags)
        for free_flag in ("personal_freeleech", "free", "fl_vip"):
            if result.get(free_flag) == 1:
                indexer_flags[free_flag] = None
                indexer_flags["freeleech"] = None
        if result.get("vip") == 1:
            indexer_flags["vip"] = None
        source.indexer_flags = list(indexer_flags)

        source.book_metadata.filetype = result.get("filetype")

    @staticmethod
    def _parse_people(raw: Any) -> list[str]:
        """Parse a stringified json object of people and return its values."""
        if not raw:
            return []
        try:
            parsed = json.loads(raw)
        except (TypeError, ValueError) as e:
            logger.warning("Mam: Failed to parse people info %r: %s", raw, e)
            return []
        if not isinstance(parsed, dict):
            return []
        return list(parsed.values())

View File

@@ -133,6 +133,16 @@ class ManualBookRequest(BaseModel, table=True):
arbitrary_types_allowed = True
class BookMetadata(BaseModel):
    """extra metadata that can be added to sources to better rank them"""

    # All fields are optional/empty by default; they are only filled in
    # when something provides the information for a source.
    title: Optional[str] = None
    subtitle: Optional[str] = None
    # Names of the authors / narrators of the source.
    authors: list[str] = []
    narrators: list[str] = []
    filetype: Optional[str] = None
class BaseSource(BaseModel):
guid: str
indexer_id: int
@@ -145,6 +155,8 @@ class BaseSource(BaseModel):
download_url: Optional[str] = None
magnet_url: Optional[str] = None
book_metadata: BookMetadata = BookMetadata()
@property
def size_MB(self):
return round(self.size / 1e6, 1)

View File

@@ -1,20 +1,23 @@
import json
import logging
from datetime import datetime
import posixpath
from datetime import datetime
from typing import Any, Literal, Optional
from urllib.parse import urlencode
from aiohttp import ClientResponse, ClientSession
from sqlmodel import Session
from app.internal.indexers.abstract import SessionContainer
from app.internal.models import (
BookRequest,
EventEnum,
ProwlarrSource,
TorrentSource,
UsenetSource,
)
from app.internal.notifications import send_all_notifications
from app.internal.prowlarr.source_metadata import edit_source_metadata
from app.util.cache import SimpleCache, StringConfigCache
logger = logging.getLogger(__name__)
@@ -127,12 +130,11 @@ async def start_download(
async def query_prowlarr(
session: Session,
client_session: ClientSession,
query: Optional[str],
book_request: BookRequest,
indexer_ids: Optional[list[int]] = None,
force_refresh: bool = False,
) -> list[ProwlarrSource]:
if not query:
return []
query = book_request.title
base_url = prowlarr_config.get_base_url(session)
api_key = prowlarr_config.get_api_key(session)
@@ -217,6 +219,10 @@ async def query_prowlarr(
except KeyError as e:
logger.error("Failed to parse source: %s. KeyError: %s", result, e)
# add additional metadata using any available indexers
container = SessionContainer(session=session, client_session=client_session)
await edit_source_metadata(book_request, sources, container)
prowlarr_source_cache.set(sources, query)
return sources

View File

@@ -0,0 +1,39 @@
import asyncio
import logging
from types import CoroutineType
from typing import Any
from app.internal.indexers.abstract import SessionContainer
from app.internal.indexers.indexer_util import get_indexer_contexts
from app.internal.models import BookRequest, ProwlarrSource
logger = logging.getLogger(__name__)
async def edit_source_metadata(
    book_request: BookRequest,
    sources: list[ProwlarrSource],
    container: SessionContainer,
) -> None:
    """
    Let the available indexers add metadata to the given sources.

    All indexers are set up concurrently for the book request.
    Afterwards every source is handed to the first indexer that
    reports it as matching. Failures of individual indexers are
    logged and do not abort the remaining work.

    Args:
        book_request: The book the sources were searched for.
        sources: The sources to edit. They are modified in place.
        container: Sessions passed on to the indexers.
    """
    contexts = await get_indexer_contexts(container)

    setup_outcomes = await asyncio.gather(
        *(
            context.indexer.setup(book_request, container, context.valued)
            for context in contexts
        ),
        return_exceptions=True,
    )
    for outcome in setup_outcomes:
        # With return_exceptions=True the list mixes return values and
        # exceptions, so check the type instead of the truthiness.
        if isinstance(outcome, BaseException):
            logger.error("Failed to setup indexer: %s", outcome)

    edit_coros: list[CoroutineType[Any, Any, None]] = []
    for source in sources:
        for context in contexts:
            if await context.indexer.is_matching_source(source, container):
                edit_coros.append(
                    context.indexer.edit_source_metadata(source, container)
                )
                # only the first matching indexer edits a source
                break

    edit_outcomes = await asyncio.gather(*edit_coros, return_exceptions=True)
    for outcome in edit_outcomes:
        if isinstance(outcome, BaseException):
            logger.error("Failed to edit source metadata: %s", outcome)

View File

@@ -53,15 +53,12 @@ async def query_sources(
if not book:
raise HTTPException(status_code=500, detail="Book asin error")
query = book.title + " " + " ".join(book.authors)
sources = await query_prowlarr(
session,
client_session,
query,
book,
force_refresh=force_refresh,
)
ranked = await rank_sources(session, client_session, sources, book)
# start download if requested

View File

@@ -178,15 +178,29 @@ class CompareSource:
return int(b_title) - int(a_title)
def _compare_authors(self, a: RankSource, b: RankSource, next_compare: int) -> int:
a_score = vaguely_exist_in_title(
self.book.authors,
a.source.title,
quality_config.get_name_exists_ratio(self.session),
a_score = max(
vaguely_exist_in_title(
self.book.authors,
a.source.title,
quality_config.get_name_exists_ratio(self.session),
),
fuzzy_author_narrator_match(
a.source.book_metadata.authors,
self.book.authors,
quality_config.get_name_exists_ratio(self.session),
),
)
b_score = vaguely_exist_in_title(
self.book.authors,
b.source.title,
quality_config.get_name_exists_ratio(self.session),
b_score = max(
vaguely_exist_in_title(
self.book.authors,
b.source.title,
quality_config.get_name_exists_ratio(self.session),
),
fuzzy_author_narrator_match(
b.source.book_metadata.authors,
self.book.authors,
quality_config.get_name_exists_ratio(self.session),
),
)
if a_score == b_score:
return self._get_next_compare(next_compare)(a, b, next_compare + 1)
@@ -195,15 +209,29 @@ class CompareSource:
def _compare_narrators(
self, a: RankSource, b: RankSource, next_compare: int
) -> int:
a_score = vaguely_exist_in_title(
self.book.narrators,
a.source.title,
quality_config.get_name_exists_ratio(self.session),
a_score = max(
vaguely_exist_in_title(
self.book.narrators,
a.source.title,
quality_config.get_name_exists_ratio(self.session),
),
fuzzy_author_narrator_match(
a.source.book_metadata.narrators,
self.book.narrators,
quality_config.get_name_exists_ratio(self.session),
),
)
b_score = vaguely_exist_in_title(
self.book.narrators,
b.source.title,
quality_config.get_name_exists_ratio(self.session),
b_score = max(
vaguely_exist_in_title(
self.book.narrators,
b.source.title,
quality_config.get_name_exists_ratio(self.session),
),
fuzzy_author_narrator_match(
b.source.book_metadata.narrators,
self.book.narrators,
quality_config.get_name_exists_ratio(self.session),
),
)
if a_score == b_score:
return self._get_next_compare(next_compare)(a, b, next_compare + 1)
@@ -226,6 +254,28 @@ class CompareSource:
return int((b.source.publish_date - a.source.publish_date).total_seconds())
def fuzzy_author_narrator_match(
    source_people: list[str], book_people: list[str], name_exists_ratio: int
) -> int:
    """
    Count how many people of the book are fuzzily found among the people of the source.

    A person of the book counts as found if the best
    `token_set_ratio` against any person of the source is
    strictly greater than `name_exists_ratio`.
    Returns 0 if either list is empty.
    """
    if not (source_people and book_people):
        return 0

    def best_ratio(book_person: str):
        # highest ratio of this person against all people of the source
        ratios = [
            fuzz.token_set_ratio(
                book_person, candidate, processor=utils.default_process
            )
            for candidate in source_people
        ]
        return max(0, *ratios)

    # Only count matches above threshold
    return sum(1 for person in book_people if best_ratio(person) > name_exists_ratio)
def vaguely_exist_in_title(words: list[str], title: str, name_exists_ratio: int) -> int:
return sum(
1

View File

@@ -16,6 +16,9 @@ from app.internal.auth.authentication import (
from app.internal.auth.config import LoginTypeEnum, auth_config
from app.internal.auth.oidc_config import oidc_config
from app.internal.env_settings import Settings
from app.internal.indexers.abstract import SessionContainer
from app.internal.indexers.configuration import indexer_configuration_cache
from app.internal.indexers.indexer_util import IndexerContext, get_indexer_contexts
from app.internal.models import EventEnum, GroupEnum, Notification, User
from app.internal.notifications import send_notification
from app.internal.prowlarr.indexer_categories import indexer_categories
@@ -703,3 +706,67 @@ async def update_security(
block_name="form",
headers={} if old == login_type else {"HX-Refresh": "true"},
)
@router.get("/indexers")
async def read_indexers(
    request: Request,
    admin_user: Annotated[
        DetailedUser, Depends(get_authenticated_user(GroupEnum.admin))
    ],
    session: Annotated[Session, Depends(get_session)],
    client_session: Annotated[ClientSession, Depends(get_connection)],
):
    """Render the indexers settings page for an admin user."""
    container = SessionContainer(session=session, client_session=client_session)
    # check_required=False: indexers with a missing required value still
    # have to be listed, otherwise the value could not be entered here.
    indexer_contexts = await get_indexer_contexts(container, check_required=False)

    template_context = {
        "page": "indexers",
        "indexers": indexer_contexts,
        "version": Settings().app.version,
    }
    return template_response(
        "settings_page/indexers.html",
        request,
        admin_user,
        template_context,
    )
@router.post("/indexers")
async def update_indexers(
    request: Request,
    admin_user: Annotated[
        DetailedUser, Depends(get_authenticated_user(GroupEnum.admin))
    ],
    indexer_select: Annotated[str, Form()],
    session: Annotated[Session, Depends(get_session)],
    client_session: Annotated[ClientSession, Depends(get_connection)],
):
    """
    Store the submitted configuration values of the selected indexer.

    Only form fields that belong to the configuration of the
    indexer named by `indexer_select` are stored. The result is
    always reported by raising a `ToastException`.

    Raises:
        ToastException: With "error" if the indexer is unknown,
            with "success" once the values have been stored.
    """
    contexts = await get_indexer_contexts(
        SessionContainer(session=session, client_session=client_session),
        check_required=False,
    )

    updated_context: Optional[IndexerContext] = next(
        (context for context in contexts if context.indexer.name == indexer_select),
        None,
    )
    if updated_context is None:
        raise ToastException("Indexer not found", "error")

    form_values = await request.form()
    for key, config in updated_context.configuration.items():
        value = form_values.get(key)
        if config.type is bool:
            # Browsers leave unchecked checkboxes out of the submitted form.
            # A missing key therefore means "off" and has to be stored as
            # such, otherwise the option could never be turned off again.
            indexer_configuration_cache.set(
                session, key, "true" if value == "on" else ""
            )
        elif isinstance(value, str):
            # non-string values (e.g. file uploads) are ignored
            indexer_configuration_cache.set(session, key, value)

    raise ToastException("Indexers updated", "success")

View File

@@ -10,6 +10,8 @@ from app.internal.auth.authentication import DetailedUser
templates = Jinja2Blocks(directory="templates")
templates.env.filters["quote_plus"] = lambda u: quote_plus(u) # pyright: ignore[reportUnknownLambdaType,reportUnknownMemberType,reportUnknownArgumentType]
templates.env.filters["zfill"] = lambda val, num: str(val).zfill(num) # pyright: ignore[reportUnknownLambdaType,reportUnknownMemberType,reportUnknownArgumentType]
# Make the `vars` and `getattr` builtins callable from within templates.
# `getattr` is used by the indexers settings page to read a configuration
# value by its field name.
templates.env.globals["vars"] = vars  # pyright: ignore[reportUnknownMemberType]
templates.env.globals["getattr"] = getattr  # pyright: ignore[reportUnknownMemberType]
@overload

View File

@@ -0,0 +1,4 @@
{# Shared Alpine.js include, so that pages needing it load the same script. #}
<script
  defer
  src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"
></script>

View File

@@ -55,6 +55,13 @@
class="tab {% if page=='security' %}tab-active{% endif %}"
>Security</a
>
<a
preload
href="/settings/indexers"
role="tab"
class="tab {% if page=='indexers' %}tab-active{% endif %}"
>Indexers</a
>
</div>
{% endif %} {% block content %} {% endblock %}
</main>

View File

@@ -0,0 +1,83 @@
{% extends "settings_page/base.html" %} {% block head %}
<title>Settings - Indexers</title>
{% include 'scripts/alpinejs.html' %} {% endblock %} {% block content %}
{#
  Settings form for the indexers. `indexers` is a list of indexer contexts.
  Only the inputs of the currently selected indexer are rendered (x-if),
  so only those are submitted together with `indexer_select`.
  The list can be empty, which is why the first entry is only accessed
  if there is one.
#}
<main x-data="{ selected: '{{ indexers[0].indexer.name if indexers else '' }}' }">
  {# NOTE(review): the form id looks like a leftover from the password form. #}
  <form
    id="change-password-form"
    class="flex flex-col gap-2"
    hx-post="/settings/indexers"
    hx-target="this"
  >
    <h2 class="text-lg">Indexer Settings</h2>
    <p class="text-sm opacity-60">
      Extra indexer settings that add additional metadata to sources.
    </p>
    <label for="indexer-select">Indexer</label>
    <select
      id="indexer-select"
      name="indexer_select"
      class="select w-full"
      x-model="selected"
    >
      {% for ctxt in indexers %}
      <option value="{{ ctxt.indexer.name }}">{{ ctxt.indexer.name }}</option>
      {% endfor %}
    </select>
    {% for ctxt in indexers %}
    <template x-if="selected === '{{ ctxt.indexer.name }}'">
      <div class="contents">
        {% for key, config in ctxt.configuration.items() %}
        {# Compare against none instead of using `or`, so that 0 is shown. #}
        {% set current = getattr(ctxt.valued, key) %}
        <div>
          <label for="{{ key }}"
            >{{ config.display_name }} {% if config.required %}<span
              class="text-error"
              >*</span
            >{% endif %}</label
          >
          {% if config.description %}
          <p class="text-xs opacity-60">{{ config.description }}</p>
          {% endif %}
        </div>
        {% if config.is_str() %}
        <!-- prettier-ignore -->
        <input
          id="{{ key }}"
          name="{{ key }}"
          type="text"
          class="input w-full"
          value="{{ current if current is not none else "" }}"
          {% if config.required %}required{% endif %}
        />
        {% elif config.is_int() or config.is_float() %}
        {# Without step="any" a number input only accepts whole numbers. #}
        <!-- prettier-ignore -->
        <input
          id="{{ key }}"
          name="{{ key }}"
          type="number"
          {% if config.is_float() %}step="any"{% endif %}
          class="input w-full"
          value="{{ current if current is not none else "" }}"
          {% if config.required %}required{% endif %}
        />
        {% elif config.is_bool() %}
        <!-- prettier-ignore -->
        <input
          id="{{ key }}"
          name="{{ key }}"
          type="checkbox"
          class="checkbox"
          {% if current %} checked {% endif %}
        />
        {% endif %} {% endfor %}
      </div>
    </template>
    {% endfor %}
    <button name="submit" class="btn btn-primary" type="submit">Save</button>
  </form>
</main>
{% endblock %}

View File

@@ -1,10 +1,6 @@
{% extends "settings_page/base.html" %} {% block head %}
<title>Settings - Prowlarr</title>
<script
defer
src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"
></script>
{% endblock %} {% block content %}
{% include 'scripts/alpinejs.html' %} {% endblock %} {% block content %}
<div class="flex flex-col gap-2">
<h2 class="text-lg">Prowlarr</h2>

View File

@@ -1,10 +1,6 @@
{% extends "settings_page/base.html" %} {% block head %}
<title>Settings - Security</title>
<script
defer
src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"
></script>
{% endblock %} {% block content %}
{% include 'scripts/alpinejs.html' %} {% endblock %} {% block content %}
<div class="flex flex-col">
{% block form %}
<form

View File

@@ -28,6 +28,8 @@
<tr>
<th></th>
<th>title</th>
<th>author(s)</th>
<th>narrator(s)</th>
<th>indexer</th>
<th>flags</th>
<th>seed / leech (or grabs)</th>
@@ -48,6 +50,10 @@
>{{ source.title }}</a
>{% else %}{{ source.title }}{% endif %}
</td>
<!-- prettier-ignore -->
<td>{% if source.book_metadata.authors %}{{ source.book_metadata.authors|join(', ') }}{% endif %}</td>
<!-- prettier-ignore -->
<td>{% if source.book_metadata.narrators %}{{ source.book_metadata.narrators|join(', ') }}{% endif %}</td>
<td>{{ source.indexer }}</td>
<td>{{ source.indexer_flags|join(', ') }}</td>
{% if source.protocol == "torrent" %}