add fuzzy matching and other suggestions

2026-01-08 06:29:41 -06:00 · 2025-03-11 22:56:16 +01:00
parent 127c989837
commit a8dfd0e7e6
6 changed files with 226 additions and 183 deletions
--- a/app/internal/indexers/mam.py
+++ b/app/internal/indexers/mam.py
@@ -0,0 +1,154 @@
+import json
+import logging
+from datetime import datetime
+from typing import Any, Literal, Optional
+from urllib.parse import urlencode, urljoin
+
+from aiohttp import ClientSession
+from sqlmodel import Session
+
+from app.internal.models import (
+    TorrentSource,
+    ProwlarrSource,
+)
+from app.util.cache import SimpleCache, StringConfigCache
+
+logger = logging.getLogger(__name__)
+
+
+class MamMisconfigured(ValueError):
+    pass
+
+
+MamConfigKey = Literal["mam_session_id", "mam_source_ttl", "mam_active"]
+
+
+class MamConfig(StringConfigCache[MamConfigKey]):
+    def raise_if_invalid(self, session: Session):
+        if not self.get_session_id(session):
+            raise MamMisconfigured("mam_id not set")
+
+    def is_valid(self, session: Session) -> bool:
+        return (
+            self.get_session_id(session) is not None
+            and self.get_session_id(session) != ""
+        )
+
+    def get_session_id(self, session: Session) -> Optional[str]:
+        return self.get(session, "mam_session_id")
+
+    def set_mam_id(self, session: Session, mam_id: str):
+        self.set(session, "mam_session_id", mam_id)
+
+    def get_source_ttl(self, session: Session) -> int:
+        return self.get_int(session, "mam_source_ttl", 24 * 60 * 60)
+
+    def set_source_ttl(self, session: Session, source_ttl: int):
+        self.set_int(session, "mam_source_ttl", source_ttl)
+
+    def is_active(self, session: Session) -> bool:
+        return self.get(session, "mam_active") == "True"
+
+    def set_active(self, session: Session, state: bool):
+        self.set(session, "mam_active", str(state))
+
+
+mam_config = MamConfig()
+mam_source_cache = SimpleCache[dict[str, TorrentSource]]()
+
+
+def flush_Mam_cache():
+    mam_source_cache.flush()
+
+
+# Downloading is still handled via prowlarr.
+
+
+async def query_mam(
+    session: Session,
+    client_session: ClientSession,
+    query: Optional[str],
+    force_refresh: bool = False,
+) -> dict[str, TorrentSource]:
+    if not query:
+        return dict()
+
+    session_id = mam_config.get_session_id(session)
+    assert session_id is not None
+
+    if not force_refresh:
+        source_ttl = mam_config.get_source_ttl(session)
+        cached_sources = mam_source_cache.get(source_ttl, "mam_" + query)
+        if cached_sources:
+            return cached_sources
+    params: dict[str, Any] = {
+        "tor[text]": query,  # book title + author(s)
+        "tor[main_cat]": [13],  # MAM audiobook category
+        "tor[searchIn]": "torrents",
+        "tor[srchIn][author]": "true",
+        "tor[srchIn][title]": "true",
+        "tor[searchType]": "active",  # only search for torrents with at least 1 seeder.
+        "startNumber": 0,
+        "perpage": 100,
+    }
+
+    base_url = "https://www.myanonamouse.net"
+    url = urljoin(
+        base_url, f"/tor/js/loadSearchJSONbasic.php?{urlencode(params, doseq=True)}"
+    )
+
+    logger.info("Querying Mam: %s", url)
+
+    async with client_session.get(url, cookies={"mam_id": session_id}) as response:
+        search_results = await response.json()
+    # Storing in dict for faster retrieval by guid
+    sources: dict[str, TorrentSource] = dict()
+
+    for result in search_results["data"]:
+        # TODO: reduce to just authors / narrators unless there is a use for the other data.
+        sources.update(
+            {
+                f'https://www.myanonamouse.net/t/{result["id"]}': TorrentSource(
+                    protocol="torrent",
+                    guid=f'https://www.myanonamouse.net/t/{result["id"]}',
+                    indexer_id=-1,  # We don't know MAM's id within prowlarr.
+                    indexer="MyAnonamouse",
+                    title=result["title"],
+                    seeders=result.get("seeders", 0),
+                    leechers=result.get("leechers", 0),
+                    size=-1,
+                    info_url=f'https://www.myanonamouse.net/t/{result["id"]}',
+                    indexer_flags=(
+                        ["freeleech"] if result["personal_freeleech"] == 1 else []
+                    ),  # TODO differentiate between freeleech and VIP freeleech; available flags in result: [free, fl_vip, personal_freeleech]
+                    publish_date=datetime.fromisoformat(result["added"]),
+                    authors=(
+                        list(json.loads(result["author_info"]).values())
+                        if result["author_info"]
+                        else []
+                    ),
+                    narrators=(
+                        list(json.loads(result["narrator_info"]).values())
+                        if result["narrator_info"]
+                        else []
+                    ),
+                )
+            }
+        )
+
+    mam_source_cache.set(sources, "mam_" + query)
+
+    return sources
+
+
+def inject_mam_metadata(
+    prowlarrData: list[ProwlarrSource], mamData: dict[str, TorrentSource]
+) -> list[ProwlarrSource]:
+    for p in prowlarrData:
+        m = mamData.get(p.guid)
+        if m is None:
+            continue
+        p.authors = m.authors
+        p.narrators = m.narrators
+
+    return prowlarrData
--- a/app/internal/mam/mam.py
+++ b/app/internal/mam/mam.py
@@ -1,147 +0,0 @@
-import json
-import logging
-from datetime import datetime
-from typing import Any, Literal, Optional, Dict
-from urllib.parse import urlencode, urljoin
-
-from aiohttp import ClientSession
-from sqlmodel import Session
-
-from app.internal.models import (
-    TorrentSource,
-    ProwlarrSource,
-)
-from app.util.cache import SimpleCache, StringConfigCache
-
-logger = logging.getLogger(__name__)
-
-
-class MamMisconfigured(ValueError):
-    pass
-
-
-MamConfigKey = Literal[
-    "mam_session_id",
-    "mam_source_ttl",
-    "mam_active"
-]
-
-
-class MamConfig(StringConfigCache[MamConfigKey]):
-    def raise_if_invalid(self, session: Session):
-        if not self.get_session_id(session):
-            raise MamMisconfigured("mam_id not set")
-
-    def is_valid(self, session: Session) -> bool:
-        return (
-            self.get_session_id(session) is not None and self.get_session_id(session)!=""
-        )
-
-    def get_session_id(self, session: Session) -> Optional[str]:
-        return self.get(session, "mam_session_id")
-
-    def set_mam_id(self, session: Session, mam_id: str):
-        self.set(session, "mam_session_id", mam_id)
-    def get_source_ttl(self, session: Session) -> int:
-        return self.get_int(session, "mam_source_ttl", 24 * 60 * 60)
-
-    def set_source_ttl(self, session: Session, source_ttl: int):
-        self.set_int(session, "mam_source_ttl", source_ttl)
-    def is_active(self, session: Session) -> bool:
-        return self.get(session, "mam_active")=="True"
-    def set_active(self, session: Session, state: bool):
-        self.set(session, "mam_active", str(state))
-
-
-mam_config = MamConfig()
-mam_source_cache = SimpleCache[dict[str, TorrentSource]]()
-
-
-def flush_Mam_cache():
-    mam_source_cache.flush()
-
-# Downloading is still handled via prowlarr.
-
-async def query_mam(
-    session: Session,
-    query: Optional[str],
-    force_refresh: bool = False,
-) -> dict[str, TorrentSource]:
-    if not query:
-        return dict()
-    
-    session_id = mam_config.get_session_id(session)
-    assert session_id is not None 
-     
-    if not force_refresh:
-        source_ttl = mam_config.get_source_ttl(session)
-        cached_sources = mam_source_cache.get(source_ttl,"mam_"+query)
-        if cached_sources:
-            return cached_sources
-    params: dict[str, Any] = {
-        "tor[text]": query, # book title + author(s)
-        
-        "tor[main_cat]": [13],
-        "tor[searchIn]": "torrents",
-        'tor[srchIn][author]': 'true',
-        'tor[srchIn][title]': 'true',
-        'tor[searchType]': 'active',
-        "startNumber": 0, 
-        "perpage": 100,
-    }
-
-    base_url = "https://www.myanonamouse.net"
-    url = urljoin(base_url, f"/tor/js/loadSearchJSONbasic.php?{urlencode(params, doseq=True)}")
-
-    logger.info("Querying Mam: %s", url)
-    async with ClientSession() as client_session:
-
-        async with client_session.get(
-            url,
-            cookies={"mam_id":session_id}
-        ) as response:
-            search_results =  await response.json()
-    # Storing in dict for faster retrieval by guid
-    sources : Dict[str,TorrentSource] = dict()
-
-    for result in search_results["data"]:
-        # TODO reduce to just authors / narrator unless there is a use for the other data. 
-        sources.update({
-            f'https://www.myanonamouse.net/t/{result["id"]}':
-            TorrentSource(
-                protocol="torrent",
-                guid=f'https://www.myanonamouse.net/t/{result["id"]}',
-                indexer_id=-1, # We don't know MAM's id within prowlarr. 
-                indexer="MyAnonamouse",
-                title=result["title"],
-                seeders=result.get("seeders", 0),
-                leechers=result.get("leechers", 0),
-                size=-1,
-                info_url=f'https://www.myanonamouse.net/t/{result["id"]}',
-                indexer_flags=["freeleech"] if result["personal_freeleech"]==1 else [], # TODO add differentiate between freeleech and VIP freeleech availible flags in result: [free, fl_vip, personal_freeleech]
-                publish_date=datetime.fromisoformat(result["added"]),
-                authors=list(json.loads(result["author_info"]).values() ) if result["author_info"] else [],
-                narrators=list(json.loads(result["narrator_info"]).values()) if result["narrator_info"] else []
-            )
-        }
-        )
-       
-
-    mam_source_cache.set(sources, "mam_" + query)
-
-    return sources
-
-
-def inject_mam_metadata(prowlarrData: list[ProwlarrSource], mamData: Dict[str,TorrentSource]) -> list[ProwlarrSource]:
-    print(mamData)
-    for p in prowlarrData:
-        m =mamData.get(p.guid)
-        if m is None:
-            print("Not found: ", p.title, p.guid)
-            continue
-        p.authors= m.authors
-        p.narrators = m.narrators
-        print(m.authors, m.narrators, p.title)
-
-    return prowlarrData
-
--- a/app/internal/query.py
+++ b/app/internal/query.py
@@ -13,11 +13,7 @@ from app.internal.prowlarr.prowlarr import (
    start_download,
 )

-from app.internal.mam.mam import (
-    mam_config,
-    query_mam,
-    inject_mam_metadata
-)
+from app.internal.indexers.mam import mam_config, query_mam, inject_mam_metadata
 from app.internal.ranking.download_ranking import rank_sources

 querying: set[str] = set()
@@ -72,10 +68,11 @@ async def query_sources(

            mam_sources = await query_mam(
                session,
+                client_session,
                query,
                force_refresh=force_refresh,
            )
-            sources = inject_mam_metadata(prowlarrData=sources,mamData=mam_sources)
+            sources = inject_mam_metadata(prowlarrData=sources, mamData=mam_sources)

        ranked = await rank_sources(session, client_session, sources, book)

--- a/app/internal/ranking/download_ranking.py
+++ b/app/internal/ranking/download_ranking.py
@@ -10,7 +10,7 @@ from sqlmodel import Session
 from app.internal.models import BookRequest, ProwlarrSource
 from app.internal.ranking.quality import quality_config
 from app.internal.ranking.quality_extract import Quality, extract_qualities
-from app.internal.mam.mam import mam_config
+

 class RankSource(pydantic.BaseModel):
    source: ProwlarrSource
@@ -178,22 +178,30 @@ class CompareSource:
        return int(b_title) - int(a_title)

    def _compare_authors(self, a: RankSource, b: RankSource, next_compare: int) -> int:
-        if(mam_config.is_active(self.session) and (a.source.authors!=[] or b.source.authors!=[])):
-            a_score =  get_intersection_length(a.source.authors, self.book.authors)
-            b_score =  get_intersection_length(b.source.authors, self.book.authors)
-        else:
-
-
-            a_score = vaguely_exist_in_title(
+        a_score = max(
+            vaguely_exist_in_title(
                self.book.authors,
                a.source.title,
                quality_config.get_name_exists_ratio(self.session),
-            )
-            b_score = vaguely_exist_in_title(
+            ),
+            fuzzy_author_narrator_match(
+                a.source.authors,
+                self.book.authors,
+                quality_config.get_name_exists_ratio(self.session),
+            ),
+        )
+        b_score = max(
+            vaguely_exist_in_title(
                self.book.authors,
                b.source.title,
                quality_config.get_name_exists_ratio(self.session),
-            )
+            ),
+            fuzzy_author_narrator_match(
+                b.source.authors,
+                self.book.authors,
+                quality_config.get_name_exists_ratio(self.session),
+            ),
+        )
        if a_score == b_score:
            return self._get_next_compare(next_compare)(a, b, next_compare + 1)
        return b_score - a_score
@@ -201,20 +209,30 @@ class CompareSource:
    def _compare_narrators(
        self, a: RankSource, b: RankSource, next_compare: int
    ) -> int:
-        if(mam_config.is_active(self.session) and (a.source.narrators!=[] or b.source.narrators!=[])):
-            a_score =  get_intersection_length(a.source.authors, self.book.authors)
-            b_score =  get_intersection_length(b.source.authors, self.book.authors)
-        else:
-            a_score = vaguely_exist_in_title(
+        a_score = max(
+            vaguely_exist_in_title(
                self.book.narrators,
                a.source.title,
                quality_config.get_name_exists_ratio(self.session),
-            )
-            b_score = vaguely_exist_in_title(
+            ),
+            fuzzy_author_narrator_match(
+                a.source.narrators,
+                self.book.narrators,
+                quality_config.get_name_exists_ratio(self.session),
+            ),
+        )
+        b_score = max(
+            vaguely_exist_in_title(
                self.book.narrators,
                b.source.title,
                quality_config.get_name_exists_ratio(self.session),
-            )
+            ),
+            fuzzy_author_narrator_match(
+                b.source.narrators,
+                self.book.narrators,
+                quality_config.get_name_exists_ratio(self.session),
+            ),
+        )
        if a_score == b_score:
            return self._get_next_compare(next_compare)(a, b, next_compare + 1)
        return b_score - a_score
@@ -234,7 +252,29 @@ class CompareSource:
            return int((a.source.publish_date - b.source.publish_date).total_seconds())
        # With torrents: older => better
        return int((b.source.publish_date - a.source.publish_date).total_seconds())
-    
+
+
+def fuzzy_author_narrator_match(
+    source_people: list[str], book_people: list[str], name_exists_ratio: int
+) -> int:
+    """Calculate a fuzzy matching score between two lists of author/narrator names."""
+    if not source_people or not book_people:
+        return 0
+    score = 0
+    for book_person in book_people:
+        best_match = 0
+        for source_person in source_people:
+            match_score = fuzz.token_set_ratio(
+                book_person, source_person, processor=utils.default_process
+            )
+            best_match = max(best_match, match_score)
+
+        # Only count matches above threshold
+        if best_match > name_exists_ratio:
+            score += 1
+
+    return score
+

 def vaguely_exist_in_title(words: list[str], title: str, name_exists_ratio: int) -> int:
    return sum(
@@ -243,12 +283,12 @@ def vaguely_exist_in_title(words: list[str], title: str, name_exists_ratio: int)
        if fuzz.token_set_ratio(w, title, processor=utils.default_process)
        > name_exists_ratio
    )
-def get_intersection_length(a : list[str],b: list[str]):
+
+
+def get_intersection_length(a: list[str], b: list[str]):
    return len(set(a).intersection(set(b)))


-
-
 def exists_in_title(word: str, title: str, title_exists_ratio: int) -> bool:
    return (
        fuzz.partial_ratio(word, title, processor=utils.default_process)
--- a/app/routers/settings.py
+++ b/app/routers/settings.py
@@ -10,7 +10,7 @@ from app.internal.models import EventEnum, GroupEnum, Notification, User
 from app.internal.prowlarr.indexer_categories import indexer_categories
 from app.internal.notifications import send_notification
 from app.internal.prowlarr.prowlarr import flush_prowlarr_cache, prowlarr_config
-from app.internal.mam.mam import mam_config
+from app.internal.indexers.mam import mam_config

 from app.internal.ranking.quality import IndexerFlag, QualityRange, quality_config
 from app.util.auth import (
@@ -282,7 +282,6 @@ def read_prowlarr(
            "prowlarr_misconfigured": True if prowlarr_misconfigured else False,
            "mam_active": mam_is_active,
            "mam_id": mam_id,
-
        },
    )

@@ -336,6 +335,7 @@ def update_indexer_categories(
        block_name="category",
    )

+
@router.put("/mam/mam_id")
 def update_mam_id(
    mam_id: Annotated[str, Form()],
@@ -347,6 +347,7 @@ def update_mam_id(
    mam_config.set_mam_id(session, mam_id)
    return Response(status_code=204, headers={"HX-Refresh": "true"})

+
@router.put("/mam/activate")
 def activate_mam(
    session: Annotated[Session, Depends(get_session)],
@@ -357,6 +358,7 @@ def activate_mam(
    mam_config.set_active(session, True)
    return Response(status_code=204, headers={"HX-Refresh": "true"})

+
@router.put("/mam/deactivate")
 def deactivate_mam(
    session: Annotated[Session, Depends(get_session)],
@@ -368,7 +370,6 @@ def deactivate_mam(
    return Response(status_code=204, headers={"HX-Refresh": "true"})


-
@router.get("/download")
 def read_download(
    request: Request,
--- a/app/routers/wishlist.py
+++ b/app/routers/wishlist.py
@@ -26,9 +26,7 @@ from app.internal.prowlarr.prowlarr import (
    prowlarr_config,
    start_download,
 )
-from app.internal.mam.mam import (
-    mam_config
-)
+from app.internal.indexers.mam import mam_config
 from app.internal.query import query_sources
 from app.internal.ranking.quality import quality_config
 from app.util.auth import DetailedUser, get_authenticated_user
@@ -202,7 +200,7 @@ async def list_sources(
        {
            "book": result.book,
            "sources": result.sources,
-            "mam_active": mam_config.is_active(session)
+            "mam_active": mam_config.is_active(session),
        },
    )