Add fuzzy author/narrator matching and other suggested changes

This commit is contained in:
Leandro Zazzi
2025-03-11 22:56:16 +01:00
parent 127c989837
commit a8dfd0e7e6
6 changed files with 226 additions and 183 deletions

View File

@@ -0,0 +1,154 @@
import json
import logging
from datetime import datetime
from typing import Any, Literal, Optional
from urllib.parse import urlencode, urljoin
from aiohttp import ClientSession
from sqlmodel import Session
from app.internal.models import (
TorrentSource,
ProwlarrSource,
)
from app.util.cache import SimpleCache, StringConfigCache
logger = logging.getLogger(__name__)
class MamMisconfigured(ValueError):
    """Signals that the MyAnonamouse (MAM) settings are not usable as stored."""
MamConfigKey = Literal["mam_session_id", "mam_source_ttl", "mam_active"]
class MamConfig(StringConfigCache[MamConfigKey]):
    """Typed accessors for the MyAnonamouse (MAM) settings.

    Every value is read and written through the inherited
    ``StringConfigCache`` helpers under one of the ``MamConfigKey`` names.
    """

    def raise_if_invalid(self, session: Session):
        """Raise ``MamMisconfigured`` unless a non-empty session id is stored."""
        if self.get_session_id(session):
            return
        raise MamMisconfigured("mam_id not set")

    def is_valid(self, session: Session) -> bool:
        """Return ``True`` when a session id is stored and is not empty."""
        stored_id = self.get_session_id(session)
        return stored_id is not None and stored_id != ""

    def get_session_id(self, session: Session) -> Optional[str]:
        """Return the value stored under ``mam_session_id``."""
        return self.get(session, "mam_session_id")

    def set_mam_id(self, session: Session, mam_id: str):
        """Store ``mam_id`` under ``mam_session_id``."""
        self.set(session, "mam_session_id", mam_id)

    def get_source_ttl(self, session: Session) -> int:
        """Return the cache lifetime for MAM results, defaulting to one day in seconds."""
        one_day = 24 * 60 * 60
        return self.get_int(session, "mam_source_ttl", one_day)

    def set_source_ttl(self, session: Session, source_ttl: int):
        """Store the cache lifetime for MAM results."""
        self.set_int(session, "mam_source_ttl", source_ttl)

    def is_active(self, session: Session) -> bool:
        """Return ``True`` only when the stored flag is exactly the string ``"True"``."""
        stored_flag = self.get(session, "mam_active")
        return stored_flag == "True"

    def set_active(self, session: Session, state: bool):
        """Store ``state`` as its ``str()`` form (``"True"`` / ``"False"``)."""
        self.set(session, "mam_active", str(state))
mam_config = MamConfig()
mam_source_cache = SimpleCache[dict[str, TorrentSource]]()
def flush_Mam_cache() -> None:
    """Flush the module-level cache of MAM search results."""
    mam_source_cache.flush()
# Downloading is still handled via prowlarr.
def _parse_people(raw: Optional[str]) -> list[str]:
    """Return the values of a JSON object string, or ``[]`` when ``raw`` is empty."""
    if not raw:
        return []
    return list(json.loads(raw).values())


def _mam_result_to_source(result: dict[str, Any]) -> TorrentSource:
    """Build a ``TorrentSource`` from one entry of the MAM search response."""
    # The torrent page URL doubles as guid and info URL; build it once.
    guid = f'https://www.myanonamouse.net/t/{result["id"]}'
    # TODO reduce to just authors / narrator unless there is a use for the other data.
    return TorrentSource(
        protocol="torrent",
        guid=guid,
        indexer_id=-1,  # We don't know MAM's id within prowlarr.
        indexer="MyAnonamouse",
        title=result["title"],
        seeders=result.get("seeders", 0),
        leechers=result.get("leechers", 0),
        size=-1,
        info_url=guid,
        # TODO differentiate between freeleech and VIP freeleech; available
        # flags in result: [free, fl_vip, personal_freeleech]
        indexer_flags=(
            ["freeleech"] if result.get("personal_freeleech") == 1 else []
        ),
        publish_date=datetime.fromisoformat(result["added"]),
        authors=_parse_people(result.get("author_info")),
        narrators=_parse_people(result.get("narrator_info")),
    )


async def query_mam(
    session: Session,
    client_session: ClientSession,
    query: Optional[str],
    force_refresh: bool = False,
) -> dict[str, TorrentSource]:
    """Search MyAnonamouse for ``query`` and return the hits keyed by guid.

    Args:
        session: Database session used to read the MAM settings.
        client_session: HTTP client used for the search request.
        query: Search text (book title + author(s)); falsy values yield ``{}``.
        force_refresh: Skip the cache lookup and always query MAM.

    Returns:
        A mapping from torrent page URL (the guid) to its ``TorrentSource``.

    Raises:
        MamMisconfigured: If no session id is stored.
        aiohttp.ClientResponseError: If MAM answers with an error status.
    """
    if not query:
        return {}

    session_id = mam_config.get_session_id(session)
    # Explicit check instead of ``assert``: asserts vanish under ``python -O``
    # and an empty id would otherwise be sent as a cookie.
    if not session_id:
        raise MamMisconfigured("mam_id not set")

    cache_key = "mam_" + query
    if not force_refresh:
        source_ttl = mam_config.get_source_ttl(session)
        cached_sources = mam_source_cache.get(source_ttl, cache_key)
        # An empty dict is a valid cached answer ("no hits") and must not
        # trigger a new request.
        # NOTE(review): assumes SimpleCache.get returns None on a miss - confirm.
        if cached_sources is not None:
            return cached_sources

    params: dict[str, Any] = {
        "tor[text]": query,  # book title + author(s)
        "tor[main_cat]": [13],  # MAM audiobook category
        "tor[searchIn]": "torrents",
        "tor[srchIn][author]": "true",
        "tor[srchIn][title]": "true",
        "tor[searchType]": "active",  # only search for torrents with at least 1 seeder.
        "startNumber": 0,
        "perpage": 100,
    }
    base_url = "https://www.myanonamouse.net"
    url = urljoin(
        base_url, f"/tor/js/loadSearchJSONbasic.php?{urlencode(params, doseq=True)}"
    )
    logger.info("Querying Mam: %s", url)

    async with client_session.get(url, cookies={"mam_id": session_id}) as response:
        # Fail with a clear HTTP error rather than a confusing parse/KeyError later.
        response.raise_for_status()
        search_results = await response.json()

    results = search_results.get("data")
    if results is None:
        # NOTE(review): MAM appears to answer with an "error" field and no
        # "data" when nothing matches - confirm. Not cached, so a transient
        # failure is retried on the next call.
        logger.warning(
            "Mam returned no data for %r: %s", query, search_results.get("error")
        )
        return {}

    # Storing in dict for faster retrieval by guid
    sources: dict[str, TorrentSource] = {}
    for result in results:
        source = _mam_result_to_source(result)
        sources[source.guid] = source

    mam_source_cache.set(sources, cache_key)
    return sources
def inject_mam_metadata(
    prowlarrData: list[ProwlarrSource], mamData: dict[str, TorrentSource]
) -> list[ProwlarrSource]:
    """Copy authors and narrators from MAM results onto matching Prowlarr sources.

    Sources are matched by ``guid``. Sources without a MAM entry are left
    untouched. The input list is mutated in place and returned.
    """
    for source in prowlarrData:
        mam_entry = mamData.get(source.guid)
        if mam_entry is not None:
            source.authors = mam_entry.authors
            source.narrators = mam_entry.narrators
    return prowlarrData

View File

@@ -1,147 +0,0 @@
import json
import logging
from datetime import datetime
from typing import Any, Literal, Optional, Dict
from urllib.parse import urlencode, urljoin
from aiohttp import ClientSession
from sqlmodel import Session
from app.internal.models import (
TorrentSource,
ProwlarrSource,
)
from app.util.cache import SimpleCache, StringConfigCache
logger = logging.getLogger(__name__)
class MamMisconfigured(ValueError):
    """Error for MyAnonamouse (MAM) settings that are missing or unusable."""
MamConfigKey = Literal[
"mam_session_id",
"mam_source_ttl",
"mam_active"
]
class MamConfig(StringConfigCache[MamConfigKey]):
    """Accessors for the MyAnonamouse (MAM) settings, backed by ``StringConfigCache``."""

    def raise_if_invalid(self, session: Session):
        """Raise ``MamMisconfigured`` when the stored session id is missing or empty."""
        session_id = self.get_session_id(session)
        if not session_id:
            raise MamMisconfigured("mam_id not set")

    def is_valid(self, session: Session) -> bool:
        """Tell whether a non-empty session id is stored."""
        session_id = self.get_session_id(session)
        if session_id is None:
            return False
        return session_id != ""

    def get_session_id(self, session: Session) -> Optional[str]:
        """Read the ``mam_session_id`` setting."""
        return self.get(session, "mam_session_id")

    def set_mam_id(self, session: Session, mam_id: str):
        """Write the ``mam_session_id`` setting."""
        self.set(session, "mam_session_id", mam_id)

    def get_source_ttl(self, session: Session) -> int:
        """Read ``mam_source_ttl``; the fallback is 24 hours expressed in seconds."""
        return self.get_int(session, "mam_source_ttl", 24 * 60 * 60)

    def set_source_ttl(self, session: Session, source_ttl: int):
        """Write the ``mam_source_ttl`` setting."""
        self.set_int(session, "mam_source_ttl", source_ttl)

    def is_active(self, session: Session) -> bool:
        """Tell whether ``mam_active`` holds the literal string ``"True"``."""
        return "True" == self.get(session, "mam_active")

    def set_active(self, session: Session, state: bool):
        """Write ``mam_active`` as ``str(state)``."""
        text = str(state)
        self.set(session, "mam_active", text)
mam_config = MamConfig()
mam_source_cache = SimpleCache[dict[str, TorrentSource]]()
def flush_Mam_cache() -> None:
    """Empty ``mam_source_cache`` by calling its ``flush`` method."""
    mam_source_cache.flush()
# Downloading is still handled via prowlarr.
async def query_mam(
    session: Session,
    query: Optional[str],
    force_refresh: bool = False,
) -> dict[str, TorrentSource]:
    """Search MyAnonamouse for ``query`` and return the hits keyed by guid.

    Args:
        session: Database session used to read the MAM settings.
        query: Search text (book title + author(s)); falsy values yield ``{}``.
        force_refresh: Skip the cache lookup and always query MAM.

    Returns:
        A mapping from torrent page URL (the guid) to its ``TorrentSource``.

    Raises:
        MamMisconfigured: If no session id is stored.
        aiohttp.ClientResponseError: If MAM answers with an error status.
    """
    if not query:
        return {}

    session_id = mam_config.get_session_id(session)
    # ``assert`` is removed under ``python -O`` and accepts an empty string;
    # validate explicitly instead.
    if not session_id:
        raise MamMisconfigured("mam_id not set")

    cache_key = "mam_" + query
    if not force_refresh:
        source_ttl = mam_config.get_source_ttl(session)
        cached_sources = mam_source_cache.get(source_ttl, cache_key)
        # A cached empty dict is still a hit.
        # NOTE(review): assumes SimpleCache.get returns None on a miss - confirm.
        if cached_sources is not None:
            return cached_sources

    params: dict[str, Any] = {
        "tor[text]": query,  # book title + author(s)
        "tor[main_cat]": [13],
        "tor[searchIn]": "torrents",
        "tor[srchIn][author]": "true",
        "tor[srchIn][title]": "true",
        "tor[searchType]": "active",
        "startNumber": 0,
        "perpage": 100,
    }
    base_url = "https://www.myanonamouse.net"
    url = urljoin(
        base_url, f"/tor/js/loadSearchJSONbasic.php?{urlencode(params, doseq=True)}"
    )
    logger.info("Querying Mam: %s", url)

    # The signature carries no shared HTTP client, so one is opened per call.
    async with ClientSession() as client_session:
        async with client_session.get(
            url, cookies={"mam_id": session_id}
        ) as response:
            # Surface HTTP failures directly instead of a later KeyError.
            response.raise_for_status()
            search_results = await response.json()

    results = search_results.get("data")
    if results is None:
        # NOTE(review): MAM appears to answer with an "error" field and no
        # "data" when nothing matches - confirm. Left uncached on purpose.
        logger.warning(
            "Mam returned no data for %r: %s", query, search_results.get("error")
        )
        return {}

    # Storing in dict for faster retrieval by guid
    sources: dict[str, TorrentSource] = {}
    for result in results:
        # TODO reduce to just authors / narrator unless there is a use for the other data.
        guid = f'https://www.myanonamouse.net/t/{result["id"]}'
        author_info = result.get("author_info")
        narrator_info = result.get("narrator_info")
        sources[guid] = TorrentSource(
            protocol="torrent",
            guid=guid,
            indexer_id=-1,  # We don't know MAM's id within prowlarr.
            indexer="MyAnonamouse",
            title=result["title"],
            seeders=result.get("seeders", 0),
            leechers=result.get("leechers", 0),
            size=-1,
            info_url=guid,
            # TODO differentiate between freeleech and VIP freeleech; available
            # flags in result: [free, fl_vip, personal_freeleech]
            indexer_flags=(
                ["freeleech"] if result.get("personal_freeleech") == 1 else []
            ),
            publish_date=datetime.fromisoformat(result["added"]),
            authors=list(json.loads(author_info).values()) if author_info else [],
            narrators=(
                list(json.loads(narrator_info).values()) if narrator_info else []
            ),
        )

    mam_source_cache.set(sources, cache_key)
    return sources
def inject_mam_metadata(
    prowlarrData: list[ProwlarrSource], mamData: dict[str, TorrentSource]
) -> list[ProwlarrSource]:
    """Copy authors and narrators from MAM results onto matching Prowlarr sources.

    Sources are matched by ``guid``. Sources without a MAM entry are left
    untouched. The input list is mutated in place and returned.
    """
    for source in prowlarrData:
        mam_entry = mamData.get(source.guid)
        if mam_entry is None:
            # Diagnostics go through the module logger rather than stdout.
            logger.debug("No MAM entry for %s (%s)", source.title, source.guid)
            continue
        source.authors = mam_entry.authors
        source.narrators = mam_entry.narrators
        logger.debug(
            "MAM metadata for %s: authors=%s narrators=%s",
            source.title,
            mam_entry.authors,
            mam_entry.narrators,
        )
    return prowlarrData

View File

@@ -13,11 +13,7 @@ from app.internal.prowlarr.prowlarr import (
start_download,
)
from app.internal.mam.mam import (
mam_config,
query_mam,
inject_mam_metadata
)
from app.internal.indexers.mam import mam_config, query_mam, inject_mam_metadata
from app.internal.ranking.download_ranking import rank_sources
querying: set[str] = set()
@@ -72,10 +68,11 @@ async def query_sources(
mam_sources = await query_mam(
session,
client_session,
query,
force_refresh=force_refresh,
)
sources = inject_mam_metadata(prowlarrData=sources,mamData=mam_sources)
sources = inject_mam_metadata(prowlarrData=sources, mamData=mam_sources)
ranked = await rank_sources(session, client_session, sources, book)

View File

@@ -10,7 +10,7 @@ from sqlmodel import Session
from app.internal.models import BookRequest, ProwlarrSource
from app.internal.ranking.quality import quality_config
from app.internal.ranking.quality_extract import Quality, extract_qualities
from app.internal.mam.mam import mam_config
class RankSource(pydantic.BaseModel):
source: ProwlarrSource
@@ -178,22 +178,30 @@ class CompareSource:
return int(b_title) - int(a_title)
def _compare_authors(self, a: RankSource, b: RankSource, next_compare: int) -> int:
    """Order two sources by how well they match the book's authors.

    Each source's score is the larger of two counts: book authors that
    vaguely appear in the source title, and book authors that fuzzily match
    the source's own author list. Ties defer to the next comparator.

    Returns:
        A negative number when ``a`` scores higher, a positive number when
        ``b`` scores higher, otherwise the next comparator's result.
    """
    # The threshold is the same for every call below; read it once.
    ratio = quality_config.get_name_exists_ratio(self.session)
    wanted = self.book.authors

    def score(candidate: RankSource) -> int:
        return max(
            vaguely_exist_in_title(wanted, candidate.source.title, ratio),
            fuzzy_author_narrator_match(candidate.source.authors, wanted, ratio),
        )

    a_score = score(a)
    b_score = score(b)
    if a_score == b_score:
        return self._get_next_compare(next_compare)(a, b, next_compare + 1)
    return b_score - a_score
@@ -201,20 +209,30 @@ class CompareSource:
def _compare_narrators(
    self, a: RankSource, b: RankSource, next_compare: int
) -> int:
    """Order two sources by how well they match the book's narrators.

    Each source's score is the larger of two counts: book narrators that
    vaguely appear in the source title, and book narrators that fuzzily match
    the source's own narrator list. Ties defer to the next comparator.

    Returns:
        A negative number when ``a`` scores higher, a positive number when
        ``b`` scores higher, otherwise the next comparator's result.
    """
    # The threshold is the same for every call below; read it once.
    ratio = quality_config.get_name_exists_ratio(self.session)
    wanted = self.book.narrators

    def score(candidate: RankSource) -> int:
        # Narrators are compared with narrators; the author lists play no part.
        return max(
            vaguely_exist_in_title(wanted, candidate.source.title, ratio),
            fuzzy_author_narrator_match(candidate.source.narrators, wanted, ratio),
        )

    a_score = score(a)
    b_score = score(b)
    if a_score == b_score:
        return self._get_next_compare(next_compare)(a, b, next_compare + 1)
    return b_score - a_score
@@ -234,7 +252,29 @@ class CompareSource:
return int((a.source.publish_date - b.source.publish_date).total_seconds())
# With torrents: older => better
return int((b.source.publish_date - a.source.publish_date).total_seconds())
def fuzzy_author_narrator_match(
    source_people: list[str], book_people: list[str], name_exists_ratio: int
) -> int:
    """Count the book's people that fuzzily match someone listed by the source.

    A book person counts when their best ``fuzz.token_set_ratio`` against any
    source person is strictly greater than ``name_exists_ratio``. Returns 0
    when either list is empty.
    """
    if not (source_people and book_people):
        return 0

    def best_ratio(wanted: str):
        # Seeded with 0 so the result never drops below the original floor.
        ratios = [
            fuzz.token_set_ratio(wanted, candidate, processor=utils.default_process)
            for candidate in source_people
        ]
        return max([0, *ratios])

    return sum(1 for wanted in book_people if best_ratio(wanted) > name_exists_ratio)
def vaguely_exist_in_title(words: list[str], title: str, name_exists_ratio: int) -> int:
return sum(
@@ -243,12 +283,12 @@ def vaguely_exist_in_title(words: list[str], title: str, name_exists_ratio: int)
if fuzz.token_set_ratio(w, title, processor=utils.default_process)
> name_exists_ratio
)
def get_intersection_length(a: list[str], b: list[str]) -> int:
    """Return how many distinct strings occur in both ``a`` and ``b``."""
    return len(set(a) & set(b))
def exists_in_title(word: str, title: str, title_exists_ratio: int) -> bool:
return (
fuzz.partial_ratio(word, title, processor=utils.default_process)

View File

@@ -10,7 +10,7 @@ from app.internal.models import EventEnum, GroupEnum, Notification, User
from app.internal.prowlarr.indexer_categories import indexer_categories
from app.internal.notifications import send_notification
from app.internal.prowlarr.prowlarr import flush_prowlarr_cache, prowlarr_config
from app.internal.mam.mam import mam_config
from app.internal.indexers.mam import mam_config
from app.internal.ranking.quality import IndexerFlag, QualityRange, quality_config
from app.util.auth import (
@@ -282,7 +282,6 @@ def read_prowlarr(
"prowlarr_misconfigured": True if prowlarr_misconfigured else False,
"mam_active": mam_is_active,
"mam_id": mam_id,
},
)
@@ -336,6 +335,7 @@ def update_indexer_categories(
block_name="category",
)
@router.put("/mam/mam_id")
def update_mam_id(
mam_id: Annotated[str, Form()],
@@ -347,6 +347,7 @@ def update_mam_id(
mam_config.set_mam_id(session, mam_id)
return Response(status_code=204, headers={"HX-Refresh": "true"})
@router.put("/mam/activate")
def activate_mam(
session: Annotated[Session, Depends(get_session)],
@@ -357,6 +358,7 @@ def activate_mam(
mam_config.set_active(session, True)
return Response(status_code=204, headers={"HX-Refresh": "true"})
@router.put("/mam/deactivate")
def deactivate_mam(
session: Annotated[Session, Depends(get_session)],
@@ -368,7 +370,6 @@ def deactivate_mam(
return Response(status_code=204, headers={"HX-Refresh": "true"})
@router.get("/download")
def read_download(
request: Request,

View File

@@ -26,9 +26,7 @@ from app.internal.prowlarr.prowlarr import (
prowlarr_config,
start_download,
)
from app.internal.mam.mam import (
mam_config
)
from app.internal.indexers.mam import mam_config
from app.internal.query import query_sources
from app.internal.ranking.quality import quality_config
from app.util.auth import DetailedUser, get_authenticated_user
@@ -202,7 +200,7 @@ async def list_sources(
{
"book": result.book,
"sources": result.sources,
"mam_active": mam_config.is_active(session)
"mam_active": mam_config.is_active(session),
},
)