mirror of
https://github.com/markbeep/AudioBookRequest.git
synced 2026-01-08 06:29:41 -06:00
add fuzzy matching and other suggestions
This commit is contained in:
154  app/internal/indexers/mam.py  Normal file
@@ -0,0 +1,154 @@
import json
import logging
from datetime import datetime
from typing import Any, Literal, Optional
from urllib.parse import urlencode, urljoin

from aiohttp import ClientSession
from sqlmodel import Session

from app.internal.models import (
    TorrentSource,
    ProwlarrSource,
)
from app.util.cache import SimpleCache, StringConfigCache

logger = logging.getLogger(__name__)


class MamMisconfigured(ValueError):
    pass


MamConfigKey = Literal["mam_session_id", "mam_source_ttl", "mam_active"]


class MamConfig(StringConfigCache[MamConfigKey]):
    def raise_if_invalid(self, session: Session):
        if not self.get_session_id(session):
            raise MamMisconfigured("mam_id not set")

    def is_valid(self, session: Session) -> bool:
        return (
            self.get_session_id(session) is not None
            and self.get_session_id(session) != ""
        )

    def get_session_id(self, session: Session) -> Optional[str]:
        return self.get(session, "mam_session_id")

    def set_mam_id(self, session: Session, mam_id: str):
        self.set(session, "mam_session_id", mam_id)

    def get_source_ttl(self, session: Session) -> int:
        return self.get_int(session, "mam_source_ttl", 24 * 60 * 60)

    def set_source_ttl(self, session: Session, source_ttl: int):
        self.set_int(session, "mam_source_ttl", source_ttl)

    def is_active(self, session: Session) -> bool:
        return self.get(session, "mam_active") == "True"

    def set_active(self, session: Session, state: bool):
        self.set(session, "mam_active", str(state))


mam_config = MamConfig()
mam_source_cache = SimpleCache[dict[str, TorrentSource]]()


def flush_Mam_cache():
    mam_source_cache.flush()


# Downloading is still handled via Prowlarr.


async def query_mam(
    session: Session,
    client_session: ClientSession,
    query: Optional[str],
    force_refresh: bool = False,
) -> dict[str, TorrentSource]:
    if not query:
        return dict()

    session_id = mam_config.get_session_id(session)
    assert session_id is not None

    if not force_refresh:
        source_ttl = mam_config.get_source_ttl(session)
        cached_sources = mam_source_cache.get(source_ttl, "mam_" + query)
        if cached_sources:
            return cached_sources
    params: dict[str, Any] = {
        "tor[text]": query,  # book title + author(s)
        "tor[main_cat]": [13],  # MAM audiobook category
        "tor[searchIn]": "torrents",
        "tor[srchIn][author]": "true",
        "tor[srchIn][title]": "true",
        "tor[searchType]": "active",  # only search for torrents with at least 1 seeder
        "startNumber": 0,
        "perpage": 100,
    }

    base_url = "https://www.myanonamouse.net"
    url = urljoin(
        base_url, f"/tor/js/loadSearchJSONbasic.php?{urlencode(params, doseq=True)}"
    )

    logger.info("Querying MAM: %s", url)

    async with client_session.get(url, cookies={"mam_id": session_id}) as response:
        search_results = await response.json()
        # Storing in a dict for faster retrieval by guid
        sources: dict[str, TorrentSource] = dict()

        for result in search_results["data"]:
            # TODO: reduce to just authors/narrators unless there is a use for the other data.
            sources.update(
                {
                    f'https://www.myanonamouse.net/t/{result["id"]}': TorrentSource(
                        protocol="torrent",
                        guid=f'https://www.myanonamouse.net/t/{result["id"]}',
                        indexer_id=-1,  # We don't know MAM's id within Prowlarr.
                        indexer="MyAnonamouse",
                        title=result["title"],
                        seeders=result.get("seeders", 0),
                        leechers=result.get("leechers", 0),
                        size=-1,
                        info_url=f'https://www.myanonamouse.net/t/{result["id"]}',
                        indexer_flags=(
                            ["freeleech"] if result["personal_freeleech"] == 1 else []
                        ),  # TODO: differentiate between freeleech and VIP freeleech; available flags in result: [free, fl_vip, personal_freeleech]
                        publish_date=datetime.fromisoformat(result["added"]),
                        authors=(
                            list(json.loads(result["author_info"]).values())
                            if result["author_info"]
                            else []
                        ),
                        narrators=(
                            list(json.loads(result["narrator_info"]).values())
                            if result["narrator_info"]
                            else []
                        ),
                    )
                }
            )

    mam_source_cache.set(sources, "mam_" + query)

    return sources


def inject_mam_metadata(
    prowlarrData: list[ProwlarrSource], mamData: dict[str, TorrentSource]
) -> list[ProwlarrSource]:
    for p in prowlarrData:
        m = mamData.get(p.guid)
        if m is None:
            continue
        p.authors = m.authors
        p.narrators = m.narrators

    return prowlarrData
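For context, a minimal usage sketch (not part of the commit) of the new module's public surface: query_mam now takes the aiohttp ClientSession from the caller instead of creating its own, and inject_mam_metadata merges the returned author/narrator metadata into Prowlarr results by guid. The enrich helper and its variable names are hypothetical, and it assumes mam_session_id has already been configured.

    # Hypothetical sketch; mirrors how query_sources uses these helpers below.
    from aiohttp import ClientSession
    from sqlmodel import Session

    from app.internal.indexers.mam import query_mam, inject_mam_metadata
    from app.internal.models import ProwlarrSource


    async def enrich(session: Session, prowlarr_sources: list[ProwlarrSource], query: str):
        # The caller owns the HTTP session; query_mam only issues the GET with the mam_id cookie.
        async with ClientSession() as client_session:
            mam_sources = await query_mam(session, client_session, query)
        # MAM results are keyed by guid, so merging is a dict lookup per Prowlarr source.
        return inject_mam_metadata(prowlarrData=prowlarr_sources, mamData=mam_sources)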
app/internal/mam/mam.py (deleted)
@@ -1,147 +0,0 @@
import json
import logging
from datetime import datetime
from typing import Any, Literal, Optional, Dict
from urllib.parse import urlencode, urljoin

from aiohttp import ClientSession
from sqlmodel import Session

from app.internal.models import (
    TorrentSource,
    ProwlarrSource,
)
from app.util.cache import SimpleCache, StringConfigCache

logger = logging.getLogger(__name__)


class MamMisconfigured(ValueError):
    pass


MamConfigKey = Literal[
    "mam_session_id",
    "mam_source_ttl",
    "mam_active"
]


class MamConfig(StringConfigCache[MamConfigKey]):
    def raise_if_invalid(self, session: Session):
        if not self.get_session_id(session):
            raise MamMisconfigured("mam_id not set")

    def is_valid(self, session: Session) -> bool:
        return (
            self.get_session_id(session) is not None and self.get_session_id(session)!=""
        )

    def get_session_id(self, session: Session) -> Optional[str]:
        return self.get(session, "mam_session_id")

    def set_mam_id(self, session: Session, mam_id: str):
        self.set(session, "mam_session_id", mam_id)
    def get_source_ttl(self, session: Session) -> int:
        return self.get_int(session, "mam_source_ttl", 24 * 60 * 60)

    def set_source_ttl(self, session: Session, source_ttl: int):
        self.set_int(session, "mam_source_ttl", source_ttl)
    def is_active(self, session: Session) -> bool:
        return self.get(session, "mam_active")=="True"
    def set_active(self, session: Session, state: bool):
        self.set(session, "mam_active", str(state))


mam_config = MamConfig()
mam_source_cache = SimpleCache[dict[str, TorrentSource]]()


def flush_Mam_cache():
    mam_source_cache.flush()

# Downloading is still handled via prowlarr.

async def query_mam(
    session: Session,
    query: Optional[str],
    force_refresh: bool = False,
) -> dict[str, TorrentSource]:
    if not query:
        return dict()

    session_id = mam_config.get_session_id(session)
    assert session_id is not None

    if not force_refresh:
        source_ttl = mam_config.get_source_ttl(session)
        cached_sources = mam_source_cache.get(source_ttl,"mam_"+query)
        if cached_sources:
            return cached_sources
    params: dict[str, Any] = {
        "tor[text]": query,  # book title + author(s)

        "tor[main_cat]": [13],
        "tor[searchIn]": "torrents",
        'tor[srchIn][author]': 'true',
        'tor[srchIn][title]': 'true',
        'tor[searchType]': 'active',
        "startNumber": 0,
        "perpage": 100,
    }

    base_url = "https://www.myanonamouse.net"
    url = urljoin(base_url, f"/tor/js/loadSearchJSONbasic.php?{urlencode(params, doseq=True)}")

    logger.info("Querying Mam: %s", url)
    async with ClientSession() as client_session:

        async with client_session.get(
            url,
            cookies={"mam_id":session_id}
        ) as response:
            search_results = await response.json()
            # Storing in dict for faster retrieval by guid
            sources : Dict[str,TorrentSource] = dict()

            for result in search_results["data"]:
                # TODO reduce to just authors / narrator unless there is a use for the other data.
                sources.update({
                    f'https://www.myanonamouse.net/t/{result["id"]}':
                    TorrentSource(
                        protocol="torrent",
                        guid=f'https://www.myanonamouse.net/t/{result["id"]}',
                        indexer_id=-1,  # We don't know MAM's id within prowlarr.
                        indexer="MyAnonamouse",
                        title=result["title"],
                        seeders=result.get("seeders", 0),
                        leechers=result.get("leechers", 0),
                        size=-1,
                        info_url=f'https://www.myanonamouse.net/t/{result["id"]}',
                        indexer_flags=["freeleech"] if result["personal_freeleech"]==1 else [],  # TODO add differentiate between freeleech and VIP freeleech availible flags in result: [free, fl_vip, personal_freeleech]
                        publish_date=datetime.fromisoformat(result["added"]),
                        authors=list(json.loads(result["author_info"]).values() ) if result["author_info"] else [],
                        narrators=list(json.loads(result["narrator_info"]).values()) if result["narrator_info"] else []
                    )
                }
                )


    mam_source_cache.set(sources, "mam_" + query)

    return sources


def inject_mam_metadata(prowlarrData: list[ProwlarrSource], mamData: Dict[str,TorrentSource]) -> list[ProwlarrSource]:
    print(mamData)
    for p in prowlarrData:
        m =mamData.get(p.guid)
        if m is None:
            print("Not found: ", p.title, p.guid)
            continue
        p.authors= m.authors
        p.narrators = m.narrators
        print(m.authors, m.narrators, p.title)

    return prowlarrData
@@ -13,11 +13,7 @@ from app.internal.prowlarr.prowlarr import (
     start_download,
 )
 
-from app.internal.mam.mam import (
-    mam_config,
-    query_mam,
-    inject_mam_metadata
-)
+from app.internal.indexers.mam import mam_config, query_mam, inject_mam_metadata
 from app.internal.ranking.download_ranking import rank_sources
 
 querying: set[str] = set()
@@ -72,10 +68,11 @@ async def query_sources(
 
     mam_sources = await query_mam(
         session,
+        client_session,
         query,
         force_refresh=force_refresh,
     )
-    sources = inject_mam_metadata(prowlarrData=sources,mamData=mam_sources)
+    sources = inject_mam_metadata(prowlarrData=sources, mamData=mam_sources)
 
     ranked = await rank_sources(session, client_session, sources, book)
@@ -10,7 +10,7 @@ from sqlmodel import Session
 from app.internal.models import BookRequest, ProwlarrSource
 from app.internal.ranking.quality import quality_config
 from app.internal.ranking.quality_extract import Quality, extract_qualities
-from app.internal.mam.mam import mam_config
+from app.internal.indexers.mam import mam_config
 
 
 class RankSource(pydantic.BaseModel):
     source: ProwlarrSource
@@ -178,22 +178,30 @@ class CompareSource:
         return int(b_title) - int(a_title)
 
     def _compare_authors(self, a: RankSource, b: RankSource, next_compare: int) -> int:
-        if(mam_config.is_active(self.session) and (a.source.authors!=[] or b.source.authors!=[])):
-            a_score = get_intersection_length(a.source.authors, self.book.authors)
-            b_score = get_intersection_length(b.source.authors, self.book.authors)
-        else:
-
-
-            a_score = vaguely_exist_in_title(
+        a_score = max(
+            vaguely_exist_in_title(
                 self.book.authors,
                 a.source.title,
                 quality_config.get_name_exists_ratio(self.session),
-            )
-        b_score = vaguely_exist_in_title(
+            ),
+            fuzzy_author_narrator_match(
+                a.source.authors,
+                self.book.authors,
+                quality_config.get_name_exists_ratio(self.session),
+            ),
+        )
+        b_score = max(
+            vaguely_exist_in_title(
                 self.book.authors,
                 b.source.title,
                 quality_config.get_name_exists_ratio(self.session),
-            )
+            ),
+            fuzzy_author_narrator_match(
+                b.source.authors,
+                self.book.authors,
+                quality_config.get_name_exists_ratio(self.session),
+            ),
+        )
         if a_score == b_score:
             return self._get_next_compare(next_compare)(a, b, next_compare + 1)
         return b_score - a_score
@@ -201,20 +209,30 @@ class CompareSource:
     def _compare_narrators(
         self, a: RankSource, b: RankSource, next_compare: int
     ) -> int:
-        if(mam_config.is_active(self.session) and (a.source.narrators!=[] or b.source.narrators!=[])):
-            a_score = get_intersection_length(a.source.authors, self.book.authors)
-            b_score = get_intersection_length(b.source.authors, self.book.authors)
-        else:
-            a_score = vaguely_exist_in_title(
+        a_score = max(
+            vaguely_exist_in_title(
                 self.book.narrators,
                 a.source.title,
                 quality_config.get_name_exists_ratio(self.session),
-            )
-        b_score = vaguely_exist_in_title(
+            ),
+            fuzzy_author_narrator_match(
+                a.source.narrators,
+                self.book.narrators,
+                quality_config.get_name_exists_ratio(self.session),
+            ),
+        )
+        b_score = max(
+            vaguely_exist_in_title(
                 self.book.narrators,
                 b.source.title,
                 quality_config.get_name_exists_ratio(self.session),
-            )
+            ),
+            fuzzy_author_narrator_match(
+                b.source.narrators,
+                self.book.narrators,
+                quality_config.get_name_exists_ratio(self.session),
+            ),
+        )
         if a_score == b_score:
             return self._get_next_compare(next_compare)(a, b, next_compare + 1)
         return b_score - a_score
@@ -234,7 +252,29 @@ class CompareSource:
             return int((a.source.publish_date - b.source.publish_date).total_seconds())
         # With torrents: older => better
         return int((b.source.publish_date - a.source.publish_date).total_seconds())
 
 
+def fuzzy_author_narrator_match(
+    source_people: list[str], book_people: list[str], name_exists_ratio: int
+) -> int:
+    """Calculate a fuzzy matching score between two lists of author/narrator names."""
+    if not source_people or not book_people:
+        return 0
+    score = 0
+    for book_person in book_people:
+        best_match = 0
+        for source_person in source_people:
+            match_score = fuzz.token_set_ratio(
+                book_person, source_person, processor=utils.default_process
+            )
+            best_match = max(best_match, match_score)
+
+        # Only count matches above threshold
+        if best_match > name_exists_ratio:
+            score += 1
+
+    return score
+
+
 def vaguely_exist_in_title(words: list[str], title: str, name_exists_ratio: int) -> int:
     return sum(
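For illustration, a small standalone sketch (not part of the commit) of the matching behavior fuzzy_author_narrator_match builds on: rapidfuzz's token_set_ratio with default_process, which the surrounding fuzz/utils calls appear to use, compares token sets case-insensitively, so a "Last, First" credit still scores 100 against "First Last". The example names below are made up.

    # Hypothetical example data; this mirrors the inner loop of fuzzy_author_narrator_match.
    from rapidfuzz import fuzz, utils

    book_authors = ["Brandon Sanderson"]
    source_people = ["Sanderson, Brandon", "Michael Kramer"]

    for book_person in book_authors:
        best_match = max(
            fuzz.token_set_ratio(book_person, s, processor=utils.default_process)
            for s in source_people
        )
        # Prints 100.0 for the reversed "Last, First" credit; the author then counts
        # toward the score once best_match clears name_exists_ratio.
        print(book_person, best_match)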
@@ -243,12 +283,12 @@ def vaguely_exist_in_title(words: list[str], title: str, name_exists_ratio: int)
         if fuzz.token_set_ratio(w, title, processor=utils.default_process)
         > name_exists_ratio
     )
-def get_intersection_length(a : list[str],b: list[str]):
+
+
+def get_intersection_length(a: list[str], b: list[str]):
     return len(set(a).intersection(set(b)))
 
 
 def exists_in_title(word: str, title: str, title_exists_ratio: int) -> bool:
     return (
         fuzz.partial_ratio(word, title, processor=utils.default_process)
@@ -10,7 +10,7 @@ from app.internal.models import EventEnum, GroupEnum, Notification, User
 from app.internal.prowlarr.indexer_categories import indexer_categories
 from app.internal.notifications import send_notification
 from app.internal.prowlarr.prowlarr import flush_prowlarr_cache, prowlarr_config
-from app.internal.mam.mam import mam_config
+from app.internal.indexers.mam import mam_config
 
 from app.internal.ranking.quality import IndexerFlag, QualityRange, quality_config
 from app.util.auth import (
@@ -282,7 +282,6 @@ def read_prowlarr(
             "prowlarr_misconfigured": True if prowlarr_misconfigured else False,
             "mam_active": mam_is_active,
             "mam_id": mam_id,
-
         },
     )
@@ -336,6 +335,7 @@ def update_indexer_categories(
         block_name="category",
     )
 
+
 @router.put("/mam/mam_id")
 def update_mam_id(
     mam_id: Annotated[str, Form()],
@@ -347,6 +347,7 @@ def update_mam_id(
     mam_config.set_mam_id(session, mam_id)
     return Response(status_code=204, headers={"HX-Refresh": "true"})
 
+
 @router.put("/mam/activate")
 def activate_mam(
     session: Annotated[Session, Depends(get_session)],
@@ -357,6 +358,7 @@ def activate_mam(
     mam_config.set_active(session, True)
     return Response(status_code=204, headers={"HX-Refresh": "true"})
 
+
 @router.put("/mam/deactivate")
 def deactivate_mam(
     session: Annotated[Session, Depends(get_session)],
@@ -368,7 +370,6 @@ def deactivate_mam(
     return Response(status_code=204, headers={"HX-Refresh": "true"})
 
 
-
 @router.get("/download")
 def read_download(
     request: Request,
@@ -26,9 +26,7 @@ from app.internal.prowlarr.prowlarr import (
     prowlarr_config,
     start_download,
 )
-from app.internal.mam.mam import (
-    mam_config
-)
+from app.internal.indexers.mam import mam_config
 from app.internal.query import query_sources
 from app.internal.ranking.quality import quality_config
 from app.util.auth import DetailedUser, get_authenticated_user
@@ -202,7 +200,7 @@ async def list_sources(
         {
             "book": result.book,
             "sources": result.sources,
-            "mam_active": mam_config.is_active(session)
+            "mam_active": mam_config.is_active(session),
         },
     )