# File: musicseerr/backend/repositories/coverart_artist.py
# Snapshot: 2026-04-03 15:53:00 +01:00 (472 lines, 19 KiB, Python)

from __future__ import annotations
import asyncio
import logging
import re
from pathlib import Path
from typing import TYPE_CHECKING, TypeVar
from urllib.parse import quote
import httpx
import msgspec
from core.exceptions import ExternalServiceError, RateLimitedError
from infrastructure.cache.cache_keys import ARTIST_WIKIDATA_PREFIX
from infrastructure.cache.memory_cache import CacheInterface
from infrastructure.queue.priority_queue import RequestPriority
from infrastructure.resilience.retry import CircuitOpenError
from infrastructure.validators import validate_audiodb_image_url
from infrastructure.http.disconnect import DisconnectCallable, check_disconnected
if TYPE_CHECKING:
from services.audiodb_image_service import AudioDBImageService
from repositories.musicbrainz_repository import MusicBrainzRepository
from repositories.lidarr import LidarrRepository
from repositories.jellyfin_repository import JellyfinRepository
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Timeout, in seconds, applied to the combined local-source lookup
# (Lidarr, then Jellyfin) performed by ArtistImageFetcher.fetch_artist_image.
LOCAL_SOURCE_TIMEOUT_SECONDS = 1.0
# Generic type variable tying _decode_json_response's return type to the
# ``decode_type`` argument it is given.
T = TypeVar("T")
# User-Agent sent to external services when the caller supplies none, or
# supplies one that starts with "python-httpx".
DEFAULT_EXTERNAL_USER_AGENT = "Musicseerr/1.0 (contact@musicseerr.com; https://www.musicseerr.com)"
class TransientImageFetchError(Exception):
    """Signal that an image lookup found nothing but hit a transient failure.

    Raised by ``ArtistImageFetcher.fetch_artist_image`` when no source
    produced an image and at least one source failed with a transient error,
    so callers can tell this outcome apart from a plain ``None`` result.
    """
# Exception types treated as transient fetch failures. The per-source
# fetchers re-raise these instead of swallowing them, and
# ArtistImageFetcher.fetch_artist_image records them and raises
# TransientImageFetchError if no source ends up producing an image.
TRANSIENT_FETCH_EXCEPTIONS = (
CircuitOpenError,
httpx.TimeoutException,
httpx.NetworkError,
ExternalServiceError,
RateLimitedError,
)
class _WikidataValue(msgspec.Struct):
    """``datavalue`` object of a Wikidata snak; only ``value`` is decoded."""

    # For the P18 claims read in this module, this is used as a Commons file name.
    value: str | None = None
class _WikidataSnak(msgspec.Struct):
    """``mainsnak`` object of a Wikidata claim; only ``datavalue`` is decoded."""

    datavalue: _WikidataValue | None = None
class _WikidataClaim(msgspec.Struct):
    """One claim entry of a ``wbgetclaims`` response; only ``mainsnak`` is decoded."""

    mainsnak: _WikidataSnak | None = None
class _WikidataClaimsResponse(msgspec.Struct):
    """Top-level body of a Wikidata ``wbgetclaims`` API response."""

    # Claims keyed by property id (this module reads "P18"); empty when absent.
    claims: dict[str, list[_WikidataClaim]] = msgspec.field(default_factory=dict)
class _CommonsImageInfo(msgspec.Struct):
    """One ``imageinfo`` entry of a Wikimedia Commons query response."""

    # URL of the file itself.
    url: str | None = None
    # URL of a thumbnail; this module prefers it when a size was requested.
    thumburl: str | None = None
class _CommonsPage(msgspec.Struct):
    """One page entry of a Wikimedia Commons query response."""

    # Image descriptors for the page; empty when the response has none.
    imageinfo: list[_CommonsImageInfo] = msgspec.field(default_factory=list)
class _CommonsQuery(msgspec.Struct):
    """``query`` object of a Wikimedia Commons API response."""

    # Pages keyed by the string key used in the response; empty when absent.
    pages: dict[str, _CommonsPage] = msgspec.field(default_factory=dict)
class _CommonsQueryResponse(msgspec.Struct):
    """Top-level body of a Wikimedia Commons ``action=query`` response."""

    # ``None`` when the response carries no ``query`` object.
    query: _CommonsQuery | None = None
def _decode_json_response(response: httpx.Response, decode_type: type[T]) -> T:
    """Decode the JSON body of ``response`` into an instance of ``decode_type``.

    Args:
        response: Response object; its ``content`` attribute is used when it
            holds raw bytes, otherwise ``response.json()`` is called.
        decode_type: Target type handed to msgspec.

    Returns:
        The decoded and validated ``decode_type`` value.
    """
    raw_body = getattr(response, "content", None)
    if not isinstance(raw_body, (bytes, bytearray, memoryview)):
        # No byte payload available (presumably a stand-in response object):
        # convert the already-parsed JSON structure instead.
        return msgspec.convert(response.json(), type=decode_type)
    # Fast path: decode and validate straight from the raw bytes.
    return msgspec.json.decode(raw_body, type=decode_type)
def _log_task_error(task: asyncio.Task) -> None:
    """Done-callback that logs the failure of a background cache write.

    Cancelled tasks and tasks that finished without an exception are ignored.

    Args:
        task: The finished task this callback was attached to.
    """
    if task.cancelled():
        # Calling exception() on a cancelled task would raise CancelledError.
        return
    exc = task.exception()
    if exc is not None:
        # exc_info keeps the traceback of the failed write in the log record;
        # the rendered message text is unchanged.
        logger.error("Background cache write failed: %s", exc, exc_info=exc)
def _is_valid_image_content_type(content_type: str) -> bool:
if not content_type:
return False
base_type = content_type.split(";")[0].strip().lower()
return base_type in frozenset([
"image/jpeg", "image/jpg", "image/png", "image/gif",
"image/webp", "image/avif", "image/svg+xml",
])
class ArtistImageFetcher:
    """Fetch an artist image by trying several sources in a fixed order.

    ``fetch_artist_image`` tries AudioDB first, then the local sources
    (Lidarr, then Jellyfin) under ``LOCAL_SOURCE_TIMEOUT_SECONDS``, and
    finally Wikidata / Wikimedia Commons. Every successful download is
    returned as ``(content, content_type, source)`` and is also handed to
    ``write_cache_fn`` in a background task.
    """

    def __init__(
        self,
        http_get_fn,
        write_cache_fn,
        cache: CacheInterface,
        mb_repo: "MusicBrainzRepository | None" = None,
        lidarr_repo: "LidarrRepository | None" = None,
        jellyfin_repo: "JellyfinRepository | None" = None,
        audiodb_service: "AudioDBImageService | None" = None,
        user_agent: str | None = None,
    ):
        """Store collaborators and resolve the outbound User-Agent header.

        Args:
            http_get_fn: Awaitable callable invoked as
                ``http_get_fn(url, priority, source=..., headers=...)``
                (``headers`` is omitted for Lidarr); its result must expose
                ``status_code``, ``headers`` and ``content``.
            write_cache_fn: Coroutine function invoked as
                ``write_cache_fn(file_path, content, content_type, metadata)``.
            cache: Async key/value cache used for the resolved Wikidata URL.
            mb_repo: Source of artist relations for the Wikidata lookup;
                without it the Wikidata stage finds nothing.
            lidarr_repo: Optional Lidarr source.
            jellyfin_repo: Optional Jellyfin source.
            audiodb_service: Optional AudioDB source.
            user_agent: User-Agent for external requests. A missing value, or
                one starting with ``python-httpx``, is replaced by
                ``DEFAULT_EXTERNAL_USER_AGENT``.
        """
        self._http_get = http_get_fn
        self._write_disk_cache = write_cache_fn
        self._cache = cache
        self._mb_repo = mb_repo
        self._lidarr_repo = lidarr_repo
        self._jellyfin_repo = jellyfin_repo
        self._audiodb_service = audiodb_service
        # Strong references to in-flight cache-write tasks. The event loop
        # only keeps weak references to tasks, so an otherwise unreferenced
        # task may be garbage-collected before it completes.
        self._background_tasks: set[asyncio.Task] = set()
        resolved_user_agent = user_agent
        if not resolved_user_agent or resolved_user_agent.lower().startswith("python-httpx"):
            resolved_user_agent = DEFAULT_EXTERNAL_USER_AGENT
        self._external_headers = {"User-Agent": resolved_user_agent}

    def _schedule_cache_write(
        self,
        file_path: Path,
        content: bytes,
        content_type: str,
        metadata: dict[str, str],
    ) -> None:
        """Start a background disk-cache write and keep the task alive until done."""
        task = asyncio.create_task(
            self._write_disk_cache(file_path, content, content_type, metadata)
        )
        self._background_tasks.add(task)
        # Drop the strong reference once the task finishes, then log failures.
        task.add_done_callback(self._background_tasks.discard)
        task.add_done_callback(_log_task_error)

    async def fetch_artist_image(
        self,
        artist_id: str,
        size: int | None,
        file_path: Path,
        priority: RequestPriority = RequestPriority.IMAGE_FETCH,
        is_disconnected: DisconnectCallable | None = None,
    ) -> tuple[bytes, str, str] | None:
        """Return the first artist image found across all configured sources.

        Args:
            artist_id: Artist identifier passed to every source.
            size: Requested image size; forwarded to Lidarr and Wikimedia
                Commons (AudioDB and Jellyfin do not receive it).
            file_path: Destination handed to the disk-cache writer.
            priority: Request priority forwarded to the HTTP getter.
            is_disconnected: Passed to ``check_disconnected`` before each stage.

        Returns:
            ``(content, content_type, source)`` on success, or ``None`` when
            no source has an image and no transient failure occurred.

        Raises:
            TransientImageFetchError: No image was found and at least one
                stage failed transiently; chained from the last recorded
                transient error, if any.
        """
        logger.info(f"[IMG] Fetching artist image for {artist_id[:8]}... (size={size})")
        result = None
        had_transient_failure = False
        last_transient_error: Exception | None = None

        # Stage 1: AudioDB.
        try:
            await check_disconnected(is_disconnected)
            result = await self._fetch_from_audiodb(artist_id, file_path, priority=priority)
        except TRANSIENT_FETCH_EXCEPTIONS as exc:
            had_transient_failure = True
            last_transient_error = exc
            logger.warning(f"[IMG:AudioDB] Transient fetch failure for {artist_id[:8]}...: {exc}")
            result = None
        if result:
            logger.info(f"[IMG] SUCCESS from AudioDB for {artist_id[:8]}...")
            return result
        logger.info(f"[IMG] AudioDB failed for {artist_id[:8]}..., trying local sources")

        # Stage 2: local sources, bounded by one shared timeout.
        try:
            await check_disconnected(is_disconnected)
            local_result, local_transient = await asyncio.wait_for(
                self._fetch_local_sources(artist_id, size, file_path, priority=priority),
                timeout=LOCAL_SOURCE_TIMEOUT_SECONDS,
            )
            if local_transient:
                had_transient_failure = True
            result = local_result
        except (asyncio.TimeoutError, TimeoutError):
            # asyncio.TimeoutError is only an alias of the builtin from
            # Python 3.11 on; catching both keeps older interpreters working.
            logger.debug(f"[IMG] Timed out local source lookup for {artist_id[:8]}...")
            had_transient_failure = True
            last_transient_error = TimeoutError(
                f"Timed out local source lookup for {artist_id}"
            )
        if result:
            logger.info(f"[IMG] SUCCESS from local source for {artist_id[:8]}...")
            return result
        logger.info(f"[IMG] Local sources missed for {artist_id[:8]}..., trying Wikidata")

        # Stage 3: Wikidata / Wikimedia Commons.
        try:
            await check_disconnected(is_disconnected)
            result = await self._fetch_from_wikidata(artist_id, size, file_path, priority=priority)
        except TRANSIENT_FETCH_EXCEPTIONS as exc:
            had_transient_failure = True
            last_transient_error = exc
            logger.warning(f"[IMG] Transient Wikidata fetch failure for {artist_id[:8]}...: {exc}")
            result = None
        if result:
            logger.info(f"[IMG] SUCCESS from Wikidata for {artist_id[:8]}...")
            return result

        logger.info(f"[IMG] FAILED: No image found for {artist_id[:8]}... from any source")
        if had_transient_failure:
            # Distinguish "nothing exists" from "could not check right now".
            raise TransientImageFetchError(
                f"Transient failure while fetching artist image for {artist_id}"
            ) from last_transient_error
        return None

    async def _fetch_local_sources(
        self,
        artist_id: str,
        size: int | None,
        file_path: Path,
        priority: RequestPriority = RequestPriority.IMAGE_FETCH,
    ) -> tuple[tuple[bytes, str, str] | None, bool]:
        """Try Lidarr, then Jellyfin.

        Returns:
            ``(result, had_transient_failure)`` where ``result`` is the image
            tuple or ``None`` and the flag reports whether either source
            raised a transient error.
        """
        had_transient_failure = False
        try:
            result = await self._fetch_from_lidarr(artist_id, size, file_path, priority=priority)
        except TRANSIENT_FETCH_EXCEPTIONS as exc:
            had_transient_failure = True
            logger.warning(f"[IMG:Lidarr] Transient failure for {artist_id[:8]}: {exc}")
            result = None
        if result:
            return result, had_transient_failure
        try:
            result = await self._fetch_from_jellyfin(artist_id, file_path, priority=priority)
        except TRANSIENT_FETCH_EXCEPTIONS as exc:
            had_transient_failure = True
            logger.warning(f"[IMG:Jellyfin] Transient failure for {artist_id[:8]}: {exc}")
            result = None
        return result, had_transient_failure

    async def _fetch_from_audiodb(
        self,
        artist_id: str,
        file_path: Path,
        priority: RequestPriority = RequestPriority.IMAGE_FETCH,
    ) -> tuple[bytes, str, str] | None:
        """Download the AudioDB thumbnail for ``artist_id``.

        Returns:
            ``(content, content_type, "audiodb")``, or ``None`` when AudioDB
            is not configured, has no usable thumbnail URL, the URL fails
            validation, or the download is not a 200 image response.

        Raises:
            Any of ``TRANSIENT_FETCH_EXCEPTIONS``; other exceptions are
            logged and turned into ``None``.
        """
        if self._audiodb_service is None:
            return None
        logger.debug(f"[IMG:AudioDB] Fetching artist image for {artist_id[:8]}...")
        try:
            images = await self._audiodb_service.fetch_and_cache_artist_images(artist_id)
            if images is None or images.is_negative or not images.thumb_url:
                return None
            if not validate_audiodb_image_url(images.thumb_url):
                logger.warning("[IMG:AudioDB] Rejected unsafe URL for artist %s", artist_id[:8])
                return None
            response = await self._http_get(
                images.thumb_url,
                priority,
                source="audiodb",
                headers=self._external_headers,
            )
            if response.status_code != 200:
                return None
            content_type = response.headers.get("content-type", "")
            if not _is_valid_image_content_type(content_type):
                logger.warning(f"[IMG:AudioDB] Non-image content-type ({content_type}) for {artist_id[:8]}")
                return None
            content = response.content
            self._schedule_cache_write(file_path, content, content_type, {"source": "audiodb"})
            return (content, content_type, "audiodb")
        except TRANSIENT_FETCH_EXCEPTIONS:
            raise
        except Exception as e:  # noqa: BLE001
            logger.warning(f"[IMG:AudioDB] Exception for {artist_id[:8]}: {e}")
            return None

    async def _fetch_from_lidarr(
        self,
        artist_id: str,
        size: int | None,
        file_path: Path,
        priority: RequestPriority = RequestPriority.IMAGE_FETCH,
    ) -> tuple[bytes, str, str] | None:
        """Download the artist image that Lidarr points to.

        A falsy ``size`` is replaced by 250 when asking Lidarr for the URL.

        Returns:
            ``(content, content_type, "lidarr")``, or ``None`` when Lidarr is
            absent or unconfigured, returns no URL, or the download is not a
            200 image response.

        Raises:
            Any of ``TRANSIENT_FETCH_EXCEPTIONS``; other exceptions are
            logged and turned into ``None``.
        """
        if not self._lidarr_repo:
            logger.debug(f"[IMG:Lidarr] No Lidarr repo configured for {artist_id[:8]}")
            return None
        if not self._lidarr_repo.is_configured():
            return None
        try:
            image_url = await self._lidarr_repo.get_artist_image_url(artist_id, size=size or 250)
            if not image_url:
                logger.info(f"[IMG:Lidarr] No image URL returned for {artist_id[:8]}")
                return None
            logger.info(f"[IMG:Lidarr] Fetching from URL for {artist_id[:8]}...")
            response = await self._http_get(
                image_url,
                priority,
                source="lidarr",
            )
            if response.status_code != 200:
                logger.warning(f"[IMG:Lidarr] HTTP {response.status_code} for {artist_id[:8]}")
                return None
            content_type = response.headers.get("content-type", "")
            if not _is_valid_image_content_type(content_type):
                logger.warning(f"[IMG:Lidarr] Non-image content-type ({content_type}) for {artist_id[:8]}")
                return None
            content = response.content
            self._schedule_cache_write(file_path, content, content_type, {"source": "lidarr"})
            return (content, content_type, "lidarr")
        except TRANSIENT_FETCH_EXCEPTIONS:
            raise
        except Exception as e:  # noqa: BLE001
            logger.warning(f"[IMG:Lidarr] Exception for {artist_id[:8]}: {e}")
            return None

    async def _fetch_from_jellyfin(
        self,
        artist_id: str,
        file_path: Path,
        priority: RequestPriority = RequestPriority.IMAGE_FETCH,
    ) -> tuple[bytes, str, str] | None:
        """Download the artist image stored in Jellyfin.

        The artist is looked up via ``get_artist_by_mbid`` and the request is
        sent with the repository's auth headers.

        Returns:
            ``(content, content_type, "jellyfin")``, or ``None`` when Jellyfin
            is absent or unconfigured, the artist or its image URL is missing,
            or the download is not a 200 image response.

        Raises:
            Any of ``TRANSIENT_FETCH_EXCEPTIONS``; other exceptions are
            logged and turned into ``None``.
        """
        if not self._jellyfin_repo or not self._jellyfin_repo.is_configured():
            return None
        try:
            artist = await self._jellyfin_repo.get_artist_by_mbid(artist_id)
            if not artist:
                return None
            image_url = self._jellyfin_repo.get_image_url(artist.id, artist.image_tag)
            if not image_url:
                return None
            response = await self._http_get(
                image_url,
                priority,
                source="jellyfin",
                headers=self._jellyfin_repo.get_auth_headers(),
            )
            if response.status_code != 200:
                return None
            content_type = response.headers.get("content-type", "")
            if not _is_valid_image_content_type(content_type):
                logger.warning(f"[IMG:Jellyfin] Non-image content-type ({content_type}) for {artist_id[:8]}")
                return None
            content = response.content
            self._schedule_cache_write(file_path, content, content_type, {"source": "jellyfin"})
            return (content, content_type, "jellyfin")
        except TRANSIENT_FETCH_EXCEPTIONS:
            raise
        except Exception as e:  # noqa: BLE001
            logger.warning(f"[IMG:Jellyfin] Exception for {artist_id[:8]}: {e}")
            return None

    async def _fetch_from_wikidata(
        self,
        artist_id: str,
        size: int | None,
        file_path: Path,
        priority: RequestPriority = RequestPriority.IMAGE_FETCH,
    ) -> tuple[bytes, str, str] | None:
        """Download the artist image referenced by the Wikidata P18 claim.

        Steps: resolve the artist's Wikidata URL (cached for 86400 seconds
        when found), read the entity's P18 claim, ask Wikimedia Commons for
        the file URL (a thumbnail of width ``size`` when given), then
        download the image.

        Returns:
            ``(content, content_type, "wikidata")``, or ``None`` when any
            step yields nothing usable.

        Raises:
            Any of ``TRANSIENT_FETCH_EXCEPTIONS``; other exceptions raised
            after the URL has been resolved are logged and turned into
            ``None``.
        """
        cache_key = f"{ARTIST_WIKIDATA_PREFIX}{artist_id}"
        wikidata_url = await self._cache.get(cache_key)
        if wikidata_url is None:
            wikidata_url = await self._lookup_wikidata_url(artist_id)
            if wikidata_url:
                await self._cache.set(cache_key, wikidata_url, ttl_seconds=86400)
        if not wikidata_url:
            return None
        try:
            match = re.search(r'/(?:wiki|entity)/(Q\d+)', wikidata_url)
            wikidata_id = match.group(1) if match else None
            if not wikidata_id:
                logger.debug(f"Could not parse Wikidata Q-id from URL: {wikidata_url}")
                return None

            # Step 1: read the P18 (image) claim of the entity.
            api_url = (
                f"https://www.wikidata.org/w/api.php"
                f"?action=wbgetclaims&entity={wikidata_id}&property=P18&format=json"
            )
            claims_response = await self._http_get(
                api_url,
                priority,
                source="wikidata",
                headers=self._external_headers,
            )
            if claims_response.status_code != 200:
                return None
            data = _decode_json_response(claims_response, _WikidataClaimsResponse)
            image_claims = data.claims.get("P18", [])
            if not image_claims:
                return None
            first_claim = image_claims[0]
            filename = (
                first_claim.mainsnak.datavalue.value
                if first_claim.mainsnak and first_claim.mainsnak.datavalue
                else None
            )
            if not filename:
                return None

            # Step 2: resolve the Commons file name to a download URL.
            commons_api = (
                f"https://commons.wikimedia.org/w/api.php"
                f"?action=query&titles=File:{quote(filename)}"
                f"&prop=imageinfo&iiprop=url&format=json"
            )
            if size:
                commons_api += f"&iiurlwidth={size}"
            commons_response = await self._http_get(
                commons_api,
                priority,
                source="wikimedia",
                headers=self._external_headers,
            )
            if commons_response.status_code != 200:
                return None
            commons_data = _decode_json_response(commons_response, _CommonsQueryResponse)
            pages = commons_data.query.pages if commons_data.query else {}
            image_url = None
            for page in pages.values():
                imageinfo = page.imageinfo
                if imageinfo:
                    # Prefer the scaled thumbnail when a size was requested.
                    if size and imageinfo[0].thumburl:
                        image_url = imageinfo[0].thumburl
                    else:
                        image_url = imageinfo[0].url
                    break
            if not image_url:
                return None

            # Step 3: download the image itself.
            image_response = await self._http_get(
                image_url,
                priority,
                source="wikimedia",
                headers=self._external_headers,
            )
            if image_response.status_code != 200:
                return None
            content_type = image_response.headers.get("content-type", "")
            if not _is_valid_image_content_type(content_type):
                logger.warning(f"[IMG:Wikidata] Non-image content-type ({content_type})")
                return None
            content = image_response.content
            self._schedule_cache_write(
                file_path,
                content,
                content_type,
                {"wikidata_id": wikidata_id, "source": "wikidata"},
            )
            return (content, content_type, "wikidata")
        except TRANSIENT_FETCH_EXCEPTIONS:
            raise
        except Exception as e:  # noqa: BLE001
            logger.error(f"Error fetching artist image for {artist_id}: {e}")
            return None

    async def _lookup_wikidata_url(self, artist_id: str) -> str | None:
        """Find the artist's Wikidata URL in MusicBrainz relation data.

        Looks first at ``relations`` entries whose ``type`` (or
        ``link_type``) is ``"wikidata"``, then at ``external_links`` /
        ``external_links_list`` entries, which may be dicts or objects with
        ``type`` and ``url`` attributes.

        Returns:
            The Wikidata URL, or ``None`` when no repository is configured,
            no data is returned, or no Wikidata link is present.

        Raises:
            Any of ``TRANSIENT_FETCH_EXCEPTIONS``; other exceptions are
            logged and turned into ``None``.
        """
        logger.info(f"[IMG:Wikidata] Looking up wikidata URL for {artist_id[:8]}...")
        if not self._mb_repo:
            logger.warning(f"[IMG:Wikidata] MusicBrainz repository not available for {artist_id}")
            return None
        try:
            artist_data = await self._mb_repo.get_artist_relations(artist_id)
            if not artist_data:
                logger.info(f"[IMG:Wikidata] No artist data from MB for {artist_id[:8]}")
                return None

            url_relations = artist_data.get("relations", [])
            for url_rel in url_relations or ():
                if not isinstance(url_rel, dict):
                    continue
                typ = url_rel.get("type") or url_rel.get("link_type")
                url_obj = url_rel.get("url", {})
                target = url_obj.get("resource", "") if isinstance(url_obj, dict) else ""
                if typ == "wikidata" and target:
                    logger.info(f"[IMG:Wikidata] Found URL for {artist_id[:8]}: {target}")
                    return target

            external_links = artist_data.get("external_links") or artist_data.get("external_links_list")
            for ext in external_links or ():
                try:
                    ext_type = getattr(ext, "type", None) if not isinstance(ext, dict) else ext.get("type")
                    ext_url = getattr(ext, "url", None) if not isinstance(ext, dict) else ext.get("url")
                except (AttributeError, TypeError):
                    ext_type = None
                    ext_url = None
                if ext_type == "wikidata" and ext_url:
                    return ext_url

            logger.info(f"[IMG:Wikidata] No wikidata link found for {artist_id[:8]}")
            return None
        except TRANSIENT_FETCH_EXCEPTIONS:
            raise
        except Exception as e:  # noqa: BLE001
            logger.error(f"[IMG:Wikidata] Failed to fetch artist metadata for {artist_id}: {e}")
            return None