# musicseerr/backend/repositories/coverart_artist.py

from __future__ import annotations

import asyncio
import logging
import re
from pathlib import Path
from typing import TYPE_CHECKING, TypeVar
from urllib.parse import quote

import httpx
import msgspec

from core.exceptions import ExternalServiceError, RateLimitedError
from infrastructure.cache.cache_keys import ARTIST_WIKIDATA_PREFIX
from infrastructure.cache.memory_cache import CacheInterface
from infrastructure.queue.priority_queue import RequestPriority
from infrastructure.resilience.retry import CircuitOpenError
from infrastructure.validators import validate_audiodb_image_url
from infrastructure.http.disconnect import DisconnectCallable, check_disconnected

if TYPE_CHECKING:
    from services.audiodb_image_service import AudioDBImageService
    from repositories.musicbrainz_repository import MusicBrainzRepository
    from repositories.lidarr import LidarrRepository
    from repositories.jellyfin_repository import JellyfinRepository

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Timeout, in seconds, passed to asyncio.wait_for around the combined
# Lidarr + Jellyfin lookup in ArtistImageFetcher.fetch_artist_image.
LOCAL_SOURCE_TIMEOUT_SECONDS = 1.0
# Type variable linking the ``decode_type`` argument of _decode_json_response
# to its return type.
T = TypeVar("T")
# User-Agent that ArtistImageFetcher uses for external requests when the caller
# supplies none, or supplies one that starts with "python-httpx".
DEFAULT_EXTERNAL_USER_AGENT = "Musicseerr/1.0 (contact@musicseerr.com; https://www.musicseerr.com)"


class TransientImageFetchError(Exception):
    """Raised when no image was obtained and at least one source failed transiently.

    ``ArtistImageFetcher.fetch_artist_image`` raises this instead of returning
    ``None`` so callers can tell "no image exists" apart from "a source was
    temporarily unavailable".
    """


# Exception types treated as transient failures. The per-source fetch helpers
# re-raise these instead of swallowing them, and fetch_artist_image records
# them so it can raise TransientImageFetchError when no source yields an image.
TRANSIENT_FETCH_EXCEPTIONS = (
    CircuitOpenError,
    httpx.TimeoutException,
    httpx.NetworkError,
    ExternalServiceError,
    RateLimitedError,
)


class _WikidataValue(msgspec.Struct):
    """``datavalue`` object of a Wikidata claim snak.

    For the P18 claims read in ``_fetch_from_wikidata``, ``value`` is used as
    the Wikimedia Commons file name.
    """

    value: str | None = None


class _WikidataSnak(msgspec.Struct):
    """``mainsnak`` object of a Wikidata claim; only ``datavalue`` is declared."""

    datavalue: _WikidataValue | None = None


class _WikidataClaim(msgspec.Struct):
    """A single claim entry from a ``wbgetclaims`` response."""

    mainsnak: _WikidataSnak | None = None


class _WikidataClaimsResponse(msgspec.Struct):
    """Top-level shape of the Wikidata ``wbgetclaims`` response that is decoded.

    ``claims`` maps a property id (``"P18"`` is the one looked up here) to its
    list of claims.
    """

    # NOTE(review): relies on msgspec accepting an empty ``{}`` default and
    # copying it per instance — confirm against the msgspec Struct docs.
    claims: dict[str, list[_WikidataClaim]] = {}


class _CommonsImageInfo(msgspec.Struct):
    url: str | None = None
    thumburl: str | None = None


class _CommonsPage(msgspec.Struct):
    """A page entry in a Commons query response; only ``imageinfo`` is declared."""

    # NOTE(review): relies on msgspec accepting an empty ``[]`` default and
    # copying it per instance — confirm against the msgspec Struct docs.
    imageinfo: list[_CommonsImageInfo] = []


class _CommonsQuery(msgspec.Struct):
    """The ``query`` object of a Commons response, holding pages keyed by string."""

    # NOTE(review): relies on msgspec accepting an empty ``{}`` default and
    # copying it per instance — confirm against the msgspec Struct docs.
    pages: dict[str, _CommonsPage] = {}


class _CommonsQueryResponse(msgspec.Struct):
    """Top-level shape of the Commons ``action=query`` response that is decoded.

    ``query`` may be absent, in which case callers treat the page set as empty.
    """

    query: _CommonsQuery | None = None


def _decode_json_response(response: httpx.Response, decode_type: type[T]) -> T:
    """Decode an HTTP response body into ``decode_type``.

    When ``response.content`` is a bytes-like object it is decoded directly
    with ``msgspec.json.decode``. Otherwise (for example a response stand-in
    without raw bytes) the already-parsed ``response.json()`` value is
    converted with ``msgspec.convert``.
    """
    raw_body = getattr(response, "content", None)
    if not isinstance(raw_body, (bytes, bytearray, memoryview)):
        # No raw bytes available: fall back to the parsed JSON object.
        return msgspec.convert(response.json(), type=decode_type)
    return msgspec.json.decode(raw_body, type=decode_type)


def _log_task_error(task: asyncio.Task) -> None:
    if not task.cancelled() and task.exception():
        logger.error(f"Background cache write failed: {task.exception()}")


def _is_valid_image_content_type(content_type: str) -> bool:
    if not content_type:
        return False
    base_type = content_type.split(";")[0].strip().lower()
    return base_type in frozenset([
        "image/jpeg", "image/jpg", "image/png", "image/gif",
        "image/webp", "image/avif", "image/svg+xml",
    ])


class ArtistImageFetcher:
    def __init__(
        self,
        http_get_fn,
        write_cache_fn,
        cache: CacheInterface,
        mb_repo: 'MusicBrainzRepository' | None = None,
        lidarr_repo: 'LidarrRepository' | None = None,
        jellyfin_repo: 'JellyfinRepository' | None = None,
        audiodb_service: 'AudioDBImageService' | None = None,
        user_agent: str | None = None,
    ):
        self._http_get = http_get_fn
        self._write_disk_cache = write_cache_fn
        self._cache = cache
        self._mb_repo = mb_repo
        self._lidarr_repo = lidarr_repo
        self._jellyfin_repo = jellyfin_repo
        self._audiodb_service = audiodb_service
        resolved_user_agent = user_agent
        if not resolved_user_agent or resolved_user_agent.lower().startswith("python-httpx"):
            resolved_user_agent = DEFAULT_EXTERNAL_USER_AGENT
        self._external_headers = {"User-Agent": resolved_user_agent}

    async def fetch_artist_image(
        self,
        artist_id: str,
        size: int | None,
        file_path: Path,
        priority: RequestPriority = RequestPriority.IMAGE_FETCH,
        is_disconnected: DisconnectCallable | None = None,
    ) -> tuple[bytes, str, str] | None:
        logger.info(f"[IMG] Fetching artist image for {artist_id[:8]}... (size={size})")
        result = None
        had_transient_failure = False
        last_transient_error: Exception | None = None
        try:
            await check_disconnected(is_disconnected)
            result = await self._fetch_from_audiodb(artist_id, file_path, priority=priority)
        except TRANSIENT_FETCH_EXCEPTIONS as exc:
            had_transient_failure = True
            last_transient_error = exc
            logger.warning(f"[IMG:AudioDB] Transient fetch failure for {artist_id[:8]}...: {exc}")
            result = None
        if result:
            logger.info(f"[IMG] SUCCESS from AudioDB for {artist_id[:8]}...")
            return result
        logger.info(f"[IMG] AudioDB failed for {artist_id[:8]}..., trying local sources")
        try:
            await check_disconnected(is_disconnected)
            local_result, local_transient = await asyncio.wait_for(
                self._fetch_local_sources(artist_id, size, file_path, priority=priority),
                timeout=LOCAL_SOURCE_TIMEOUT_SECONDS,
            )
            if local_transient:
                had_transient_failure = True
            result = local_result
        except TimeoutError:
            logger.debug(f"[IMG] Timed out local source lookup for {artist_id[:8]}...")
            had_transient_failure = True
            last_transient_error = TimeoutError(
                f"Timed out local source lookup for {artist_id}"
            )
        if result:
            logger.info(f"[IMG] SUCCESS from local source for {artist_id[:8]}...")
            return result
        logger.info(f"[IMG] Local sources missed for {artist_id[:8]}..., trying Wikidata")
        try:
            await check_disconnected(is_disconnected)
            result = await self._fetch_from_wikidata(artist_id, size, file_path, priority=priority)
        except TRANSIENT_FETCH_EXCEPTIONS as exc:
            had_transient_failure = True
            last_transient_error = exc
            logger.warning(f"[IMG] Transient Wikidata fetch failure for {artist_id[:8]}...: {exc}")
            result = None
        if result:
            logger.info(f"[IMG] SUCCESS from Wikidata for {artist_id[:8]}...")
            return result
        logger.info(f"[IMG] FAILED: No image found for {artist_id[:8]}... from any source")
        if had_transient_failure:
            raise TransientImageFetchError(
                f"Transient failure while fetching artist image for {artist_id}"
            ) from last_transient_error
        return None

    async def _fetch_local_sources(
        self,
        artist_id: str,
        size: int | None,
        file_path: Path,
        priority: RequestPriority = RequestPriority.IMAGE_FETCH,
    ) -> tuple[tuple[bytes, str, str] | None, bool]:
        had_transient_failure = False

        try:
            result = await self._fetch_from_lidarr(artist_id, size, file_path, priority=priority)
        except TRANSIENT_FETCH_EXCEPTIONS as exc:
            had_transient_failure = True
            logger.warning(f"[IMG:Lidarr] Transient failure for {artist_id[:8]}: {exc}")
            result = None

        if result:
            return result, had_transient_failure

        try:
            result = await self._fetch_from_jellyfin(artist_id, file_path, priority=priority)
        except TRANSIENT_FETCH_EXCEPTIONS as exc:
            had_transient_failure = True
            logger.warning(f"[IMG:Jellyfin] Transient failure for {artist_id[:8]}: {exc}")
            result = None

        return result, had_transient_failure

    async def _fetch_from_audiodb(
        self,
        artist_id: str,
        file_path: Path,
        priority: RequestPriority = RequestPriority.IMAGE_FETCH,
    ) -> tuple[bytes, str, str] | None:
        if self._audiodb_service is None:
            return None
        logger.debug(f"[IMG:AudioDB] Fetching artist image for {artist_id[:8]}...")
        try:
            images = await self._audiodb_service.fetch_and_cache_artist_images(artist_id)
            if images is None or images.is_negative or not images.thumb_url:
                return None
            if not validate_audiodb_image_url(images.thumb_url):
                logger.warning("[IMG:AudioDB] Rejected unsafe URL for artist %s", artist_id[:8])
                return None
            response = await self._http_get(
                images.thumb_url,
                priority,
                source="audiodb",
                headers=self._external_headers,
            )
            if response.status_code != 200:
                return None
            content_type = response.headers.get("content-type", "")
            if not _is_valid_image_content_type(content_type):
                logger.warning(f"[IMG:AudioDB] Non-image content-type ({content_type}) for {artist_id[:8]}")
                return None
            content = response.content
            task = asyncio.create_task(
                self._write_disk_cache(file_path, content, content_type, {"source": "audiodb"})
            )
            task.add_done_callback(_log_task_error)
            return (content, content_type, "audiodb")
        except TRANSIENT_FETCH_EXCEPTIONS:
            raise
        except Exception as e:  # noqa: BLE001
            logger.warning(f"[IMG:AudioDB] Exception for {artist_id[:8]}: {e}")
            return None

    async def _fetch_from_lidarr(
        self,
        artist_id: str,
        size: int | None,
        file_path: Path,
        priority: RequestPriority = RequestPriority.IMAGE_FETCH,
    ) -> tuple[bytes, str, str] | None:
        if not self._lidarr_repo:
            logger.debug(f"[IMG:Lidarr] No Lidarr repo configured for {artist_id[:8]}")
            return None
        if not self._lidarr_repo.is_configured():
            return None
        try:
            image_url = await self._lidarr_repo.get_artist_image_url(artist_id, size=size or 250)
            if not image_url:
                logger.info(f"[IMG:Lidarr] No image URL returned for {artist_id[:8]}")
                return None
            logger.info(f"[IMG:Lidarr] Fetching from URL for {artist_id[:8]}...")
            response = await self._http_get(
                image_url,
                priority,
                source="lidarr",
            )
            if response.status_code != 200:
                logger.warning(f"[IMG:Lidarr] HTTP {response.status_code} for {artist_id[:8]}")
                return None
            content_type = response.headers.get("content-type", "")
            if not _is_valid_image_content_type(content_type):
                logger.warning(f"[IMG:Lidarr] Non-image content-type ({content_type}) for {artist_id[:8]}")
                return None
            content = response.content
            task = asyncio.create_task(self._write_disk_cache(file_path, content, content_type, {"source": "lidarr"}))
            task.add_done_callback(_log_task_error)
            return (content, content_type, "lidarr")
        except TRANSIENT_FETCH_EXCEPTIONS:
            raise
        except Exception as e:  # noqa: BLE001
            logger.warning(f"[IMG:Lidarr] Exception for {artist_id[:8]}: {e}")
            return None

    async def _fetch_from_jellyfin(
        self,
        artist_id: str,
        file_path: Path,
        priority: RequestPriority = RequestPriority.IMAGE_FETCH,
    ) -> tuple[bytes, str, str] | None:
        if not self._jellyfin_repo or not self._jellyfin_repo.is_configured():
            return None
        try:
            artist = await self._jellyfin_repo.get_artist_by_mbid(artist_id)
            if not artist:
                return None
            image_url = self._jellyfin_repo.get_image_url(artist.id, artist.image_tag)
            if not image_url:
                return None
            response = await self._http_get(
                image_url,
                priority,
                source="jellyfin",
                headers=self._jellyfin_repo.get_auth_headers(),
            )
            if response.status_code != 200:
                return None
            content_type = response.headers.get("content-type", "")
            if not _is_valid_image_content_type(content_type):
                logger.warning(f"[IMG:Jellyfin] Non-image content-type ({content_type}) for {artist_id[:8]}")
                return None
            content = response.content
            task = asyncio.create_task(
                self._write_disk_cache(file_path, content, content_type, {"source": "jellyfin"})
            )
            task.add_done_callback(_log_task_error)
            return (content, content_type, "jellyfin")
        except TRANSIENT_FETCH_EXCEPTIONS:
            raise
        except Exception as e:  # noqa: BLE001
            logger.warning(f"[IMG:Jellyfin] Exception for {artist_id[:8]}: {e}")
            return None

    async def _fetch_from_wikidata(
        self,
        artist_id: str,
        size: int | None,
        file_path: Path,
        priority: RequestPriority = RequestPriority.IMAGE_FETCH,
    ) -> tuple[bytes, str, str] | None:
        cache_key = f"{ARTIST_WIKIDATA_PREFIX}{artist_id}"
        wikidata_url = await self._cache.get(cache_key)
        if wikidata_url is None:
            wikidata_url = await self._lookup_wikidata_url(artist_id)
            if wikidata_url:
                await self._cache.set(cache_key, wikidata_url, ttl_seconds=86400)
        if not wikidata_url:
            return None
        try:
            match = re.search(r'/(?:wiki|entity)/(Q\d+)', wikidata_url)
            wikidata_id = match.group(1) if match else None
            if not wikidata_id:
                logger.debug(f"Could not parse Wikidata Q-id from URL: {wikidata_url}")
                return None
            api_url = (
                f"https://www.wikidata.org/w/api.php"
                f"?action=wbgetclaims&entity={wikidata_id}&property=P18&format=json"
            )
            response = await self._http_get(
                api_url,
                priority,
                source="wikidata",
                headers=self._external_headers,
            )
            if response.status_code != 200:
                return None
            data = _decode_json_response(response, _WikidataClaimsResponse)
            image_claims = data.claims.get("P18", [])
            if not image_claims:
                return None
            first_claim = image_claims[0]
            filename = (
                first_claim.mainsnak.datavalue.value
                if first_claim.mainsnak and first_claim.mainsnak.datavalue
                else None
            )
            if not filename:
                return None
            commons_api = (
                f"https://commons.wikimedia.org/w/api.php"
                f"?action=query&titles=File:{quote(filename)}"
                f"&prop=imageinfo&iiprop=url&format=json"
            )
            if size:
                commons_api += f"&iiurlwidth={size}"
            commons_response = await self._http_get(
                commons_api,
                priority,
                source="wikimedia",
                headers=self._external_headers,
            )
            if commons_response.status_code != 200:
                return None
            commons_data = _decode_json_response(commons_response, _CommonsQueryResponse)
            pages = commons_data.query.pages if commons_data.query else {}
            image_url = None
            for page in pages.values():
                imageinfo = page.imageinfo
                if imageinfo:
                    if size and imageinfo[0].thumburl:
                        image_url = imageinfo[0].thumburl
                    else:
                        image_url = imageinfo[0].url
                    break
            if not image_url:
                return None
            response = await self._http_get(
                image_url,
                priority,
                source="wikimedia",
                headers=self._external_headers,
            )
            if response.status_code == 200:
                content_type = response.headers.get("content-type", "")
                if not _is_valid_image_content_type(content_type):
                    logger.warning(f"[IMG:Wikidata] Non-image content-type ({content_type})")
                    return None
                content = response.content
                task = asyncio.create_task(
                    self._write_disk_cache(
                        file_path,
                        content,
                        content_type,
                        {"wikidata_id": wikidata_id, "source": "wikidata"},
                    )
                )
                task.add_done_callback(_log_task_error)
                return (content, content_type, "wikidata")
        except TRANSIENT_FETCH_EXCEPTIONS:
            raise
        except Exception as e:  # noqa: BLE001
            logger.error(f"Error fetching artist image for {artist_id}: {e}")
        return None

    async def _lookup_wikidata_url(self, artist_id: str) -> str | None:
        logger.info(f"[IMG:Wikidata] Looking up wikidata URL for {artist_id[:8]}...")
        if not self._mb_repo:
            logger.warning(f"[IMG:Wikidata] MusicBrainz repository not available for {artist_id}")
            return None
        try:
            artist_data = await self._mb_repo.get_artist_relations(artist_id)
            if not artist_data:
                logger.info(f"[IMG:Wikidata] No artist data from MB for {artist_id[:8]}")
                return None
            url_relations = artist_data.get("relations", [])
            if url_relations:
                for url_rel in url_relations:
                    if isinstance(url_rel, dict):
                        typ = url_rel.get("type") or url_rel.get("link_type")
                        url_obj = url_rel.get("url", {})
                        target = url_obj.get("resource", "") if isinstance(url_obj, dict) else ""
                        if typ == "wikidata" and target:
                            logger.info(f"[IMG:Wikidata] Found URL for {artist_id[:8]}: {target}")
                            return target
            external_links = artist_data.get("external_links") or artist_data.get("external_links_list")
            if external_links:
                for ext in external_links:
                    try:
                        ext_type = getattr(ext, "type", None) if not isinstance(ext, dict) else ext.get("type")
                        ext_url = getattr(ext, "url", None) if not isinstance(ext, dict) else ext.get("url")
                    except (AttributeError, TypeError):
                        ext_type = None
                        ext_url = None
                    if ext_type == "wikidata" and ext_url:
                        return ext_url
            logger.info(f"[IMG:Wikidata] No wikidata link found for {artist_id[:8]}")
            return None
        except TRANSIENT_FETCH_EXCEPTIONS:
            raise
        except Exception as e:  # noqa: BLE001
            logger.error(f"[IMG:Wikidata] Failed to fetch artist metadata for {artist_id}: {e}")
            return None