Files
ViewIT/addon/plugins/hdfilme_plugin.py

464 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""HDFilme Plugin für ViewIT.
HTML-Scraping von hdfilme-tv.cc (ehemals hdfilme.garden).
Filme und Serien, Hoster-Auflösung via ResolveURL.
"""
from __future__ import annotations
import re
from typing import Any, Callable, List, Optional
from urllib.parse import quote_plus
try: # pragma: no cover
import requests
from bs4 import BeautifulSoup
except ImportError as exc: # pragma: no cover
requests = None
BeautifulSoup = None
REQUESTS_AVAILABLE = False
REQUESTS_IMPORT_ERROR = exc
else:
REQUESTS_AVAILABLE = True
REQUESTS_IMPORT_ERROR = None
from plugin_interface import BasisPlugin
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
BASE_URL = "https://hdfilme-tv.cc"  # root of the scraped site
DEFAULT_TIMEOUT = 20  # seconds per HTTP request
# Browser-like headers so the site serves its regular HTML pages.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "de-DE,de;q=0.9,en;q=0.8",
    "Connection": "keep-alive",
}
# URL templates: search ({query} is URL-encoded), newest movies, series listing.
_URL_SEARCH = BASE_URL + "/?do=search&subaction=search&story={query}"
_URL_NEW = BASE_URL + "/kinofilme-online/"
_URL_SERIES = BASE_URL + "/serienstream-deutsch/"
# Genre display name -> URL path slug on the site
GENRE_SLUGS: dict[str, str] = {
    "Abenteuer": "abenteuer",
    "Action": "action",
    "Animation": "animation",
    "Biographie": "biographie",
    "Dokumentation": "dokumentation",
    "Drama": "drama",
    "Erotik": "erotikfilme",
    "Familie": "familie",
    "Fantasy": "fantasy",
    "Historienfilm": "historien",
    "Horror": "horror",
    "Komödie": "komodie",
    "Krieg": "krieg",
    "Krimi": "krimi",
    "Musikfilm": "musikfilme",
    "Mystery": "mystery",
    "Romantik": "romantik",
    "Sci-Fi": "sci-fi",
    "Sport": "sport",
    "Thriller": "thriller",
    "Western": "western",
}
# Hoster links that are skipped (no real stream / trailer only)
_SKIP_LINK_KEYWORDS = ("youtube.com", "youtu.be", "hdfilme-tv.cc")
# Optional progress callback: (message, percent-or-None) -> Any
ProgressCallback = Optional[Callable[[str, Optional[int]], Any]]
# ---------------------------------------------------------------------------
# Hilfsfunktionen
# ---------------------------------------------------------------------------
def _absolute_url(url: str) -> str:
"""Macht eine relative oder protokoll-relative URL absolut."""
url = (url or "").strip()
if url.startswith("//"):
return "https:" + url
if url.startswith("/"):
return BASE_URL + url
return url
def _clean_title(raw: str) -> str:
"""Bereinigt einen Rohtitel von Seiten-Suffixen."""
title = (raw or "").strip()
for suffix in (" stream", " Stream", " kostenlos", " Deutsch", " German", " online"):
if title.endswith(suffix):
title = title[: -len(suffix)].strip()
return title
def _get_soup(url: str) -> Any:
    """Fetch *url* and parse it with BeautifulSoup; return None on any failure."""
    if requests is None or BeautifulSoup is None:
        # Dependencies missing -- the plugin reports this via is_available.
        return None
    try:
        resp = requests.get(url, headers=HEADERS, timeout=DEFAULT_TIMEOUT)
        resp.raise_for_status()
        return BeautifulSoup(resp.text, "html.parser")
    except Exception:
        # Network errors, timeouts, HTTP errors and parse failures all
        # degrade to None; callers treat a missing soup as "no results".
        return None
# ---------------------------------------------------------------------------
# Plugin-Klasse
# ---------------------------------------------------------------------------
class HdfilmePlugin(BasisPlugin):
    """HDFilme integration for ViewIT (HTML scraping via BeautifulSoup).

    Titles discovered through search/browsing are cached in instance
    dictionaries so later calls (seasons, episodes, streams, metadata)
    can resolve a plain title string back to its detail-page URL.
    """

    name = "HDFilme"

    def __init__(self) -> None:
        # title -> absolute detail-page URL
        self._title_to_url: dict[str, str] = {}
        # title -> True (series) / False (movie); missing key = unknown
        self._is_series: dict[str, bool] = {}
        # title -> (plot, poster URL); plot is "" until the detail page was read
        self._title_meta: dict[str, tuple[str, str]] = {}
        # "detail_url#season_nr" -> episode labels. Keyed per season (fix:
        # the original keyed on the URL alone, so two seasons served by the
        # same page would have shared one cache entry).
        self._episode_cache: dict[str, list[str]] = {}
        # hoster names preferred by the user, in priority order
        self._preferred_hosters: list[str] = []

    # ------------------------------------------------------------------
    # Availability
    # ------------------------------------------------------------------
    @property
    def is_available(self) -> bool:
        """True when requests + bs4 could be imported."""
        return REQUESTS_AVAILABLE

    @property
    def unavailable_reason(self) -> str:
        """Human-readable reason when the plugin cannot run, else ""."""
        if REQUESTS_AVAILABLE:
            return ""
        return f"requests/bs4 nicht verfügbar: {REQUESTS_IMPORT_ERROR}"

    # ------------------------------------------------------------------
    # Internal parsing
    # ------------------------------------------------------------------
    def _parse_entries(self, soup: Any) -> list[str]:
        """Parse a listing page and return its titles (side effect: caches
        URL, series flag and poster per title)."""
        if soup is None:
            return []
        titles: list[str] = []
        seen: set[str] = set()
        for box in soup.select("div.box-product"):
            # Detail URL comes from the first link in the box.
            link = box.find("a", href=True)
            if not link:
                continue
            url = _absolute_url(link["href"])
            if not url.endswith(".html"):
                continue
            # Title from the h3 anchor.
            h3_a = box.select_one("h3 a")
            if not h3_a:
                continue
            raw_title = h3_a.get_text(strip=True)
            title = _clean_title(raw_title)
            if not title or title in seen:
                continue
            seen.add(title)
            # Thumbnail (lazy-loaded images carry the URL in data-src).
            img = box.select_one("img.lazyload")
            poster = ""
            if img and img.get("data-src"):
                poster = _absolute_url(img["data-src"])
            # Series detection via the raw title ("Staffel"/"Season").
            is_series = bool(re.search(r"\bStaffel\b|\bSeason\b", raw_title, re.I))
            self._title_to_url[title] = url
            self._is_series[title] = is_series
            if poster:
                self._title_meta[title] = ("", poster)
            titles.append(title)
        return titles

    def _ensure_detail_url(self, title: str) -> str:
        """Return the detail URL for *title*.

        Checks the cache first, then falls back to a live search (needed
        when the plugin instance is fresh and the cache is empty).
        """
        url = self._title_to_url.get(title, "")
        if url:
            return url
        search_url = _URL_SEARCH.format(query=quote_plus(title.strip()))
        soup = _get_soup(search_url)
        if soup:
            self._parse_entries(soup)
            url = self._title_to_url.get(title, "")
        return url

    def _get_detail_soup(self, title: str) -> Any:
        """Load and parse the detail page of *title*; None if unresolvable."""
        url = self._ensure_detail_url(title)
        if not url:
            return None
        return _get_soup(url)

    def _extract_hoster_links(self, soup: Any, episode_id: str = "") -> dict[str, str]:
        """Extract hoster links from a detail page.

        Returns {hoster name -> URL}. When *episode_id* is set, only links
        inside the `<li id="{episode_id}">` block are considered; otherwise
        all ".mirrors [data-link]" elements (movie pages).
        """
        if soup is None:
            return {}
        hosters: dict[str, str] = {}
        if episode_id:
            container = soup.select_one(f"li#{episode_id}")
            if container is None:
                return {}
            candidates = container.select("a[data-link]")
        else:
            candidates = soup.select(".mirrors [data-link]")
        seen_names: set[str] = set()
        for el in candidates:
            href = _absolute_url((el.get("data-link") or "").strip())
            if not href:
                continue
            # Skip trailers / self-links (no actual stream).
            if any(kw in href for kw in _SKIP_LINK_KEYWORDS):
                continue
            name = el.get_text(strip=True) or "Hoster"
            # Disambiguate duplicate hoster names: "Name", "Name 2", ...
            base_name = name
            i = 2
            while name in seen_names:
                name = f"{base_name} {i}"
                i += 1
            seen_names.add(name)
            hosters[name] = href
        return hosters

    def _staffel_nr(self, season: str) -> int:
        """Extract the season number from a label like 'Staffel 2' (default 1)."""
        m = re.search(r"\d+", season or "")
        return int(m.group()) if m else 1

    def _ep_index(self, episode: str) -> int:
        """Extract the episode index from a label like 'Episode 3' (default 1)."""
        m = re.search(r"\d+", episode or "")
        return int(m.group()) if m else 1

    # ------------------------------------------------------------------
    # Required methods
    # ------------------------------------------------------------------
    async def search_titles(
        self,
        query: str,
        progress_callback: ProgressCallback = None,
    ) -> List[str]:
        """Search the site for *query* and return matching titles.

        NOTE(review): the HTTP request is blocking despite the async
        signature -- presumably accepted by the caller; confirm.
        """
        if not query or not REQUESTS_AVAILABLE:
            return []
        url = _URL_SEARCH.format(query=quote_plus(query.strip()))
        soup = _get_soup(url)
        return self._parse_entries(soup)

    def seasons_for(self, title: str) -> List[str]:
        """Return ["Film"] for movies or ["Staffel N"] for series.

        Each detail page covers a single season; the number is taken from
        the title itself (defaulting to 1).
        """
        title = (title or "").strip()
        if not title:
            return []
        if self._is_series.get(title) is False:
            return ["Film"]
        if self._is_series.get(title) is True:
            m = re.search(r"Staffel\s*(\d+)|Season\s*(\d+)", title, re.I)
            nr = int(m.group(1) or m.group(2)) if m else 1
            return [f"Staffel {nr}"]
        # Unknown: load the detail page and check for a series marker.
        soup = self._get_detail_soup(title)
        if soup and soup.select_one("div.series"):
            self._is_series[title] = True
            m = re.search(r"Staffel\s*(\d+)|Season\s*(\d+)", title, re.I)
            nr = int(m.group(1) or m.group(2)) if m else 1
            return [f"Staffel {nr}"]
        self._is_series[title] = False
        return ["Film"]

    def episodes_for(self, title: str, season: str) -> List[str]:
        """Return episode labels ("Episode N") for *title*/*season*.

        Movies return a single pseudo-episode equal to the title. Results
        are cached per detail URL AND season number (fix: the original
        cache ignored the season).
        """
        title = (title or "").strip()
        season = (season or "").strip()
        if not title:
            return []
        if season == "Film":
            return [title]
        detail_url = self._ensure_detail_url(title)
        staffel_nr = self._staffel_nr(season)
        cache_key = f"{detail_url}#{staffel_nr}"
        cached = self._episode_cache.get(cache_key)
        if cached is not None:
            return cached
        soup = self._get_detail_soup(title)
        if soup is None:
            return [title]
        # Episode blocks carry li IDs of the form "serie-{season}_{episode}".
        prefix = f"serie-{staffel_nr}_"
        episode_items = [li for li in soup.select("li[id]") if li.get("id", "").startswith(prefix)]
        labels: list[str] = []
        for li in episode_items:
            ep_id = li.get("id", "")  # e.g. "serie-1_3"
            ep_num_str = ep_id.split("_")[-1]
            # Episode label from the first <a href="#"> inside the block.
            a = li.find("a", href="#")
            if a:
                raw = a.get_text(strip=True)
                # Normalise "Episoden 3" / "Episode 3" to the bare remainder.
                ep_label = re.sub(r"^Episoden?\s*", "", raw, flags=re.I).strip()
                label = f"Episode {ep_label}" if ep_label else f"Episode {ep_num_str}"
            else:
                label = f"Episode {ep_num_str}"
            labels.append(label)
        result = labels if labels else [title]
        if detail_url:
            self._episode_cache[cache_key] = result
        return result

    def _hosters_for(self, title: str, season: str, episode: str) -> dict[str, str]:
        """Return all available hosters {name -> URL} for title/season/episode."""
        soup = self._get_detail_soup(title)
        if soup is None:
            return {}
        if season == "Film" or not self._is_series.get(title, False):
            return self._extract_hoster_links(soup)
        staffel_nr = self._staffel_nr(season)
        ep_idx = self._ep_index(episode)
        episode_id = f"serie-{staffel_nr}_{ep_idx}"
        return self._extract_hoster_links(soup, episode_id)

    def available_hosters_for(self, title: str, season: str, episode: str) -> List[str]:
        """Return the hoster names available for the given selection."""
        return list(self._hosters_for(title, season, episode).keys())

    def set_preferred_hosters(self, hosters: List[str]) -> None:
        """Store the user's preferred hosters (empty entries dropped)."""
        self._preferred_hosters = [h for h in hosters if h]

    def stream_link_for(self, title: str, season: str, episode: str) -> Optional[str]:
        """Return the (unresolved) hoster URL for the given selection.

        Preferred hosters (substring match against name or URL, case-
        insensitive) win; otherwise the first hoster found is returned.
        """
        title = (title or "").strip()
        season = (season or "").strip()
        if not title:
            return None
        hosters = self._hosters_for(title, season, episode)
        if not hosters:
            return None
        for preferred in self._preferred_hosters:
            key = preferred.casefold()
            for name, url in hosters.items():
                if key in name.casefold() or key in url.casefold():
                    return url
        # Fallback: first hoster in document order.
        return next(iter(hosters.values()))

    def resolve_stream_link(self, link: str) -> Optional[str]:
        """Resolve a hoster URL to a playable stream via ResolveURL.

        Returns None when resolution fails or the helper is unavailable.
        """
        link = (link or "").strip()
        if not link:
            return None
        try:
            from plugin_helpers import resolve_via_resolveurl
            return resolve_via_resolveurl(link, fallback_to_link=False)
        except Exception:
            # Missing helper or resolver failure both mean "no stream".
            return None

    # ------------------------------------------------------------------
    # Metadata
    # ------------------------------------------------------------------
    def metadata_for(
        self, title: str
    ) -> tuple[dict[str, str], dict[str, str], list[object] | None]:
        """Return (info, art, cast) for *title*; cast is always None.

        info may carry "title", "plot" and "year"; art maps "thumb"/
        "poster" to the poster URL. A cache hit is only returned directly
        when a plot is present; otherwise the detail page is fetched.
        """
        title = (title or "").strip()
        if not title:
            return {}, {}, None
        info: dict[str, str] = {"title": title}
        art: dict[str, str] = {}
        cached = self._title_meta.get(title)
        if cached:
            plot, poster = cached
            if plot:
                info["plot"] = plot
            if poster:
                art["thumb"] = art["poster"] = poster
            if plot:
                # Complete cache hit -- skip the network round-trip.
                return info, art, None
        soup = self._get_detail_soup(title)
        if soup is None:
            return info, art, None
        og_desc = soup.find("meta", attrs={"property": "og:description"})
        if og_desc and og_desc.get("content"):
            info["plot"] = og_desc["content"].strip()
        og_img = soup.find("meta", attrs={"property": "og:image"})
        poster = ""
        if og_img and og_img.get("content"):
            poster = _absolute_url(og_img["content"].strip())
            art["thumb"] = art["poster"] = poster
        elif cached:
            # Fix: the detail page has no og:image -- keep the listing
            # thumbnail instead of overwriting the cached poster with "".
            poster = cached[1]
        # Year from a text section like "Titel YYYY".
        year_el = soup.select_one("p.text-capitalize")
        if year_el:
            m = re.search(r"\b(19|20)\d{2}\b", year_el.get_text())
            if m:
                info["year"] = m.group()
        self._title_meta[title] = (info.get("plot", ""), poster)
        return info, art, None

    # ------------------------------------------------------------------
    # Browsing
    # ------------------------------------------------------------------
    def new_titles(self) -> List[str]:
        """Return titles from the first page of the newest-movies listing."""
        if not REQUESTS_AVAILABLE:
            return []
        return self._parse_entries(_get_soup(_URL_NEW))

    def new_titles_page(self, page: int = 1) -> List[str]:
        """Return titles from page *page* of the newest-movies listing."""
        if not REQUESTS_AVAILABLE:
            return []
        page = max(1, int(page or 1))
        url = _URL_NEW if page == 1 else f"{_URL_NEW}page/{page}/"
        return self._parse_entries(_get_soup(url))

    def popular_series(self) -> List[str]:
        """Return titles from the series listing."""
        if not REQUESTS_AVAILABLE:
            return []
        return self._parse_entries(_get_soup(_URL_SERIES))

    def genres(self) -> List[str]:
        """Return the supported genre names, sorted alphabetically."""
        return sorted(GENRE_SLUGS.keys())

    def titles_for_genre(self, genre: str) -> List[str]:
        """Return the first page of titles for *genre*."""
        return self.titles_for_genre_page(genre, 1)

    def titles_for_genre_page(self, genre: str, page: int = 1) -> List[str]:
        """Return titles from page *page* of the genre listing."""
        slug = GENRE_SLUGS.get(genre, "")
        if not slug or not REQUESTS_AVAILABLE:
            return []
        page = max(1, int(page or 1))
        url = f"{BASE_URL}/{slug}/" if page == 1 else f"{BASE_URL}/{slug}/page/{page}/"
        return self._parse_entries(_get_soup(url))

    def capabilities(self) -> set[str]:
        """Declare which optional browsing features this plugin supports."""
        return {"new_titles", "popular_series", "genres"}