dev: bump to 0.1.72-dev – HDFilme Neufassung (BeautifulSoup, korrekte Selektoren, Genres, Metadaten)

2026-03-04 23:07:44 +01:00
parent 58da715723
commit 957a5a1aea
2 changed files with 260 additions and 129 deletions
--- a/addon/plugins/hdfilme_plugin.py
+++ b/addon/plugins/hdfilme_plugin.py
@@ -1,9 +1,7 @@
 """HDFilme Plugin für ViewIT.

-HTML-Scraping von hdfilme.garden.
+HTML-Scraping von hdfilme-tv.cc (ehemals hdfilme.garden).
 Filme und Serien, Hoster-Auflösung via ResolveURL.
-
-Hinweis: Die Domain ändert sich gelegentlich – als DOMAIN-Konstante konfigurierbar.
 """

 from __future__ import annotations
@@ -14,8 +12,10 @@ from urllib.parse import quote_plus

 try:  # pragma: no cover
    import requests
+    from bs4 import BeautifulSoup
 except ImportError as exc:  # pragma: no cover
    requests = None
+    BeautifulSoup = None
    REQUESTS_AVAILABLE = False
    REQUESTS_IMPORT_ERROR = exc
 else:
@@ -28,51 +28,100 @@ from plugin_interface import BasisPlugin
 # Konstanten
 # ---------------------------------------------------------------------------

-DOMAIN = "hdfilme.garden"
-BASE_URL = "https://" + DOMAIN
+BASE_URL = "https://hdfilme-tv.cc"
 DEFAULT_TIMEOUT = 20

 HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "de-DE,de;q=0.9,en;q=0.8",
-    "Referer": BASE_URL + "/",
+    "Connection": "keep-alive",
 }

-_URL_SEARCH = BASE_URL + "/index.php?do=search&subaction=search&story={query}"
+_URL_SEARCH = BASE_URL + "/?s={query}"
 _URL_NEW    = BASE_URL + "/kinofilme-online/"
 _URL_SERIES = BASE_URL + "/serienstream-deutsch/"

-# HTML-Parsing-Muster
-_RE_ENTRIES = re.compile(
-    r'<div class="box-product.*?href="([^"]+)[^>]*>([^<]+).*?data-src="([^"]+)',
-    re.DOTALL,
-)
-_RE_EPISODES = re.compile(r'><a href="#">([^<]+)')
-_RE_HOSTERS  = re.compile(r'link="([^"]+)"')
-_RE_THUMB_STANDALONE = re.compile(r'data-src="([^"]+)"')
+# Genre-Anzeigename → URL-Slug (Pfadsegment unterhalb von BASE_URL)
+GENRE_SLUGS: dict[str, str] = {
+    "Abenteuer":     "abenteuer",
+    "Action":        "action",
+    "Animation":     "animation",
+    "Biographie":    "biographie",
+    "Dokumentation": "dokumentation",
+    "Drama":         "drama",
+    "Erotik":        "erotikfilme",
+    "Familie":       "familie",
+    "Fantasy":       "fantasy",
+    "Historienfilm": "historien",
+    "Horror":        "horror",
+    "Komödie":       "komodie",
+    "Krieg":         "krieg",
+    "Krimi":         "krimi",
+    "Musikfilm":     "musikfilme",
+    "Mystery":       "mystery",
+    "Romantik":      "romantik",
+    "Sci-Fi":        "sci-fi",
+    "Sport":         "sport",
+    "Thriller":      "thriller",
+    "Western":       "western",
+}

-_SKIP_HOSTERS = {"youtube", "dropload"}
+# Link-Schlüsselwörter von Hostern, die übersprungen werden (kein Stream / nur Trailer)
+_SKIP_LINK_KEYWORDS = ("youtube.com", "youtu.be")

 ProgressCallback = Optional[Callable[[str, Optional[int]], Any]]


+# ---------------------------------------------------------------------------
+# Hilfsfunktionen
+# ---------------------------------------------------------------------------
+
+def _absolute_url(url: str) -> str:
+    """Macht eine relative oder protokoll-relative URL absolut."""
+    url = (url or "").strip()
+    if url.startswith("//"):
+        return "https:" + url
+    if url.startswith("/"):
+        return BASE_URL + url
+    return url
+
+
+def _clean_title(raw: str) -> str:
+    """Bereinigt einen Rohtitel von Seiten-Suffixen."""
+    title = (raw or "").strip()
+    for suffix in (" stream", " Stream", " kostenlos", " Deutsch", " German", " online"):
+        if title.endswith(suffix):
+            title = title[: -len(suffix)].strip()
+    return title
+
+
+def _get_soup(url: str) -> Any:
+    """HTTP-GET und BeautifulSoup-Parsing. Gibt None bei Fehler."""
+    if requests is None or BeautifulSoup is None:
+        return None
+    try:
+        response = requests.get(url, headers=HEADERS, timeout=DEFAULT_TIMEOUT)
+        response.raise_for_status()
+        return BeautifulSoup(response.text, "html.parser")
+    except Exception:
+        return None
+
+
 # ---------------------------------------------------------------------------
 # Plugin-Klasse
 # ---------------------------------------------------------------------------

-class HDFilmePlugin(BasisPlugin):
-    """HDFilme Integration für ViewIT (hdfilme.garden)."""
+class HdfilmePlugin(BasisPlugin):
+    """HDFilme Integration für ViewIT. HTML-Scraping via BeautifulSoup."""

    name = "HDFilme"

    def __init__(self) -> None:
-        # title → Detail-Page-URL
        self._title_to_url: dict[str, str] = {}
-        # title → (plot, poster, fanart)
-        self._title_meta: dict[str, tuple[str, str, str]] = {}
-        # title → True wenn Serie
        self._is_series: dict[str, bool] = {}
+        self._title_meta: dict[str, tuple[str, str]] = {}  # title → (plot, poster)
+        self._episode_cache: dict[str, list[str]] = {}      # detail_url → episode labels

    # ------------------------------------------------------------------
    # Verfügbarkeit
@@ -86,154 +135,195 @@ class HDFilmePlugin(BasisPlugin):
    def unavailable_reason(self) -> str:
        if REQUESTS_AVAILABLE:
            return ""
-        return f"requests nicht verfügbar: {REQUESTS_IMPORT_ERROR}"
+        return f"requests/bs4 nicht verfügbar: {REQUESTS_IMPORT_ERROR}"

    # ------------------------------------------------------------------
-    # HTTP
+    # Internes Parsing
    # ------------------------------------------------------------------

-    def _get_session(self):  # type: ignore[return]
-        from http_session_pool import get_requests_session
-        return get_requests_session("hdfilme", headers=HEADERS)
-
-    def _get_html(self, url: str) -> str:
-        session = self._get_session()
-        response = None
-        try:
-            response = session.get(url, headers=HEADERS, timeout=DEFAULT_TIMEOUT)
-            response.raise_for_status()
-            return response.text
-        except Exception:
-            return ""
-        finally:
-            if response is not None:
-                try:
-                    response.close()
-                except Exception:
-                    pass
-
-    # ------------------------------------------------------------------
-    # Interne Hilfsmethoden
-    # ------------------------------------------------------------------
-
-    def _parse_entries(self, html: str) -> List[str]:
-        """Parst Ergebnisseite und cached Einträge. Gibt Titelliste zurück."""
+    def _parse_entries(self, soup: Any) -> list[str]:
+        """Parst eine Listing-Seite und gibt Titel zurück (cached)."""
+        if soup is None:
+            return []
        titles: list[str] = []
-        for m in _RE_ENTRIES.finditer(html):
-            raw_url, raw_title, raw_thumb = m.group(1), m.group(2), m.group(3)
-            title = raw_title.strip()
-            if not title:
+        seen: set[str] = set()
+        for box in soup.select("div.box-product"):
+            # URL aus erstem Link
+            link = box.find("a", href=True)
+            if not link:
+                continue
+            url = _absolute_url(link["href"])
+            if not url.endswith(".html"):
                continue

-            # Absolute URL sicherstellen
-            url = raw_url.strip()
-            if url.startswith("/"):
-                url = BASE_URL + url
-            if not url.startswith("http"):
+            # Titel aus h3
+            h3_a = box.select_one("h3 a")
+            if not h3_a:
                continue
+            raw_title = h3_a.get_text(strip=True)
+            title = _clean_title(raw_title)
+            if not title or title in seen:
+                continue
+            seen.add(title)

-            thumb = raw_thumb.strip()
-            if thumb.startswith("/"):
-                thumb = BASE_URL + thumb
+            # Thumbnail
+            img = box.select_one("img.lazyload")
+            poster = ""
+            if img and img.get("data-src"):
+                poster = _absolute_url(img["data-src"])
+
+            # Serien-Erkennung via Titel
+            is_series = bool(re.search(r"\bStaffel\b|\bSeason\b", raw_title, re.I))

-            is_series = "taffel" in title  # "Staffel" (xStream-Konvention)
            self._title_to_url[title] = url
            self._is_series[title] = is_series
-            self._title_meta[title] = ("", thumb, "")
+            if poster:
+                self._title_meta[title] = ("", poster)
            titles.append(title)
        return titles

-    def _get_hoster_links(self, html: str, episode: str = "") -> List[str]:
-        """Extrahiert Hoster-URLs aus HTML, optional nach Episode gefiltert."""
-        search_area = html
-        if episode:
-            # Episode-Abschnitt isolieren
-            m = re.search(re.escape(episode) + r"<.*?</ul>", html, re.DOTALL)
-            if m:
-                search_area = m.group(0)
+    def _get_detail_soup(self, title: str) -> Any:
+        """Lädt die Detailseite eines Titels."""
+        url = self._title_to_url.get(title, "")
+        if not url:
+            return None
+        return _get_soup(url)

+    def _extract_hoster_links(self, soup: Any, episode_id: str = "") -> list[str]:
+        """Extrahiert Hoster-Links aus einer Detailseite.
+
+        episode_id: wenn gesetzt, nur Links aus dem `<li id="{episode_id}">` Block.
+        """
+        if soup is None:
+            return []
        links: list[str] = []
-        for m in _RE_HOSTERS.finditer(search_area):
-            link = m.group(1).strip()
-            if not link:
+
+        if episode_id:
+            # Serien-Episode: Links aus dem spezifischen Episode-Container
+            container = soup.select_one(f"li#{episode_id}")
+            if container is None:
+                return []
+            candidates = container.select("a[data-link]")
+        else:
+            # Film: Links aus .mirrors
+            candidates = soup.select(".mirrors [data-link]")
+
+        for el in candidates:
+            href = _absolute_url((el.get("data-link") or "").strip())
+            if not href:
                continue
-            if link.startswith("//"):
-                link = "https:" + link
-            name = link.split("//")[-1].split(".")[0].lower()
-            if name in _SKIP_HOSTERS:
+            if any(kw in href for kw in _SKIP_LINK_KEYWORDS):
                continue
-            links.append(link)
+            links.append(href)
        return links

+    def _staffel_nr(self, season: str) -> int:
+        """Extrahiert die Staffelnummer aus einem Label wie 'Staffel 2'."""
+        m = re.search(r"\d+", season or "")
+        return int(m.group()) if m else 1
+
+    def _ep_index(self, episode: str) -> int:
+        """Extrahiert den Episode-Index aus einem Label wie 'Episode 3'."""
+        m = re.search(r"\d+", episode or "")
+        return int(m.group()) if m else 1
+
    # ------------------------------------------------------------------
    # Pflicht-Methoden
    # ------------------------------------------------------------------

    async def search_titles(
-        self, query: str, progress_callback: ProgressCallback = None
+        self,
+        query: str,
+        progress_callback: ProgressCallback = None,
    ) -> List[str]:
-        query = (query or "").strip()
        if not query or not REQUESTS_AVAILABLE:
            return []
-        url = _URL_SEARCH.format(query=quote_plus(query))
-        html = self._get_html(url)
-        if not html:
-            return []
-        # Suche filtert clientseitig nach Titel
-        q_lower = query.lower()
-        all_titles = self._parse_entries(html)
-        return [t for t in all_titles if q_lower in t.lower()]
+        url = _URL_SEARCH.format(query=quote_plus(query.strip()))
+        soup = _get_soup(url)
+        return self._parse_entries(soup)

    def seasons_for(self, title: str) -> List[str]:
        title = (title or "").strip()
        if not title:
            return []
-        if self._is_series.get(title):
-            # Staffelnummer aus Titel ableiten, falls vorhanden
-            m = re.search(r"Staffel\s*(\d+)", title, re.IGNORECASE)
-            if m:
-                return [f"Staffel {m.group(1)}"]
-            return ["Staffel 1"]
+        if self._is_series.get(title) is False:
+            return ["Film"]
+        if self._is_series.get(title) is True:
+            m = re.search(r"Staffel\s*(\d+)|Season\s*(\d+)", title, re.I)
+            nr = int(m.group(1) or m.group(2)) if m else 1
+            return [f"Staffel {nr}"]
+        # Unbekannt: Detailseite laden und prüfen
+        soup = self._get_detail_soup(title)
+        if soup and soup.select_one("div.series"):
+            self._is_series[title] = True
+            m = re.search(r"Staffel\s*(\d+)|Season\s*(\d+)", title, re.I)
+            nr = int(m.group(1) or m.group(2)) if m else 1
+            return [f"Staffel {nr}"]
+        self._is_series[title] = False
        return ["Film"]

    def episodes_for(self, title: str, season: str) -> List[str]:
        title = (title or "").strip()
+        season = (season or "").strip()
        if not title:
            return []
-
        if season == "Film":
            return [title]

-        url = self._title_to_url.get(title, "")
-        if not url:
-            return []
+        detail_url = self._title_to_url.get(title, "")
+        cached = self._episode_cache.get(detail_url)
+        if cached is not None:
+            return cached

-        html = self._get_html(url)
-        if not html:
+        staffel_nr = self._staffel_nr(season)
+        soup = self._get_detail_soup(title)
+        if soup is None:
            return [title]

-        episodes = _RE_EPISODES.findall(html)
-        return [ep.strip() for ep in episodes if ep.strip()] or [title]
+        # li IDs: "serie-{staffel}_{episode}"
+        pattern = f"serie-{staffel_nr}_"
+        episode_items = [li for li in soup.select("li[id]") if li.get("id", "").startswith(pattern)]

-    # ------------------------------------------------------------------
-    # Stream
-    # ------------------------------------------------------------------
+        labels: list[str] = []
+        for li in episode_items:
+            ep_id = li.get("id", "")  # z.B. "serie-1_3"
+            ep_num_str = ep_id.split("_")[-1]
+            # Episodentitel aus erstem <a href="#">
+            a = li.find("a", href="#")
+            if a:
+                raw = a.get_text(strip=True)
+                # "Episoden 3" → "Episode 3"
+                ep_label = re.sub(r"^Episoden?\s*", "", raw, flags=re.I).strip()
+                label = f"Episode {ep_label}" if ep_label else f"Episode {ep_num_str}"
+            else:
+                label = f"Episode {ep_num_str}"
+            labels.append(label)

-    def stream_link_for(
-        self, title: str, season: str, episode: str
-    ) -> Optional[str]:
+        result = labels if labels else [title]
+        if detail_url:
+            self._episode_cache[detail_url] = result
+        return result
+
+    def stream_link_for(self, title: str, season: str, episode: str) -> Optional[str]:
        title = (title or "").strip()
-        url = self._title_to_url.get(title, "")
-        if not url:
+        season = (season or "").strip()
+        if not title:
            return None

-        html = self._get_html(url)
-        if not html:
+        soup = self._get_detail_soup(title)
+        if soup is None:
            return None

-        # Für Serien: nach Episode-Abschnitt filtern (wenn episode != title)
-        ep_filter = "" if (season == "Film" or episode == title) else episode
-        links = self._get_hoster_links(html, ep_filter)
+        if season == "Film" or not self._is_series.get(title, False):
+            # Film: .mirrors [data-link]
+            links = self._extract_hoster_links(soup)
+        else:
+            # Serie: Episode-Container
+            staffel_nr = self._staffel_nr(season)
+            ep_idx = self._ep_index(episode)
+            episode_id = f"serie-{staffel_nr}_{ep_idx}"
+            links = self._extract_hoster_links(soup, episode_id)
+
        return links[0] if links else None

    def resolve_stream_link(self, link: str) -> Optional[str]:
@@ -252,7 +342,7 @@ class HDFilmePlugin(BasisPlugin):

    def metadata_for(
        self, title: str
-    ) -> tuple[dict[str, str], dict[str, str], list | None]:
+    ) -> tuple[dict[str, str], dict[str, str], list[object] | None]:
        title = (title or "").strip()
        if not title:
            return {}, {}, None
@@ -260,17 +350,40 @@ class HDFilmePlugin(BasisPlugin):
        info: dict[str, str] = {"title": title}
        art: dict[str, str] = {}

+        # Cache-Hit
        cached = self._title_meta.get(title)
        if cached:
-            plot, poster, fanart = cached
+            plot, poster = cached
            if plot:
                info["plot"] = plot
            if poster:
-                art["thumb"] = poster
-                art["poster"] = poster
-            if fanart:
-                art["fanart"] = fanart
+                art["thumb"] = art["poster"] = poster
+            if info or art:
+                return info, art, None

+        # Detailseite laden
+        soup = self._get_detail_soup(title)
+        if soup is None:
+            return info, art, None
+
+        og_desc = soup.find("meta", attrs={"property": "og:description"})
+        if og_desc and og_desc.get("content"):
+            info["plot"] = og_desc["content"].strip()
+
+        og_img = soup.find("meta", attrs={"property": "og:image"})
+        poster = ""
+        if og_img and og_img.get("content"):
+            poster = _absolute_url(og_img["content"].strip())
+            art["thumb"] = art["poster"] = poster
+
+        # Jahr aus Textabschnitt "Titel YYYY"
+        year_el = soup.select_one("p.text-capitalize")
+        if year_el:
+            m = re.search(r"\b(19|20)\d{2}\b", year_el.get_text())
+            if m:
+                info["year"] = m.group()
+
+        self._title_meta[title] = (info.get("plot", ""), poster)
        return info, art, None

    # ------------------------------------------------------------------
@@ -278,12 +391,30 @@ class HDFilmePlugin(BasisPlugin):
    # ------------------------------------------------------------------

    def latest_titles(self, page: int = 1) -> List[str]:
-        html = self._get_html(_URL_NEW)
-        return self._parse_entries(html) if html else []
+        if not REQUESTS_AVAILABLE:
+            return []
+        page = max(1, int(page or 1))
+        url = _URL_NEW if page == 1 else f"{_URL_NEW}page/{page}/"
+        return self._parse_entries(_get_soup(url))

    def popular_series(self) -> List[str]:
-        html = self._get_html(_URL_SERIES)
-        return self._parse_entries(html) if html else []
+        if not REQUESTS_AVAILABLE:
+            return []
+        return self._parse_entries(_get_soup(_URL_SERIES))
+
+    def genres(self) -> List[str]:
+        return sorted(GENRE_SLUGS.keys())
+
+    def titles_for_genre(self, genre: str) -> List[str]:
+        return self.titles_for_genre_page(genre, 1)
+
+    def titles_for_genre_page(self, genre: str, page: int = 1) -> List[str]:
+        slug = GENRE_SLUGS.get(genre, "")
+        if not slug or not REQUESTS_AVAILABLE:
+            return []
+        page = max(1, int(page or 1))
+        url = f"{BASE_URL}/{slug}/" if page == 1 else f"{BASE_URL}/{slug}/page/{page}/"
+        return self._parse_entries(_get_soup(url))

    def capabilities(self) -> set[str]:
-        return {"latest_titles", "popular_series"}
+        return {"latest_titles", "popular_series", "genres"}