Add Doku-Streams plugin and prefer source metadata
This commit is contained in:
476
addon/plugins/dokustreams_plugin.py
Normal file
476
addon/plugins/dokustreams_plugin.py
Normal file
@@ -0,0 +1,476 @@
|
||||
"""Doku-Streams (doku-streams.com) Integration."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
import re
|
||||
from urllib.parse import quote
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeAlias
|
||||
|
||||
# Optional third-party dependencies: the plugin degrades gracefully (reports
# itself unavailable in __init__) when requests/bs4 are not installed.
try: # pragma: no cover - optional dependency
    import requests
    from bs4 import BeautifulSoup  # type: ignore[import-not-found]
except ImportError as exc: # pragma: no cover - optional dependency
    requests = None
    BeautifulSoup = None
    REQUESTS_AVAILABLE = False
    # Keep the original ImportError so it can be surfaced for debugging later.
    REQUESTS_IMPORT_ERROR = exc
else:
    REQUESTS_AVAILABLE = True
    REQUESTS_IMPORT_ERROR = None

# Project-local helpers: plugin framework base class, debug/log helpers, and
# the shared HTTP session pool.
from plugin_interface import BasisPlugin
from plugin_helpers import dump_response_html, get_setting_bool, get_setting_string, log_error, log_url, notify_url
from http_session_pool import get_requests_session

# Type-only aliases: real types for static analysis, Any at runtime so the
# module still imports when requests/bs4 are missing.
if TYPE_CHECKING: # pragma: no cover
    from requests import Session as RequestsSession
    from bs4 import BeautifulSoup as BeautifulSoupT  # type: ignore[import-not-found]
else: # pragma: no cover
    RequestsSession: TypeAlias = Any
    BeautifulSoupT: TypeAlias = Any
|
||||
|
||||
# Kodi add-on id used to resolve settings and log file locations.
ADDON_ID = "plugin.video.viewit"
# Settings key that lets the user override the site's base URL.
SETTING_BASE_URL = "doku_streams_base_url"
DEFAULT_BASE_URL = "https://doku-streams.com"
# Path of the "most viewed" listing, used by popular_series().
MOST_VIEWED_PATH = "/meistgesehene/"
# HTTP timeout (seconds) applied to every request.
DEFAULT_TIMEOUT = 20
# Add-on wide debug switches...
GLOBAL_SETTING_LOG_URLS = "debug_log_urls"
GLOBAL_SETTING_DUMP_HTML = "debug_dump_html"
GLOBAL_SETTING_SHOW_URL_INFO = "debug_show_url_info"
GLOBAL_SETTING_LOG_ERRORS = "debug_log_errors"
# ...and their per-plugin counterparts scoped to Doku-Streams only.
SETTING_LOG_URLS = "log_urls_dokustreams"
SETTING_DUMP_HTML = "dump_html_dokustreams"
SETTING_SHOW_URL_INFO = "show_url_info_dokustreams"
SETTING_LOG_ERRORS = "log_errors_dokustreams"
# Browser-like request headers; German Accept-Language matches the site.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Kodi; ViewIt) AppleWebKit/537.36 (KHTML, like Gecko)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "de-DE,de;q=0.9,en;q=0.8",
    "Connection": "keep-alive",
}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SearchHit:
    """One documentary scraped from a listing or search results page."""

    title: str  # display title of the documentary
    url: str  # absolute detail-page URL (normalized, no query/fragment)
    plot: str = ""  # short summary text; may be empty
    poster: str = ""  # absolute poster image URL; may be empty
||||
|
||||
|
||||
def _extract_last_page(soup: BeautifulSoupT) -> int:
|
||||
max_page = 1
|
||||
if not soup:
|
||||
return max_page
|
||||
for anchor in soup.select("nav.navigation a[href], nav.pagination a[href], a.page-numbers[href]"):
|
||||
text = (anchor.get_text(" ", strip=True) or "").strip()
|
||||
for candidate in (text, (anchor.get("href") or "").strip()):
|
||||
for value in re.findall(r"/page/(\\d+)/", candidate):
|
||||
try:
|
||||
max_page = max(max_page, int(value))
|
||||
except Exception:
|
||||
continue
|
||||
for value in re.findall(r"(\\d+)", candidate):
|
||||
try:
|
||||
max_page = max(max_page, int(value))
|
||||
except Exception:
|
||||
continue
|
||||
return max_page
|
||||
|
||||
|
||||
def _extract_summary_and_poster(article: BeautifulSoupT) -> tuple[str, str]:
    """Pull a short plot text and a poster URL out of one listing article.

    The summary is the first non-empty ``<p>`` inside ``div.entry-summary``;
    the poster comes from the thumbnail ``<img>``, preferring ``data-src``
    over ``src`` (lazy-loading placeholders are skipped).  Both values may be
    empty strings.
    """
    summary = ""
    poster = ""
    if article:
        summary_box = article.select_one("div.entry-summary")
        if summary_box is not None:
            # First paragraph with actual text wins.
            paragraphs = summary_box.find_all("p")
            stripped = ((p.get_text(" ", strip=True) or "").strip() for p in paragraphs)
            summary = next((text for text in stripped if text), "")
        img = article.select_one("div.entry-thumb img")
        if img is not None:
            raw_data_src = img.get("data-src")
            poster = (raw_data_src or "").strip() or (img.get("src") or "").strip()
            # Lazy-load placeholder in src: fall back to the real data-src.
            if "lazy_placeholder" in poster and raw_data_src:
                poster = (raw_data_src or "").strip()
    return summary, _absolute_url(poster)
|
||||
|
||||
|
||||
def _parse_listing_hits(soup: BeautifulSoupT, *, query: str = "") -> List[SearchHit]:
    """Convert a listing/search page into a deduplicated list of SearchHit.

    When *query* is given, only titles matching it (see ``_matches_query``)
    are kept.  Duplicates are dropped case-insensitively by both title and
    normalized URL; each accepted URL is logged with kind "PARSE".
    """
    results: List[SearchHit] = []
    if not soup:
        return results
    known_titles: set[str] = set()
    known_urls: set[str] = set()
    for post in soup.select("article[id^='post-']"):
        link = post.select_one("h2.entry-title a[href]")
        if link is None:
            continue
        href = (link.get("href") or "").strip()
        name = (link.get_text(" ", strip=True) or "").strip()
        if not (href and name):
            continue
        if query and not _matches_query(query, title=name):
            continue
        # Normalize: absolute, no fragment, no query string, no trailing slash.
        page_url = _absolute_url(href).split("#", 1)[0].split("?", 1)[0].rstrip("/")
        name_key = name.casefold()
        url_key = page_url.casefold()
        if name_key in known_titles or url_key in known_urls:
            continue
        known_titles.add(name_key)
        known_urls.add(url_key)
        _log_url_event(page_url, kind="PARSE")
        plot, poster = _extract_summary_and_poster(post)
        results.append(SearchHit(title=name, url=page_url, plot=plot, poster=poster))
    return results
|
||||
|
||||
|
||||
def _get_base_url() -> str:
    """Return the configured site base URL without a trailing slash.

    Falls back to DEFAULT_BASE_URL when the setting is empty/blank.
    """
    configured = get_setting_string(ADDON_ID, SETTING_BASE_URL, default=DEFAULT_BASE_URL).strip()
    return (configured or DEFAULT_BASE_URL).rstrip("/")
|
||||
|
||||
|
||||
def _absolute_url(url: str) -> str:
    """Resolve *url* against the configured base URL.

    Empty/None input yields "".  Absolute http(s) URLs pass through,
    protocol-relative URLs get "https:", and everything else is joined onto
    the base URL.
    """
    cleaned = (url or "").strip()
    if not cleaned:
        return ""
    if cleaned.startswith(("http://", "https://")):
        return cleaned
    if cleaned.startswith("//"):
        return "https:" + cleaned
    base = _get_base_url()
    if cleaned.startswith("/"):
        return base + cleaned
    return f"{base}/{cleaned.lstrip('/')}"
|
||||
|
||||
|
||||
def _normalize_search_text(value: str) -> str:
|
||||
value = (value or "").casefold()
|
||||
value = re.sub(r"[^a-z0-9]+", " ", value)
|
||||
value = re.sub(r"\s+", " ", value).strip()
|
||||
return value
|
||||
|
||||
|
||||
def _matches_query(query: str, *, title: str) -> bool:
    """Return True when the normalized *query* occurs as a whole phrase in *title*.

    Both strings are normalized via ``_normalize_search_text``; the query must
    appear on word boundaries (space-padded substring check).  An empty
    normalized query never matches.
    """
    needle = _normalize_search_text(query)
    if not needle:
        return False
    haystack = f" {_normalize_search_text(title)} "
    return f" {needle} " in haystack
|
||||
|
||||
|
||||
def _log_url_event(url: str, *, kind: str = "VISIT") -> None:
    """Append *url* (tagged with *kind*) to the plugin's URL log when enabled."""
    options = {
        "enabled_setting_id": GLOBAL_SETTING_LOG_URLS,
        "plugin_setting_id": SETTING_LOG_URLS,
        "log_filename": "dokustreams_urls.log",
        "url": url,
        "kind": kind,
    }
    log_url(ADDON_ID, **options)
|
||||
|
||||
|
||||
def _log_visit(url: str) -> None:
    """Record a page visit: write it to the URL log, then optionally show it
    to the user as a notification (controlled by the show-url settings)."""
    _log_url_event(url, kind="VISIT")
    notification = {
        "heading": "Doku-Streams",
        "url": url,
        "enabled_setting_id": GLOBAL_SETTING_SHOW_URL_INFO,
        "plugin_setting_id": SETTING_SHOW_URL_INFO,
    }
    notify_url(ADDON_ID, **notification)
|
||||
|
||||
|
||||
def _log_response_html(url: str, body: str) -> None:
    """Dump the raw HTML *body* of a response to disk when dumping is enabled."""
    options = {
        "enabled_setting_id": GLOBAL_SETTING_DUMP_HTML,
        "plugin_setting_id": SETTING_DUMP_HTML,
        "url": url,
        "body": body,
        "filename_prefix": "dokustreams_response",
    }
    dump_response_html(ADDON_ID, **options)
|
||||
|
||||
|
||||
def _log_error_message(message: str) -> None:
    """Append *message* to the plugin's error log when error logging is enabled."""
    options = {
        "enabled_setting_id": GLOBAL_SETTING_LOG_ERRORS,
        "plugin_setting_id": SETTING_LOG_ERRORS,
        "log_filename": "dokustreams_errors.log",
        "message": message,
    }
    log_error(ADDON_ID, **options)
|
||||
|
||||
|
||||
def _get_soup(url: str, *, session: Optional[RequestsSession] = None) -> BeautifulSoupT:
    """Fetch *url* and return it parsed with BeautifulSoup (html.parser).

    Logs the visit, any redirect target, and (optionally) the raw HTML.
    Raises RuntimeError when requests/bs4 are missing; HTTP/network errors
    are logged to the error log and re-raised.
    """
    if requests is None or BeautifulSoup is None:
        raise RuntimeError("requests/bs4 sind nicht verfuegbar.")
    _log_visit(url)
    http = session or get_requests_session("dokustreams", headers=HEADERS)
    try:
        response = http.get(url, headers=HEADERS, timeout=DEFAULT_TIMEOUT)
        response.raise_for_status()
    except Exception as exc:
        _log_error_message(f"GET {url} failed: {exc}")
        raise
    final_url = response.url
    if final_url and final_url != url:
        _log_url_event(final_url, kind="REDIRECT")
    body = response.text
    _log_response_html(url, body)
    return BeautifulSoup(body, "html.parser")
|
||||
|
||||
|
||||
class DokuStreamsPlugin(BasisPlugin):
    """ViewIt plugin that scrapes documentaries from doku-streams.com.

    Every documentary maps to exactly one stream, so the season/episode
    hierarchy is flattened: one pseudo season ("Stream") containing a single
    episode named after the title.  Listing pages already carry plot and
    poster data, hence ``prefer_source_metadata``.
    """

    name = "Doku-Streams"
    version = "1.0.0"
    # Metadata scraped from the site should win over external lookups.
    prefer_source_metadata = True

    def __init__(self) -> None:
        # title -> absolute detail-page URL, filled by search/listing calls.
        self._title_to_url: Dict[str, str] = {}
        # category label -> category listing URL.
        self._category_to_url: Dict[str, str] = {}
        # category label -> cached number of listing pages.
        self._category_page_count_cache: Dict[str, int] = {}
        # Cached "most viewed" hits; None until fetched once.
        self._popular_cache: Optional[List[SearchHit]] = None
        # title -> (plot, poster), consumed by metadata_for().
        self._title_meta: Dict[str, tuple[str, str]] = {}
        self._requests_available = REQUESTS_AVAILABLE
        self.is_available = True
        self.unavailable_reason: Optional[str] = None
        if not self._requests_available:  # pragma: no cover - optional dependency
            self.is_available = False
            self.unavailable_reason = (
                "requests/bs4 fehlen. Installiere 'requests' und 'beautifulsoup4'."
            )
            if REQUESTS_IMPORT_ERROR:
                print(f"DokuStreamsPlugin Importfehler: {REQUESTS_IMPORT_ERROR}")

    async def search_titles(self, query: str) -> List[str]:
        """Search the site for *query*; return matching titles sorted
        case-insensitively.

        Side effects: REPLACES the title->URL map with this search's results
        (the genre/popular listings merge via ``update`` instead) and caches
        plot/poster metadata per title.
        """
        hits = self._search_hits(query)
        self._title_to_url = {hit.title: hit.url for hit in hits if hit.title and hit.url}
        for hit in hits:
            if hit.title:
                self._title_meta[hit.title] = (hit.plot, hit.poster)
        titles = [hit.title for hit in hits if hit.title]
        titles.sort(key=lambda value: value.casefold())
        return titles

    def _search_hits(self, query: str) -> List[SearchHit]:
        """Run the site search endpoint (``/?s=``) and parse the result list."""
        query = (query or "").strip()
        if not query or not self._requests_available:
            return []
        search_url = _absolute_url(f"/?s={quote(query)}")
        session = get_requests_session("dokustreams", headers=HEADERS)
        try:
            soup = _get_soup(search_url, session=session)
        except Exception:
            # Errors were already logged inside _get_soup.
            return []
        return _parse_listing_hits(soup, query=query)

    def capabilities(self) -> set[str]:
        """Optional features this plugin supports."""
        return {"genres", "popular_series"}

    def _categories_url(self) -> str:
        """Absolute URL of the category overview page."""
        return _absolute_url("/kategorien/")

    def _parse_categories(self, soup: BeautifulSoupT) -> Dict[str, str]:
        """Parse the nested category tree into ``{"Parent → Leaf": url}``.

        Only leaf categories below at least one parent are emitted; plain
        top-level entries are covered by :meth:`_parse_top_categories`.
        """
        categories: Dict[str, str] = {}
        if not soup:
            return categories
        root = soup.select_one("ul.nested-category-list")
        if root is None:
            return categories

        def clean_name(value: str) -> str:
            # Strip a trailing post-count suffix such as " (12)".
            value = (value or "").strip()
            # BUG FIX: the previous pattern used doubled backslashes inside a
            # raw string (r"\\s*\\(\\d+\\)\\s*$"), matching literal
            # backslashes, so the "(count)" suffix was never removed.
            return re.sub(r"\s*\(\d+\)\s*$", "", value).strip()

        def walk(ul, parents: List[str]) -> None:
            # Depth-first traversal collecting breadcrumb labels.
            for li in ul.find_all("li", recursive=False):
                anchor = li.find("a", href=True)
                if anchor is None:
                    continue
                name = clean_name(anchor.get_text(" ", strip=True) or "")
                href = (anchor.get("href") or "").strip()
                if not name or not href:
                    continue
                child_ul = li.find("ul", class_="nested-category-list")
                if child_ul is not None:
                    walk(child_ul, parents + [name])
                else:
                    if parents:
                        label = " \u2192 ".join(parents + [name])
                        categories[label] = _absolute_url(href)

        walk(root, [])
        return categories

    def _parse_top_categories(self, soup: BeautifulSoupT) -> Dict[str, str]:
        """Parse only the first level of the category tree into name -> URL."""
        categories: Dict[str, str] = {}
        if not soup:
            return categories
        root = soup.select_one("ul.nested-category-list")
        if root is None:
            return categories
        for li in root.find_all("li", recursive=False):
            anchor = li.find("a", href=True)
            if anchor is None:
                continue
            name = (anchor.get_text(" ", strip=True) or "").strip()
            href = (anchor.get("href") or "").strip()
            if not name or not href:
                continue
            categories[name] = _absolute_url(href)
        return categories

    def genres(self) -> List[str]:
        """Return all nested category labels, sorted case-insensitively.

        Results are cached in ``_category_to_url`` after the first fetch.
        """
        if not self._requests_available:
            return []
        if self._category_to_url:
            return sorted(self._category_to_url.keys(), key=lambda value: value.casefold())
        try:
            soup = _get_soup(self._categories_url(), session=get_requests_session("dokustreams", headers=HEADERS))
        except Exception:
            return []
        parsed = self._parse_categories(soup)
        if parsed:
            self._category_to_url = dict(parsed)
        return sorted(self._category_to_url.keys(), key=lambda value: value.casefold())

    def categories(self) -> List[str]:
        """Return the top-level category names in page order (unsorted)."""
        if not self._requests_available:
            return []
        try:
            soup = _get_soup(self._categories_url(), session=get_requests_session("dokustreams", headers=HEADERS))
        except Exception:
            return []
        parsed = self._parse_top_categories(soup)
        if parsed:
            for key, value in parsed.items():
                # setdefault: do not clobber URLs already learned via genres().
                self._category_to_url.setdefault(key, value)
        return list(parsed.keys())

    def genre_page_count(self, genre: str) -> int:
        """Return how many listing pages exist for *genre* (at least 1, cached)."""
        genre = (genre or "").strip()
        if not genre:
            return 1
        if genre in self._category_page_count_cache:
            return max(1, int(self._category_page_count_cache.get(genre, 1)))
        if not self._category_to_url:
            self.genres()  # lazily populate the category map
        base_url = self._category_to_url.get(genre, "")
        if not base_url:
            return 1
        try:
            soup = _get_soup(base_url, session=get_requests_session("dokustreams", headers=HEADERS))
        except Exception:
            return 1
        pages = _extract_last_page(soup)
        self._category_page_count_cache[genre] = max(1, pages)
        return self._category_page_count_cache[genre]

    def titles_for_genre_page(self, genre: str, page: int) -> List[str]:
        """Return the titles on listing page *page* (1-based) of *genre*.

        Side effects: caches plot/poster metadata and merges title->URL
        mappings for every hit.
        """
        genre = (genre or "").strip()
        if not genre or not self._requests_available:
            return []
        if not self._category_to_url:
            self.genres()
        base_url = self._category_to_url.get(genre, "")
        if not base_url:
            return []
        page = max(1, int(page or 1))
        # WordPress-style pagination: /<category>/page/<n>/ for n > 1.
        url = base_url if page == 1 else f"{base_url.rstrip('/')}/page/{page}/"
        try:
            soup = _get_soup(url, session=get_requests_session("dokustreams", headers=HEADERS))
        except Exception:
            return []
        hits = _parse_listing_hits(soup)
        for hit in hits:
            if hit.title:
                self._title_meta[hit.title] = (hit.plot, hit.poster)
        titles = [hit.title for hit in hits if hit.title]
        self._title_to_url.update({hit.title: hit.url for hit in hits if hit.title and hit.url})
        return titles

    def titles_for_genre(self, genre: str) -> List[str]:
        """Return the first listing page of *genre*, sorted case-insensitively."""
        titles = self.titles_for_genre_page(genre, 1)
        titles.sort(key=lambda value: value.casefold())
        return titles

    def _most_viewed_url(self) -> str:
        """Absolute URL of the "most viewed" listing."""
        return _absolute_url(MOST_VIEWED_PATH)

    def popular_series(self) -> List[str]:
        """Return the "most viewed" titles, sorted case-insensitively.

        The scraped hits are cached, so the page is fetched at most once per
        plugin instance.
        """
        if not self._requests_available:
            return []
        if self._popular_cache is not None:
            titles = [hit.title for hit in self._popular_cache if hit.title]
            titles.sort(key=lambda value: value.casefold())
            return titles
        try:
            soup = _get_soup(self._most_viewed_url(), session=get_requests_session("dokustreams", headers=HEADERS))
        except Exception:
            return []
        hits = _parse_listing_hits(soup)
        self._popular_cache = list(hits)
        self._title_to_url.update({hit.title: hit.url for hit in hits if hit.title and hit.url})
        for hit in hits:
            if hit.title:
                self._title_meta[hit.title] = (hit.plot, hit.poster)
        titles = [hit.title for hit in hits if hit.title]
        titles.sort(key=lambda value: value.casefold())
        return titles

    def metadata_for(self, title: str) -> tuple[dict[str, str], dict[str, str], list[object] | None]:
        """Return (info labels, artwork, cast) for *title* from the scrape cache.

        Cast data is never scraped from this site, so the third element is
        always None.
        """
        title = (title or "").strip()
        if not title:
            return {}, {}, None
        plot, poster = self._title_meta.get(title, ("", ""))
        info: dict[str, str] = {"title": title}
        if plot:
            info["plot"] = plot
        art: dict[str, str] = {}
        if poster:
            art = {"thumb": poster, "poster": poster}
        return info, art, None

    def seasons_for(self, title: str) -> List[str]:
        """Every known title has exactly one pseudo season named "Stream"."""
        title = (title or "").strip()
        if not title or title not in self._title_to_url:
            return []
        return ["Stream"]

    def episodes_for(self, title: str, season: str) -> List[str]:
        """The single pseudo episode is named after the title itself."""
        title = (title or "").strip()
        if not title or title not in self._title_to_url:
            return []
        return [title]

    def stream_link_for(self, title: str, season: str, episode: str) -> Optional[str]:
        """Resolve the embedded player iframe URL for *title*.

        Returns None when the title is unknown, dependencies are missing, the
        page cannot be fetched, or no iframe is present.
        """
        title = (title or "").strip()
        if not title:
            return None
        url = self._title_to_url.get(title)
        if not url:
            return None
        if not self._requests_available:
            return None
        try:
            soup = _get_soup(url, session=get_requests_session("dokustreams", headers=HEADERS))
        except Exception:
            return None
        iframe = soup.select_one("div.fluid-width-video-wrapper iframe[src]")
        if iframe is None:
            # Fallback: any embedded player iframe on the page.
            iframe = soup.select_one("iframe[src*='youtube'], iframe[src*='vimeo'], iframe[src]")
        if iframe is None:
            return None
        src = (iframe.get("src") or "").strip()
        if not src:
            return None
        return _absolute_url(src)
|
||||
|
||||
|
||||
# Alias for automatic plugin discovery by the host application.
Plugin = DokuStreamsPlugin
|
||||
Reference in New Issue
Block a user