Files
ViewIT/addon/plugins/dokustreams_plugin.py

500 lines
18 KiB
Python

"""Doku-Streams (doku-streams.com) Integration."""
from __future__ import annotations
from dataclasses import dataclass
import re
from urllib.parse import quote
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
try: # pragma: no cover - optional dependency
import requests
from bs4 import BeautifulSoup # type: ignore[import-not-found]
except ImportError as exc: # pragma: no cover - optional dependency
requests = None
BeautifulSoup = None
REQUESTS_AVAILABLE = False
REQUESTS_IMPORT_ERROR = exc
else:
REQUESTS_AVAILABLE = True
REQUESTS_IMPORT_ERROR = None
from plugin_interface import BasisPlugin
from plugin_helpers import dump_response_html, get_setting_bool, get_setting_string, log_error, log_url, notify_url
from http_session_pool import get_requests_session
if TYPE_CHECKING: # pragma: no cover
from requests import Session as RequestsSession
from bs4 import BeautifulSoup as BeautifulSoupT # type: ignore[import-not-found]
else: # pragma: no cover
RequestsSession = Any
BeautifulSoupT = Any
ADDON_ID = "plugin.video.viewit"
SETTING_BASE_URL = "doku_streams_base_url"
DEFAULT_BASE_URL = "https://doku-streams.com"
MOST_VIEWED_PATH = "/meistgesehene/"
DEFAULT_TIMEOUT = 20
GLOBAL_SETTING_LOG_URLS = "debug_log_urls"
GLOBAL_SETTING_DUMP_HTML = "debug_dump_html"
GLOBAL_SETTING_SHOW_URL_INFO = "debug_show_url_info"
GLOBAL_SETTING_LOG_ERRORS = "debug_log_errors"
SETTING_LOG_URLS = "log_urls_dokustreams"
SETTING_DUMP_HTML = "dump_html_dokustreams"
SETTING_SHOW_URL_INFO = "show_url_info_dokustreams"
SETTING_LOG_ERRORS = "log_errors_dokustreams"
ProgressCallback = Optional[Callable[[str, Optional[int]], Any]]
def _emit_progress(callback: ProgressCallback, message: str, percent: Optional[int] = None) -> None:
if not callable(callback):
return
try:
callback(str(message or ""), None if percent is None else int(percent))
except Exception:
return
HEADERS = {
"User-Agent": "Mozilla/5.0 (Kodi; ViewIt) AppleWebKit/537.36 (KHTML, like Gecko)",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "de-DE,de;q=0.9,en;q=0.8",
"Connection": "keep-alive",
}
@dataclass(frozen=True)
class SearchHit:
title: str
url: str
plot: str = ""
poster: str = ""
def _extract_last_page(soup: BeautifulSoupT) -> int:
max_page = 1
if not soup:
return max_page
for anchor in soup.select("nav.navigation a[href], nav.pagination a[href], a.page-numbers[href]"):
text = (anchor.get_text(" ", strip=True) or "").strip()
for candidate in (text, (anchor.get("href") or "").strip()):
for value in re.findall(r"/page/(\\d+)/", candidate):
try:
max_page = max(max_page, int(value))
except Exception:
continue
for value in re.findall(r"(\\d+)", candidate):
try:
max_page = max(max_page, int(value))
except Exception:
continue
return max_page
def _extract_summary_and_poster(article: BeautifulSoupT) -> tuple[str, str]:
summary = ""
if article:
summary_box = article.select_one("div.entry-summary")
if summary_box is not None:
for p in summary_box.find_all("p"):
text = (p.get_text(" ", strip=True) or "").strip()
if text:
summary = text
break
poster = ""
if article:
img = article.select_one("div.entry-thumb img")
if img is not None:
poster = (img.get("data-src") or "").strip() or (img.get("src") or "").strip()
if "lazy_placeholder" in poster and img.get("data-src"):
poster = (img.get("data-src") or "").strip()
poster = _absolute_url(poster)
return summary, poster
def _parse_listing_hits(soup: BeautifulSoupT, *, query: str = "") -> List[SearchHit]:
hits: List[SearchHit] = []
if not soup:
return hits
seen_titles: set[str] = set()
seen_urls: set[str] = set()
for article in soup.select("article[id^='post-']"):
anchor = article.select_one("h2.entry-title a[href]")
if anchor is None:
continue
href = (anchor.get("href") or "").strip()
title = (anchor.get_text(" ", strip=True) or "").strip()
if not href or not title:
continue
if query and not _matches_query(query, title=title):
continue
url = _absolute_url(href).split("#", 1)[0].split("?", 1)[0].rstrip("/")
title_key = title.casefold()
url_key = url.casefold()
if title_key in seen_titles or url_key in seen_urls:
continue
seen_titles.add(title_key)
seen_urls.add(url_key)
_log_url_event(url, kind="PARSE")
summary, poster = _extract_summary_and_poster(article)
hits.append(SearchHit(title=title, url=url, plot=summary, poster=poster))
return hits
def _get_base_url() -> str:
base = get_setting_string(ADDON_ID, SETTING_BASE_URL, default=DEFAULT_BASE_URL).strip()
if not base:
base = DEFAULT_BASE_URL
return base.rstrip("/")
def _absolute_url(url: str) -> str:
url = (url or "").strip()
if not url:
return ""
if url.startswith("http://") or url.startswith("https://"):
return url
if url.startswith("//"):
return f"https:{url}"
if url.startswith("/"):
return f"{_get_base_url()}{url}"
return f"{_get_base_url()}/{url.lstrip('/')}"
def _normalize_search_text(value: str) -> str:
value = (value or "").casefold()
value = re.sub(r"[^a-z0-9]+", " ", value)
value = re.sub(r"\s+", " ", value).strip()
return value
def _matches_query(query: str, *, title: str) -> bool:
normalized_query = _normalize_search_text(query)
if not normalized_query:
return False
haystack = f" {_normalize_search_text(title)} "
return f" {normalized_query} " in haystack
def _log_url_event(url: str, *, kind: str = "VISIT") -> None:
log_url(
ADDON_ID,
enabled_setting_id=GLOBAL_SETTING_LOG_URLS,
plugin_setting_id=SETTING_LOG_URLS,
log_filename="dokustreams_urls.log",
url=url,
kind=kind,
)
def _log_visit(url: str) -> None:
_log_url_event(url, kind="VISIT")
notify_url(
ADDON_ID,
heading="Doku-Streams",
url=url,
enabled_setting_id=GLOBAL_SETTING_SHOW_URL_INFO,
plugin_setting_id=SETTING_SHOW_URL_INFO,
)
def _log_response_html(url: str, body: str) -> None:
dump_response_html(
ADDON_ID,
enabled_setting_id=GLOBAL_SETTING_DUMP_HTML,
plugin_setting_id=SETTING_DUMP_HTML,
url=url,
body=body,
filename_prefix="dokustreams_response",
)
def _log_error_message(message: str) -> None:
log_error(
ADDON_ID,
enabled_setting_id=GLOBAL_SETTING_LOG_ERRORS,
plugin_setting_id=SETTING_LOG_ERRORS,
log_filename="dokustreams_errors.log",
message=message,
)
def _get_soup(url: str, *, session: Optional[RequestsSession] = None) -> BeautifulSoupT:
if requests is None or BeautifulSoup is None:
raise RuntimeError("requests/bs4 sind nicht verfuegbar.")
_log_visit(url)
sess = session or get_requests_session("dokustreams", headers=HEADERS)
response = None
try:
response = sess.get(url, headers=HEADERS, timeout=DEFAULT_TIMEOUT)
response.raise_for_status()
except Exception as exc:
_log_error_message(f"GET {url} failed: {exc}")
raise
try:
final_url = (response.url or url) if response is not None else url
body = (response.text or "") if response is not None else ""
if final_url != url:
_log_url_event(final_url, kind="REDIRECT")
_log_response_html(url, body)
return BeautifulSoup(body, "html.parser")
finally:
if response is not None:
try:
response.close()
except Exception:
pass
class DokuStreamsPlugin(BasisPlugin):
name = "Doku-Streams"
version = "1.0.0"
prefer_source_metadata = True
def __init__(self) -> None:
self._title_to_url: Dict[str, str] = {}
self._category_to_url: Dict[str, str] = {}
self._category_page_count_cache: Dict[str, int] = {}
self._popular_cache: Optional[List[SearchHit]] = None
self._title_meta: Dict[str, tuple[str, str]] = {}
self._requests_available = REQUESTS_AVAILABLE
self.is_available = True
self.unavailable_reason: Optional[str] = None
if not self._requests_available: # pragma: no cover - optional dependency
self.is_available = False
self.unavailable_reason = (
"requests/bs4 fehlen. Installiere 'requests' und 'beautifulsoup4'."
)
if REQUESTS_IMPORT_ERROR:
print(f"DokuStreamsPlugin Importfehler: {REQUESTS_IMPORT_ERROR}")
async def search_titles(self, query: str, progress_callback: ProgressCallback = None) -> List[str]:
_emit_progress(progress_callback, "Doku-Streams Suche", 15)
hits = self._search_hits(query)
_emit_progress(progress_callback, f"Treffer verarbeiten ({len(hits)})", 70)
self._title_to_url = {hit.title: hit.url for hit in hits if hit.title and hit.url}
for hit in hits:
if hit.title:
self._title_meta[hit.title] = (hit.plot, hit.poster)
titles = [hit.title for hit in hits if hit.title]
titles.sort(key=lambda value: value.casefold())
_emit_progress(progress_callback, f"Fertig: {len(titles)} Treffer", 95)
return titles
def _search_hits(self, query: str) -> List[SearchHit]:
query = (query or "").strip()
if not query or not self._requests_available:
return []
search_url = _absolute_url(f"/?s={quote(query)}")
session = get_requests_session("dokustreams", headers=HEADERS)
try:
soup = _get_soup(search_url, session=session)
except Exception:
return []
return _parse_listing_hits(soup, query=query)
def capabilities(self) -> set[str]:
return {"genres", "popular_series"}
def _categories_url(self) -> str:
return _absolute_url("/kategorien/")
def _parse_categories(self, soup: BeautifulSoupT) -> Dict[str, str]:
categories: Dict[str, str] = {}
if not soup:
return categories
root = soup.select_one("ul.nested-category-list")
if root is None:
return categories
def clean_name(value: str) -> str:
value = (value or "").strip()
return re.sub(r"\\s*\\(\\d+\\)\\s*$", "", value).strip()
def walk(ul, parents: List[str]) -> None:
for li in ul.find_all("li", recursive=False):
anchor = li.find("a", href=True)
if anchor is None:
continue
name = clean_name(anchor.get_text(" ", strip=True) or "")
href = (anchor.get("href") or "").strip()
if not name or not href:
continue
child_ul = li.find("ul", class_="nested-category-list")
if child_ul is not None:
walk(child_ul, parents + [name])
else:
if parents:
label = " \u2192 ".join(parents + [name])
categories[label] = _absolute_url(href)
walk(root, [])
return categories
def _parse_top_categories(self, soup: BeautifulSoupT) -> Dict[str, str]:
categories: Dict[str, str] = {}
if not soup:
return categories
root = soup.select_one("ul.nested-category-list")
if root is None:
return categories
for li in root.find_all("li", recursive=False):
anchor = li.find("a", href=True)
if anchor is None:
continue
name = (anchor.get_text(" ", strip=True) or "").strip()
href = (anchor.get("href") or "").strip()
if not name or not href:
continue
categories[name] = _absolute_url(href)
return categories
def genres(self) -> List[str]:
if not self._requests_available:
return []
if self._category_to_url:
return sorted(self._category_to_url.keys(), key=lambda value: value.casefold())
try:
soup = _get_soup(self._categories_url(), session=get_requests_session("dokustreams", headers=HEADERS))
except Exception:
return []
parsed = self._parse_categories(soup)
if parsed:
self._category_to_url = dict(parsed)
return sorted(self._category_to_url.keys(), key=lambda value: value.casefold())
def categories(self) -> List[str]:
if not self._requests_available:
return []
try:
soup = _get_soup(self._categories_url(), session=get_requests_session("dokustreams", headers=HEADERS))
except Exception:
return []
parsed = self._parse_top_categories(soup)
if parsed:
for key, value in parsed.items():
self._category_to_url.setdefault(key, value)
return list(parsed.keys())
def genre_page_count(self, genre: str) -> int:
genre = (genre or "").strip()
if not genre:
return 1
if genre in self._category_page_count_cache:
return max(1, int(self._category_page_count_cache.get(genre, 1)))
if not self._category_to_url:
self.genres()
base_url = self._category_to_url.get(genre, "")
if not base_url:
return 1
try:
soup = _get_soup(base_url, session=get_requests_session("dokustreams", headers=HEADERS))
except Exception:
return 1
pages = _extract_last_page(soup)
self._category_page_count_cache[genre] = max(1, pages)
return self._category_page_count_cache[genre]
def titles_for_genre_page(self, genre: str, page: int) -> List[str]:
genre = (genre or "").strip()
if not genre or not self._requests_available:
return []
if not self._category_to_url:
self.genres()
base_url = self._category_to_url.get(genre, "")
if not base_url:
return []
page = max(1, int(page or 1))
url = base_url if page == 1 else f"{base_url.rstrip('/')}/page/{page}/"
try:
soup = _get_soup(url, session=get_requests_session("dokustreams", headers=HEADERS))
except Exception:
return []
hits = _parse_listing_hits(soup)
for hit in hits:
if hit.title:
self._title_meta[hit.title] = (hit.plot, hit.poster)
titles = [hit.title for hit in hits if hit.title]
self._title_to_url.update({hit.title: hit.url for hit in hits if hit.title and hit.url})
return titles
def titles_for_genre(self, genre: str) -> List[str]:
titles = self.titles_for_genre_page(genre, 1)
titles.sort(key=lambda value: value.casefold())
return titles
def _most_viewed_url(self) -> str:
return _absolute_url(MOST_VIEWED_PATH)
def popular_series(self) -> List[str]:
if not self._requests_available:
return []
if self._popular_cache is not None:
titles = [hit.title for hit in self._popular_cache if hit.title]
titles.sort(key=lambda value: value.casefold())
return titles
try:
soup = _get_soup(self._most_viewed_url(), session=get_requests_session("dokustreams", headers=HEADERS))
except Exception:
return []
hits = _parse_listing_hits(soup)
self._popular_cache = list(hits)
self._title_to_url.update({hit.title: hit.url for hit in hits if hit.title and hit.url})
for hit in hits:
if hit.title:
self._title_meta[hit.title] = (hit.plot, hit.poster)
titles = [hit.title for hit in hits if hit.title]
titles.sort(key=lambda value: value.casefold())
return titles
def metadata_for(self, title: str) -> tuple[dict[str, str], dict[str, str], list[object] | None]:
title = (title or "").strip()
if not title:
return {}, {}, None
plot, poster = self._title_meta.get(title, ("", ""))
info: dict[str, str] = {"title": title}
if plot:
info["plot"] = plot
art: dict[str, str] = {}
if poster:
art = {"thumb": poster, "poster": poster}
return info, art, None
def seasons_for(self, title: str) -> List[str]:
title = (title or "").strip()
if not title or title not in self._title_to_url:
return []
return ["Stream"]
def episodes_for(self, title: str, season: str) -> List[str]:
title = (title or "").strip()
if not title or title not in self._title_to_url:
return []
return [title]
def stream_link_for(self, title: str, season: str, episode: str) -> Optional[str]:
title = (title or "").strip()
if not title:
return None
url = self._title_to_url.get(title)
if not url:
return None
if not self._requests_available:
return None
try:
soup = _get_soup(url, session=get_requests_session("dokustreams", headers=HEADERS))
except Exception:
return None
iframe = soup.select_one("div.fluid-width-video-wrapper iframe[src]")
if iframe is None:
iframe = soup.select_one("iframe[src*='youtube'], iframe[src*='vimeo'], iframe[src]")
if iframe is None:
return None
src = (iframe.get("src") or "").strip()
if not src:
return None
return _absolute_url(src)
# Alias für die automatische Plugin-Erkennung.
Plugin = DokuStreamsPlugin