dev: bump to 0.1.72-dev – HDFilme Neufassung (BeautifulSoup, korrekte Selektoren, Genres, Metadaten)

This commit is contained in:
2026-03-04 23:07:44 +01:00
parent 58da715723
commit 957a5a1aea
2 changed files with 260 additions and 129 deletions

View File

@@ -1,5 +1,5 @@
<?xml version='1.0' encoding='utf-8'?> <?xml version='1.0' encoding='utf-8'?>
<addon id="plugin.video.viewit" name="ViewIt" version="0.1.71-dev" provider-name="ViewIt"> <addon id="plugin.video.viewit" name="ViewIt" version="0.1.72-dev" provider-name="ViewIt">
<requires> <requires>
<import addon="xbmc.python" version="3.0.0" /> <import addon="xbmc.python" version="3.0.0" />
<import addon="script.module.requests" /> <import addon="script.module.requests" />

View File

@@ -1,9 +1,7 @@
"""HDFilme Plugin für ViewIT. """HDFilme Plugin für ViewIT.
HTML-Scraping von hdfilme.garden. HTML-Scraping von hdfilme-tv.cc (ehemals hdfilme.garden).
Filme und Serien, Hoster-Auflösung via ResolveURL. Filme und Serien, Hoster-Auflösung via ResolveURL.
Hinweis: Die Domain ändert sich gelegentlich als DOMAIN-Konstante konfigurierbar.
""" """
from __future__ import annotations from __future__ import annotations
@@ -14,8 +12,10 @@ from urllib.parse import quote_plus
try: # pragma: no cover try: # pragma: no cover
import requests import requests
from bs4 import BeautifulSoup
except ImportError as exc: # pragma: no cover except ImportError as exc: # pragma: no cover
requests = None requests = None
BeautifulSoup = None
REQUESTS_AVAILABLE = False REQUESTS_AVAILABLE = False
REQUESTS_IMPORT_ERROR = exc REQUESTS_IMPORT_ERROR = exc
else: else:
@@ -28,51 +28,100 @@ from plugin_interface import BasisPlugin
# Konstanten # Konstanten
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
DOMAIN = "hdfilme.garden" BASE_URL = "https://hdfilme-tv.cc"
BASE_URL = "https://" + DOMAIN
DEFAULT_TIMEOUT = 20 DEFAULT_TIMEOUT = 20
HEADERS = { HEADERS = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "de-DE,de;q=0.9,en;q=0.8", "Accept-Language": "de-DE,de;q=0.9,en;q=0.8",
"Referer": BASE_URL + "/", "Connection": "keep-alive",
} }
_URL_SEARCH = BASE_URL + "/index.php?do=search&subaction=search&story={query}" _URL_SEARCH = BASE_URL + "/?s={query}"
_URL_NEW = BASE_URL + "/kinofilme-online/" _URL_NEW = BASE_URL + "/kinofilme-online/"
_URL_SERIES = BASE_URL + "/serienstream-deutsch/" _URL_SERIES = BASE_URL + "/serienstream-deutsch/"
# HTML-Parsing-Muster # Genre-Slug → URL-Pfad
_RE_ENTRIES = re.compile( GENRE_SLUGS: dict[str, str] = {
r'<div class="box-product.*?href="([^"]+)[^>]*>([^<]+).*?data-src="([^"]+)', "Abenteuer": "abenteuer",
re.DOTALL, "Action": "action",
) "Animation": "animation",
_RE_EPISODES = re.compile(r'><a href="#">([^<]+)') "Biographie": "biographie",
_RE_HOSTERS = re.compile(r'link="([^"]+)"') "Dokumentation": "dokumentation",
_RE_THUMB_STANDALONE = re.compile(r'data-src="([^"]+)"') "Drama": "drama",
"Erotik": "erotikfilme",
"Familie": "familie",
"Fantasy": "fantasy",
"Historienfilm": "historien",
"Horror": "horror",
"Komödie": "komodie",
"Krieg": "krieg",
"Krimi": "krimi",
"Musikfilm": "musikfilme",
"Mystery": "mystery",
"Romantik": "romantik",
"Sci-Fi": "sci-fi",
"Sport": "sport",
"Thriller": "thriller",
"Western": "western",
}
_SKIP_HOSTERS = {"youtube", "dropload"} # Hoster die übersprungen werden (kein Stream / nur Trailer)
_SKIP_LINK_KEYWORDS = ("youtube.com", "youtu.be")
ProgressCallback = Optional[Callable[[str, Optional[int]], Any]] ProgressCallback = Optional[Callable[[str, Optional[int]], Any]]
# ---------------------------------------------------------------------------
# Hilfsfunktionen
# ---------------------------------------------------------------------------
def _absolute_url(url: str) -> str:
"""Macht eine relative oder protokoll-relative URL absolut."""
url = (url or "").strip()
if url.startswith("//"):
return "https:" + url
if url.startswith("/"):
return BASE_URL + url
return url
def _clean_title(raw: str) -> str:
"""Bereinigt einen Rohtitel von Seiten-Suffixen."""
title = (raw or "").strip()
for suffix in (" stream", " Stream", " kostenlos", " Deutsch", " German", " online"):
if title.endswith(suffix):
title = title[: -len(suffix)].strip()
return title
def _get_soup(url: str) -> Any:
    """HTTP-GET *url* and parse it; returns a BeautifulSoup tree or None.

    Best effort: if the optional dependencies failed to import at module
    load, or any network/HTTP/parsing error occurs, the page is reported
    as unavailable by returning None.
    """
    deps_ready = requests is not None and BeautifulSoup is not None
    if not deps_ready:
        return None
    try:
        resp = requests.get(url, headers=HEADERS, timeout=DEFAULT_TIMEOUT)
        resp.raise_for_status()
        return BeautifulSoup(resp.text, "html.parser")
    except Exception:
        # Any failure (timeout, HTTP status, parser) → "no page".
        return None
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Plugin-Klasse # Plugin-Klasse
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
class HDFilmePlugin(BasisPlugin): class HdfilmePlugin(BasisPlugin):
"""HDFilme Integration für ViewIT (hdfilme.garden).""" """HDFilme Integration für ViewIT. HTML-Scraping via BeautifulSoup."""
name = "HDFilme" name = "HDFilme"
def __init__(self) -> None: def __init__(self) -> None:
# title → Detail-Page-URL
self._title_to_url: dict[str, str] = {} self._title_to_url: dict[str, str] = {}
# title → (plot, poster, fanart)
self._title_meta: dict[str, tuple[str, str, str]] = {}
# title → True wenn Serie
self._is_series: dict[str, bool] = {} self._is_series: dict[str, bool] = {}
self._title_meta: dict[str, tuple[str, str]] = {} # title → (plot, poster)
self._episode_cache: dict[str, list[str]] = {} # detail_url → episode labels
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Verfügbarkeit # Verfügbarkeit
@@ -86,154 +135,195 @@ class HDFilmePlugin(BasisPlugin):
def unavailable_reason(self) -> str: def unavailable_reason(self) -> str:
if REQUESTS_AVAILABLE: if REQUESTS_AVAILABLE:
return "" return ""
return f"requests nicht verfügbar: {REQUESTS_IMPORT_ERROR}" return f"requests/bs4 nicht verfügbar: {REQUESTS_IMPORT_ERROR}"
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# HTTP # Internes Parsing
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def _get_session(self): # type: ignore[return] def _parse_entries(self, soup: Any) -> list[str]:
from http_session_pool import get_requests_session """Parst eine Listing-Seite und gibt Titel zurück (cached)."""
return get_requests_session("hdfilme", headers=HEADERS) if soup is None:
return []
def _get_html(self, url: str) -> str:
session = self._get_session()
response = None
try:
response = session.get(url, headers=HEADERS, timeout=DEFAULT_TIMEOUT)
response.raise_for_status()
return response.text
except Exception:
return ""
finally:
if response is not None:
try:
response.close()
except Exception:
pass
# ------------------------------------------------------------------
# Interne Hilfsmethoden
# ------------------------------------------------------------------
def _parse_entries(self, html: str) -> List[str]:
"""Parst Ergebnisseite und cached Einträge. Gibt Titelliste zurück."""
titles: list[str] = [] titles: list[str] = []
for m in _RE_ENTRIES.finditer(html): seen: set[str] = set()
raw_url, raw_title, raw_thumb = m.group(1), m.group(2), m.group(3) for box in soup.select("div.box-product"):
title = raw_title.strip() # URL aus erstem Link
if not title: link = box.find("a", href=True)
if not link:
continue
url = _absolute_url(link["href"])
if not url.endswith(".html"):
continue continue
# Absolute URL sicherstellen # Titel aus h3
url = raw_url.strip() h3_a = box.select_one("h3 a")
if url.startswith("/"): if not h3_a:
url = BASE_URL + url
if not url.startswith("http"):
continue continue
raw_title = h3_a.get_text(strip=True)
title = _clean_title(raw_title)
if not title or title in seen:
continue
seen.add(title)
thumb = raw_thumb.strip() # Thumbnail
if thumb.startswith("/"): img = box.select_one("img.lazyload")
thumb = BASE_URL + thumb poster = ""
if img and img.get("data-src"):
poster = _absolute_url(img["data-src"])
# Serien-Erkennung via Titel
is_series = bool(re.search(r"\bStaffel\b|\bSeason\b", raw_title, re.I))
is_series = "taffel" in title # "Staffel" (xStream-Konvention)
self._title_to_url[title] = url self._title_to_url[title] = url
self._is_series[title] = is_series self._is_series[title] = is_series
self._title_meta[title] = ("", thumb, "") if poster:
self._title_meta[title] = ("", poster)
titles.append(title) titles.append(title)
return titles return titles
def _get_hoster_links(self, html: str, episode: str = "") -> List[str]: def _get_detail_soup(self, title: str) -> Any:
"""Extrahiert Hoster-URLs aus HTML, optional nach Episode gefiltert.""" """Lädt die Detailseite eines Titels."""
search_area = html url = self._title_to_url.get(title, "")
if episode: if not url:
# Episode-Abschnitt isolieren return None
m = re.search(re.escape(episode) + r"<.*?</ul>", html, re.DOTALL) return _get_soup(url)
if m:
search_area = m.group(0)
def _extract_hoster_links(self, soup: Any, episode_id: str = "") -> list[str]:
"""Extrahiert Hoster-Links aus einer Detailseite.
episode_id: wenn gesetzt, nur Links aus dem `<li id="{episode_id}">` Block.
"""
if soup is None:
return []
links: list[str] = [] links: list[str] = []
for m in _RE_HOSTERS.finditer(search_area):
link = m.group(1).strip() if episode_id:
if not link: # Serien-Episode: Links aus dem spezifischen Episode-Container
container = soup.select_one(f"li#{episode_id}")
if container is None:
return []
candidates = container.select("a[data-link]")
else:
# Film: Links aus .mirrors
candidates = soup.select(".mirrors [data-link]")
for el in candidates:
href = _absolute_url((el.get("data-link") or "").strip())
if not href:
continue continue
if link.startswith("//"): if any(kw in href for kw in _SKIP_LINK_KEYWORDS):
link = "https:" + link
name = link.split("//")[-1].split(".")[0].lower()
if name in _SKIP_HOSTERS:
continue continue
links.append(link) links.append(href)
return links return links
def _staffel_nr(self, season: str) -> int:
"""Extrahiert die Staffelnummer aus einem Label wie 'Staffel 2'."""
m = re.search(r"\d+", season or "")
return int(m.group()) if m else 1
def _ep_index(self, episode: str) -> int:
"""Extrahiert den Episode-Index aus einem Label wie 'Episode 3'."""
m = re.search(r"\d+", episode or "")
return int(m.group()) if m else 1
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Pflicht-Methoden # Pflicht-Methoden
# ------------------------------------------------------------------ # ------------------------------------------------------------------
async def search_titles( async def search_titles(
self, query: str, progress_callback: ProgressCallback = None self,
query: str,
progress_callback: ProgressCallback = None,
) -> List[str]: ) -> List[str]:
query = (query or "").strip()
if not query or not REQUESTS_AVAILABLE: if not query or not REQUESTS_AVAILABLE:
return [] return []
url = _URL_SEARCH.format(query=quote_plus(query)) url = _URL_SEARCH.format(query=quote_plus(query.strip()))
html = self._get_html(url) soup = _get_soup(url)
if not html: return self._parse_entries(soup)
return []
# Suche filtert clientseitig nach Titel
q_lower = query.lower()
all_titles = self._parse_entries(html)
return [t for t in all_titles if q_lower in t.lower()]
def seasons_for(self, title: str) -> List[str]: def seasons_for(self, title: str) -> List[str]:
title = (title or "").strip() title = (title or "").strip()
if not title: if not title:
return [] return []
if self._is_series.get(title): if self._is_series.get(title) is False:
# Staffelnummer aus Titel ableiten, falls vorhanden return ["Film"]
m = re.search(r"Staffel\s*(\d+)", title, re.IGNORECASE) if self._is_series.get(title) is True:
if m: m = re.search(r"Staffel\s*(\d+)|Season\s*(\d+)", title, re.I)
return [f"Staffel {m.group(1)}"] nr = int(m.group(1) or m.group(2)) if m else 1
return ["Staffel 1"] return [f"Staffel {nr}"]
# Unbekannt: Detailseite laden und prüfen
soup = self._get_detail_soup(title)
if soup and soup.select_one("div.series"):
self._is_series[title] = True
m = re.search(r"Staffel\s*(\d+)|Season\s*(\d+)", title, re.I)
nr = int(m.group(1) or m.group(2)) if m else 1
return [f"Staffel {nr}"]
self._is_series[title] = False
return ["Film"] return ["Film"]
def episodes_for(self, title: str, season: str) -> List[str]: def episodes_for(self, title: str, season: str) -> List[str]:
title = (title or "").strip() title = (title or "").strip()
season = (season or "").strip()
if not title: if not title:
return [] return []
if season == "Film": if season == "Film":
return [title] return [title]
url = self._title_to_url.get(title, "") detail_url = self._title_to_url.get(title, "")
if not url: cached = self._episode_cache.get(detail_url)
return [] if cached is not None:
return cached
html = self._get_html(url) staffel_nr = self._staffel_nr(season)
if not html: soup = self._get_detail_soup(title)
if soup is None:
return [title] return [title]
episodes = _RE_EPISODES.findall(html) # li IDs: "serie-{staffel}_{episode}"
return [ep.strip() for ep in episodes if ep.strip()] or [title] pattern = f"serie-{staffel_nr}_"
episode_items = [li for li in soup.select("li[id]") if li.get("id", "").startswith(pattern)]
# ------------------------------------------------------------------ labels: list[str] = []
# Stream for li in episode_items:
# ------------------------------------------------------------------ ep_id = li.get("id", "") # z.B. "serie-1_3"
ep_num_str = ep_id.split("_")[-1]
# Episodentitel aus erstem <a href="#">
a = li.find("a", href="#")
if a:
raw = a.get_text(strip=True)
# "Episoden 3" → "Episode 3"
ep_label = re.sub(r"^Episoden?\s*", "", raw, flags=re.I).strip()
label = f"Episode {ep_label}" if ep_label else f"Episode {ep_num_str}"
else:
label = f"Episode {ep_num_str}"
labels.append(label)
def stream_link_for( result = labels if labels else [title]
self, title: str, season: str, episode: str if detail_url:
) -> Optional[str]: self._episode_cache[detail_url] = result
return result
def stream_link_for(self, title: str, season: str, episode: str) -> Optional[str]:
title = (title or "").strip() title = (title or "").strip()
url = self._title_to_url.get(title, "") season = (season or "").strip()
if not url: if not title:
return None return None
html = self._get_html(url) soup = self._get_detail_soup(title)
if not html: if soup is None:
return None return None
# Für Serien: nach Episode-Abschnitt filtern (wenn episode != title) if season == "Film" or not self._is_series.get(title, False):
ep_filter = "" if (season == "Film" or episode == title) else episode # Film: .mirrors [data-link]
links = self._get_hoster_links(html, ep_filter) links = self._extract_hoster_links(soup)
else:
# Serie: Episode-Container
staffel_nr = self._staffel_nr(season)
ep_idx = self._ep_index(episode)
episode_id = f"serie-{staffel_nr}_{ep_idx}"
links = self._extract_hoster_links(soup, episode_id)
return links[0] if links else None return links[0] if links else None
def resolve_stream_link(self, link: str) -> Optional[str]: def resolve_stream_link(self, link: str) -> Optional[str]:
@@ -252,7 +342,7 @@ class HDFilmePlugin(BasisPlugin):
def metadata_for( def metadata_for(
self, title: str self, title: str
) -> tuple[dict[str, str], dict[str, str], list | None]: ) -> tuple[dict[str, str], dict[str, str], list[object] | None]:
title = (title or "").strip() title = (title or "").strip()
if not title: if not title:
return {}, {}, None return {}, {}, None
@@ -260,17 +350,40 @@ class HDFilmePlugin(BasisPlugin):
info: dict[str, str] = {"title": title} info: dict[str, str] = {"title": title}
art: dict[str, str] = {} art: dict[str, str] = {}
# Cache-Hit
cached = self._title_meta.get(title) cached = self._title_meta.get(title)
if cached: if cached:
plot, poster, fanart = cached plot, poster = cached
if plot: if plot:
info["plot"] = plot info["plot"] = plot
if poster: if poster:
art["thumb"] = poster art["thumb"] = art["poster"] = poster
art["poster"] = poster if info or art:
if fanart: return info, art, None
art["fanart"] = fanart
# Detailseite laden
soup = self._get_detail_soup(title)
if soup is None:
return info, art, None
og_desc = soup.find("meta", attrs={"property": "og:description"})
if og_desc and og_desc.get("content"):
info["plot"] = og_desc["content"].strip()
og_img = soup.find("meta", attrs={"property": "og:image"})
poster = ""
if og_img and og_img.get("content"):
poster = _absolute_url(og_img["content"].strip())
art["thumb"] = art["poster"] = poster
# Jahr aus Textabschnitt "Titel YYYY"
year_el = soup.select_one("p.text-capitalize")
if year_el:
m = re.search(r"\b(19|20)\d{2}\b", year_el.get_text())
if m:
info["year"] = m.group()
self._title_meta[title] = (info.get("plot", ""), poster)
return info, art, None return info, art, None
# ------------------------------------------------------------------ # ------------------------------------------------------------------
@@ -278,12 +391,30 @@ class HDFilmePlugin(BasisPlugin):
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def latest_titles(self, page: int = 1) -> List[str]: def latest_titles(self, page: int = 1) -> List[str]:
html = self._get_html(_URL_NEW) if not REQUESTS_AVAILABLE:
return self._parse_entries(html) if html else [] return []
page = max(1, int(page or 1))
url = _URL_NEW if page == 1 else f"{_URL_NEW}page/{page}/"
return self._parse_entries(_get_soup(url))
def popular_series(self) -> List[str]: def popular_series(self) -> List[str]:
html = self._get_html(_URL_SERIES) if not REQUESTS_AVAILABLE:
return self._parse_entries(html) if html else [] return []
return self._parse_entries(_get_soup(_URL_SERIES))
def genres(self) -> List[str]:
    """Return all supported genre names in alphabetical order."""
    return sorted(GENRE_SLUGS)
def titles_for_genre(self, genre: str) -> List[str]:
    """Shorthand for the first result page of *genre*."""
    return self.titles_for_genre_page(genre)
def titles_for_genre_page(self, genre: str, page: int = 1) -> List[str]:
    """List titles for *genre* on result *page* (1-based).

    Returns [] for unknown genres or when the HTTP dependencies are
    unavailable. Page numbers below 1 are clamped to 1; page 1 uses the
    plain genre URL, later pages append the ``page/{n}/`` path segment.
    """
    slug = GENRE_SLUGS.get(genre, "")
    if not (slug and REQUESTS_AVAILABLE):
        return []
    page_nr = max(1, int(page or 1))
    listing_url = f"{BASE_URL}/{slug}/"
    if page_nr > 1:
        listing_url = f"{listing_url}page/{page_nr}/"
    return self._parse_entries(_get_soup(listing_url))
def capabilities(self) -> set[str]: def capabilities(self) -> set[str]:
return {"latest_titles", "popular_series"} return {"latest_titles", "popular_series", "genres"}