Nightly: refactor for readability, progress callbacks, and resource handling
@@ -17,7 +17,7 @@ import os
 import re
 import time
 import unicodedata
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple
 from urllib.parse import quote

 try: # pragma: no cover - optional dependency
@@ -80,6 +80,16 @@ SESSION_CACHE_MAX_TITLE_URLS = 800
 CATALOG_SEARCH_TTL_SECONDS = 600
 CATALOG_SEARCH_CACHE_KEY = "catalog_index"
 _CATALOG_INDEX_MEMORY: tuple[float, List["SeriesResult"]] = (0.0, [])
+ProgressCallback = Optional[Callable[[str, Optional[int]], Any]]
+
+
+def _emit_progress(callback: ProgressCallback, message: str, percent: Optional[int] = None) -> None:
+    if not callable(callback):
+        return
+    try:
+        callback(str(message or ""), None if percent is None else int(percent))
+    except Exception:
+        return


 @dataclass
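Note on the new helper: ProgressCallback is any callable taking a message and an optional percent, and _emit_progress coerces its arguments and swallows callback exceptions, so a faulty progress handler can never abort a scrape. A minimal compatible callback, as a sketch (on_progress is illustrative, not part of this commit):

    from typing import Optional

    def on_progress(message: str, percent: Optional[int]) -> None:
        # percent may be None; render it only when present.
        prefix = f"[{percent:3d}%] " if percent is not None else ""
        print(prefix + message)

    _emit_progress(on_progress, "Server-Suche", 15)  # prints "[ 15%] Server-Suche"
    _emit_progress(None, "dropped")                  # no-op: None is not callable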
@@ -398,37 +408,56 @@ def _get_soup(url: str, *, session: Optional[RequestsSession] = None) -> BeautifulSoupT:
     _ensure_requests()
     _log_visit(url)
     sess = session or get_requests_session("serienstream", headers=HEADERS)
+    response = None
     try:
         response = sess.get(url, headers=HEADERS, timeout=DEFAULT_TIMEOUT)
         response.raise_for_status()
     except Exception as exc:
         _log_error(f"GET {url} failed: {exc}")
         raise
-    if response.url and response.url != url:
-        _log_url(response.url, kind="REDIRECT")
-    _log_response_html(url, response.text)
-    if _looks_like_cloudflare_challenge(response.text):
-        raise RuntimeError("Cloudflare-Schutz erkannt. requests reicht ggf. nicht aus.")
-    return BeautifulSoup(response.text, "html.parser")
+    try:
+        final_url = (response.url or url) if response is not None else url
+        body = (response.text or "") if response is not None else ""
+        if final_url != url:
+            _log_url(final_url, kind="REDIRECT")
+        _log_response_html(url, body)
+        if _looks_like_cloudflare_challenge(body):
+            raise RuntimeError("Cloudflare-Schutz erkannt. requests reicht ggf. nicht aus.")
+        return BeautifulSoup(body, "html.parser")
+    finally:
+        if response is not None:
+            try:
+                response.close()
+            except Exception:
+                pass


 def _get_html_simple(url: str) -> str:
     _ensure_requests()
     _log_visit(url)
     sess = get_requests_session("serienstream", headers=HEADERS)
+    response = None
     try:
         response = sess.get(url, headers=HEADERS, timeout=DEFAULT_TIMEOUT)
         response.raise_for_status()
     except Exception as exc:
         _log_error(f"GET {url} failed: {exc}")
         raise
-    if response.url and response.url != url:
-        _log_url(response.url, kind="REDIRECT")
-    body = response.text
-    _log_response_html(url, body)
-    if _looks_like_cloudflare_challenge(body):
-        raise RuntimeError("Cloudflare-Schutz erkannt. requests reicht ggf. nicht aus.")
-    return body
+    try:
+        final_url = (response.url or url) if response is not None else url
+        body = (response.text or "") if response is not None else ""
+        if final_url != url:
+            _log_url(final_url, kind="REDIRECT")
+        _log_response_html(url, body)
+        if _looks_like_cloudflare_challenge(body):
+            raise RuntimeError("Cloudflare-Schutz erkannt. requests reicht ggf. nicht aus.")
+        return body
+    finally:
+        if response is not None:
+            try:
+                response.close()
+            except Exception:
+                pass


 def _get_soup_simple(url: str) -> BeautifulSoupT:
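Both fetch helpers now release the HTTP response in a finally block, so the connection is returned to the pool even when the Cloudflare check raises mid-function. For comparison, the same guarantee can be written with contextlib.closing; this is only a sketch of the pattern, not code from this commit (fetch_body and its timeout default are hypothetical):

    import contextlib

    import requests

    def fetch_body(url: str, timeout: float = 15.0) -> str:
        # closing() invokes response.close() on exit, success or exception,
        # mirroring the explicit finally blocks in the diff above.
        with contextlib.closing(requests.get(url, timeout=timeout)) as response:
            response.raise_for_status()
            return response.text or ""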
@@ -472,6 +501,7 @@ def _search_series_api(query: str) -> List[SeriesResult]:
     terms.extend([token for token in query.split() if token])
     seen_urls: set[str] = set()
     for term in terms:
+        response = None
         try:
             response = sess.get(
                 f"{_get_base_url()}/api/search/suggest",
@@ -486,6 +516,12 @@ def _search_series_api(query: str) -> List[SeriesResult]:
             payload = response.json()
         except Exception:
             continue
+        finally:
+            if response is not None:
+                try:
+                    response.close()
+                except Exception:
+                    pass
         shows = payload.get("shows") if isinstance(payload, dict) else None
         if not isinstance(shows, list):
             continue
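One subtlety the new finally block relies on: finally also runs when the except branch hits continue, so the response is closed exactly once per loop iteration no matter how the iteration exits. A tiny standalone demonstration of that control flow (values illustrative):

    def demo() -> None:
        for raw in ("1", "x", "2"):
            try:
                number = int(raw)
            except ValueError:
                continue  # skip unparsable input ...
            finally:
                # ... but cleanup still runs, on success and on the continue above
                print(f"cleanup for {raw!r}")
            print(f"parsed {number}")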
@@ -558,7 +594,7 @@ def _search_series_server(query: str) -> List[SeriesResult]:
     return []


-def _extract_catalog_index_from_html(body: str) -> List[SeriesResult]:
+def _extract_catalog_index_from_html(body: str, *, progress_callback: ProgressCallback = None) -> List[SeriesResult]:
     items: List[SeriesResult] = []
     if not body:
         return items
@@ -569,7 +605,9 @@ def _extract_catalog_index_from_html(body: str) -> List[SeriesResult]:
     )
     anchor_re = re.compile(r"<a[^>]+href=[\"']([^\"']+)[\"'][^>]*>(.*?)</a>", re.IGNORECASE | re.DOTALL)
     data_search_re = re.compile(r"data-search=[\"']([^\"']*)[\"']", re.IGNORECASE)
-    for match in item_re.finditer(body):
+    for idx, match in enumerate(item_re.finditer(body), start=1):
+        if idx == 1 or idx % 200 == 0:
+            _emit_progress(progress_callback, f"Katalog parsen {idx}", 62)
         block = match.group(0)
         inner = match.group(1) or ""
         anchor_match = anchor_re.search(inner)
@@ -651,26 +689,33 @@ def _store_catalog_index_in_cache(items: List[SeriesResult]) -> None:
     _session_cache_set(CATALOG_SEARCH_CACHE_KEY, payload, ttl_seconds=CATALOG_SEARCH_TTL_SECONDS)


-def search_series(query: str) -> List[SeriesResult]:
+def search_series(query: str, *, progress_callback: ProgressCallback = None) -> List[SeriesResult]:
     """Searches the (/serien) catalog for series by title. Uses cache + a single-pass filter."""
     _ensure_requests()
     if not _normalize_search_text(query):
         return []
+    _emit_progress(progress_callback, "Server-Suche", 15)
     server_results = _search_series_server(query)
     if server_results:
+        _emit_progress(progress_callback, f"Server-Treffer: {len(server_results)}", 35)
         return [entry for entry in server_results if entry.title and _matches_query(query, title=entry.title)]
+    _emit_progress(progress_callback, "Pruefe Such-Cache", 42)
     cached = _load_catalog_index_from_cache()
     if cached is not None:
+        _emit_progress(progress_callback, f"Cache-Treffer: {len(cached)}", 52)
         return [entry for entry in cached if entry.title and _matches_query(query, title=entry.title)]

+    _emit_progress(progress_callback, "Lade Katalogseite", 58)
     catalog_url = f"{_get_base_url()}/serien?by=genre"
     body = _get_html_simple(catalog_url)
-    items = _extract_catalog_index_from_html(body)
+    items = _extract_catalog_index_from_html(body, progress_callback=progress_callback)
     if not items:
+        _emit_progress(progress_callback, "Fallback-Parser", 70)
         soup = BeautifulSoup(body, "html.parser")
         items = _catalog_index_from_soup(soup)
     if items:
         _store_catalog_index_in_cache(items)
+    _emit_progress(progress_callback, f"Filtere Treffer ({len(items)})", 85)
     return [entry for entry in items if entry.title and _matches_query(query, title=entry.title)]

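Taken together, the percents emitted by search_series trace its fallback chain: 15 server search, 35 server hits, 42 cache probe, 52 cache hits, 58 catalog download, 62 parsing, 70 soup fallback, 85 final filtering. With a callback like the on_progress sketch above, a cache-miss run might print something like this (query and counts are illustrative):

    results = search_series("Dark", progress_callback=on_progress)
    # [ 15%] Server-Suche
    # [ 42%] Pruefe Such-Cache
    # [ 58%] Lade Katalogseite
    # [ 62%] Katalog parsen 1
    # [ 85%] Filtere Treffer (843)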
@@ -989,15 +1034,23 @@ def resolve_redirect(target_url: str) -> Optional[str]:
         _get_soup(_get_base_url(), session=session)
     except Exception:
         pass
-    response = session.get(
-        normalized_url,
-        headers=HEADERS,
-        timeout=DEFAULT_TIMEOUT,
-        allow_redirects=True,
-    )
-    if response.url:
-        _log_url(response.url, kind="RESOLVED")
-    return response.url if response.url else None
+    response = None
+    try:
+        response = session.get(
+            normalized_url,
+            headers=HEADERS,
+            timeout=DEFAULT_TIMEOUT,
+            allow_redirects=True,
+        )
+        if response.url:
+            _log_url(response.url, kind="RESOLVED")
+        return response.url if response.url else None
+    finally:
+        if response is not None:
+            try:
+                response.close()
+            except Exception:
+                pass


 def scrape_series_detail(
@@ -1681,7 +1734,7 @@ class SerienstreamPlugin(BasisPlugin):
             return self._episode_label_cache.get(cache_key, {}).get(episode_label)
         return None

-    async def search_titles(self, query: str) -> List[str]:
+    async def search_titles(self, query: str, progress_callback: ProgressCallback = None) -> List[str]:
         query = query.strip()
         if not query:
             self._series_results.clear()
@@ -1695,7 +1748,8 @@ class SerienstreamPlugin(BasisPlugin):
         try:
             # Uses the catalog (/serien), which is now grouped by genre.
             # An Ajax endpoint would be an alternative, but it is not always reliably reachable.
-            results = search_series(query)
+            _emit_progress(progress_callback, "Serienstream Suche startet", 10)
+            results = search_series(query, progress_callback=progress_callback)
         except Exception as exc: # pragma: no cover - defensive logging
             self._series_results.clear()
             self._season_cache.clear()
@@ -1708,6 +1762,7 @@ class SerienstreamPlugin(BasisPlugin):
         self._season_cache.clear()
         self._season_links_cache.clear()
         self._episode_label_cache.clear()
+        _emit_progress(progress_callback, f"Treffer aufbereitet: {len(results)}", 95)
         return [result.title for result in results]

     def _ensure_seasons(self, title: str) -> List[SeasonInfo]:
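At the plugin layer, search_titles threads the same callback through to search_series and reports 10 at start and 95 once titles are prepared. A hedged sketch of driving it from asyncio (plugin construction is outside this diff, so the instance here is assumed, and on_progress is the sketch from above):

    import asyncio

    async def run_search(plugin: "SerienstreamPlugin") -> None:
        titles = await plugin.search_titles("Dark", progress_callback=on_progress)
        print(f"{len(titles)} titles")

    # asyncio.run(run_search(plugin))  # given an initialized plugin instance

One design point worth noting: search_series is called synchronously inside the coroutine, so a long catalog download still blocks the event loop; offloading it via asyncio.to_thread would be one way around that if it ever matters.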