From 76b04ddaf225a154056f5a2c4eb6b0f5b5abcbde Mon Sep 17 00:00:00 2001
From: "itdrui.de"
Date: Tue, 24 Feb 2026 18:50:31 +0100
Subject: [PATCH] dev: normalize filter.genre_* labels in genre parsing

---
Note (ignored by git am): line breaks and indentation of this patch were
restored from a whitespace-collapsed copy. Hunk sizes match the @@ headers
and the diffstat below. Indentation of unchanged context lines is inferred
from the Python structure -- TODO confirm against the target files; if a
hunk does not match, apply with `git apply --ignore-whitespace`.

Review: _normalize_genre_label is defined three times by this patch
(module-level in aniworld_plugin.py, nested in two functions of
serienstream_plugin.py). The copy nested in parse_series_catalog does not
call unescape(); the other two do.

 addon/plugins/aniworld_plugin.py     | 26 ++++++++++++++--
 addon/plugins/serienstream_plugin.py | 46 ++++++++++++++++++++++++++--
 2 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/addon/plugins/aniworld_plugin.py b/addon/plugins/aniworld_plugin.py
index 943e865..03f85da 100644
--- a/addon/plugins/aniworld_plugin.py
+++ b/addon/plugins/aniworld_plugin.py
@@ -357,6 +357,28 @@ def _get_soup_simple(url: str) -> BeautifulSoupT:
     return BeautifulSoup(body, "html.parser")
 
 
+def _normalize_genre_label(raw: str) -> str:
+    text = unescape(re.sub(r"\s+", " ", str(raw or ""))).strip()
+    if not text:
+        return ""
+    key_prefix = "filter.genre_"
+    if text.casefold().startswith(key_prefix):
+        slug = text[len(key_prefix) :].strip().casefold()
+        slug = slug.replace("_", "-")
+        slug = re.sub(r"[^a-z0-9-]+", "-", slug).strip("-")
+        if not slug:
+            return ""
+        special = {
+            "doku-soap": "Doku-Soap",
+            "scifi": "SciFi",
+            "fighting-shounen": "Fighting-Shounen",
+        }
+        if slug in special:
+            return special[slug]
+        return " ".join(chunk.capitalize() for chunk in slug.split("-") if chunk)
+    return text
+
+
 def _extract_genre_names_from_html(body: str) -> List[str]:
     names: List[str] = []
     seen: set[str] = set()
@@ -366,7 +388,7 @@ def _extract_genre_names_from_html(body: str) -> List[str]:
     )
     for match in pattern.finditer(body or ""):
         text = re.sub(r"<[^>]+>", " ", match.group(1) or "")
-        text = unescape(re.sub(r"\s+", " ", text)).strip()
+        text = _normalize_genre_label(text)
         if not text:
             continue
         key = text.casefold()
@@ -1193,7 +1215,7 @@ class AniworldPlugin(BasisPlugin):
         genre_blocks = soup.select("div.genre")
         for genre_block in genre_blocks:
             name_node = genre_block.select_one(".seriesGenreList h3")
-            genre_name = (name_node.get_text(" ", strip=True) if name_node else "").strip()
+            genre_name = _normalize_genre_label(name_node.get_text(" ", strip=True) if name_node else "")
             if not genre_name:
                 continue
             entries: List[SeriesResult] = []
diff --git a/addon/plugins/serienstream_plugin.py b/addon/plugins/serienstream_plugin.py
index aebfe1b..8600b30 100644
--- a/addon/plugins/serienstream_plugin.py
+++ b/addon/plugins/serienstream_plugin.py
@@ -468,6 +468,27 @@ def _get_soup_simple(url: str) -> BeautifulSoupT:
 
 
 def _extract_genre_names_from_html(body: str) -> List[str]:
+    def _normalize_genre_label(raw: str) -> str:
+        text = unescape(re.sub(r"\s+", " ", str(raw or ""))).strip()
+        if not text:
+            return ""
+        key_prefix = "filter.genre_"
+        if text.casefold().startswith(key_prefix):
+            slug = text[len(key_prefix) :].strip().casefold()
+            slug = slug.replace("_", "-")
+            slug = re.sub(r"[^a-z0-9-]+", "-", slug).strip("-")
+            if not slug:
+                return ""
+            special = {
+                "doku-soap": "Doku-Soap",
+                "scifi": "SciFi",
+                "fighting-shounen": "Fighting-Shounen",
+            }
+            if slug in special:
+                return special[slug]
+            return " ".join(chunk.capitalize() for chunk in slug.split("-") if chunk)
+        return text
+
     names: List[str] = []
     seen: set[str] = set()
     pattern = re.compile(
@@ -476,7 +497,7 @@ def _extract_genre_names_from_html(body: str) -> List[str]:
     )
     for match in pattern.finditer(body or ""):
         text = re.sub(r"<[^>]+>", " ", match.group(1) or "")
-        text = unescape(re.sub(r"\s+", " ", text)).strip()
+        text = _normalize_genre_label(text)
         if not text:
             continue
         key = text.casefold()
@@ -726,11 +747,32 @@ def parse_series_catalog(soup: BeautifulSoupT) -> Dict[str, List[SeriesResult]]:
     """Parst die Serien-Übersicht (/serien) und liefert Genre -> Serienliste."""
     catalog: Dict[str, List[SeriesResult]] = {}
 
+    def _normalize_genre_label(raw: str) -> str:
+        text = re.sub(r"\s+", " ", str(raw or "")).strip()
+        if not text:
+            return ""
+        key_prefix = "filter.genre_"
+        if text.casefold().startswith(key_prefix):
+            slug = text[len(key_prefix) :].strip().casefold()
+            slug = slug.replace("_", "-")
+            slug = re.sub(r"[^a-z0-9-]+", "-", slug).strip("-")
+            if not slug:
+                return ""
+            special = {
+                "doku-soap": "Doku-Soap",
+                "scifi": "SciFi",
+                "fighting-shounen": "Fighting-Shounen",
+            }
+            if slug in special:
+                return special[slug]
+            return " ".join(chunk.capitalize() for chunk in slug.split("-") if chunk)
+        return text
+
     # Neues Layout (Stand: 2026-01): Gruppen-Header + Liste.
     # - Header: `div.background-1 ...` mit `h3`
     # - Einträge: `ul.series-list` -> `li.series-item[data-search]` -> `a[href]`
     for header in soup.select("div.background-1 h3"):
-        group = (header.get_text(strip=True) or "").strip()
+        group = _normalize_genre_label(header.get_text(strip=True))
         if not group:
             continue
         list_node = header.parent.find_next_sibling("ul", class_="series-list")