dev: normalize filter.genre_* labels in genre parsing

This commit is contained in:
2026-02-24 18:50:31 +01:00
parent 16e4b5f261
commit 76b04ddaf2
2 changed files with 68 additions and 4 deletions

View File

@@ -468,6 +468,27 @@ def _get_soup_simple(url: str) -> BeautifulSoupT:
def _extract_genre_names_from_html(body: str) -> List[str]:
def _normalize_genre_label(raw: str) -> str:
text = unescape(re.sub(r"\s+", " ", str(raw or ""))).strip()
if not text:
return ""
key_prefix = "filter.genre_"
if text.casefold().startswith(key_prefix):
slug = text[len(key_prefix) :].strip().casefold()
slug = slug.replace("_", "-")
slug = re.sub(r"[^a-z0-9-]+", "-", slug).strip("-")
if not slug:
return ""
special = {
"doku-soap": "Doku-Soap",
"scifi": "SciFi",
"fighting-shounen": "Fighting-Shounen",
}
if slug in special:
return special[slug]
return " ".join(chunk.capitalize() for chunk in slug.split("-") if chunk)
return text
names: List[str] = []
seen: set[str] = set()
pattern = re.compile(
@@ -476,7 +497,7 @@ def _extract_genre_names_from_html(body: str) -> List[str]:
)
for match in pattern.finditer(body or ""):
text = re.sub(r"<[^>]+>", " ", match.group(1) or "")
text = unescape(re.sub(r"\s+", " ", text)).strip()
text = _normalize_genre_label(text)
if not text:
continue
key = text.casefold()
@@ -726,11 +747,32 @@ def parse_series_catalog(soup: BeautifulSoupT) -> Dict[str, List[SeriesResult]]:
"""Parst die Serien-Übersicht (/serien) und liefert Genre -> Serienliste."""
catalog: Dict[str, List[SeriesResult]] = {}
def _normalize_genre_label(raw: str) -> str:
text = re.sub(r"\s+", " ", str(raw or "")).strip()
if not text:
return ""
key_prefix = "filter.genre_"
if text.casefold().startswith(key_prefix):
slug = text[len(key_prefix) :].strip().casefold()
slug = slug.replace("_", "-")
slug = re.sub(r"[^a-z0-9-]+", "-", slug).strip("-")
if not slug:
return ""
special = {
"doku-soap": "Doku-Soap",
"scifi": "SciFi",
"fighting-shounen": "Fighting-Shounen",
}
if slug in special:
return special[slug]
return " ".join(chunk.capitalize() for chunk in slug.split("-") if chunk)
return text
# Neues Layout (Stand: 2026-01): Gruppen-Header + Liste.
# - Header: `div.background-1 ...` mit `h3`
# - Einträge: `ul.series-list` -> `li.series-item[data-search]` -> `a[href]`
for header in soup.select("div.background-1 h3"):
group = (header.get_text(strip=True) or "").strip()
group = _normalize_genre_label(header.get_text(strip=True))
if not group:
continue
list_node = header.parent.find_next_sibling("ul", class_="series-list")