dev: normalize filter.genre_* labels in genre parsing
This commit is contained in:
@@ -357,6 +357,28 @@ def _get_soup_simple(url: str) -> BeautifulSoupT:
|
|||||||
return BeautifulSoup(body, "html.parser")
|
return BeautifulSoup(body, "html.parser")
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_genre_label(raw: str) -> str:
|
||||||
|
text = unescape(re.sub(r"\s+", " ", str(raw or ""))).strip()
|
||||||
|
if not text:
|
||||||
|
return ""
|
||||||
|
key_prefix = "filter.genre_"
|
||||||
|
if text.casefold().startswith(key_prefix):
|
||||||
|
slug = text[len(key_prefix) :].strip().casefold()
|
||||||
|
slug = slug.replace("_", "-")
|
||||||
|
slug = re.sub(r"[^a-z0-9-]+", "-", slug).strip("-")
|
||||||
|
if not slug:
|
||||||
|
return ""
|
||||||
|
special = {
|
||||||
|
"doku-soap": "Doku-Soap",
|
||||||
|
"scifi": "SciFi",
|
||||||
|
"fighting-shounen": "Fighting-Shounen",
|
||||||
|
}
|
||||||
|
if slug in special:
|
||||||
|
return special[slug]
|
||||||
|
return " ".join(chunk.capitalize() for chunk in slug.split("-") if chunk)
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
def _extract_genre_names_from_html(body: str) -> List[str]:
|
def _extract_genre_names_from_html(body: str) -> List[str]:
|
||||||
names: List[str] = []
|
names: List[str] = []
|
||||||
seen: set[str] = set()
|
seen: set[str] = set()
|
||||||
@@ -366,7 +388,7 @@ def _extract_genre_names_from_html(body: str) -> List[str]:
|
|||||||
)
|
)
|
||||||
for match in pattern.finditer(body or ""):
|
for match in pattern.finditer(body or ""):
|
||||||
text = re.sub(r"<[^>]+>", " ", match.group(1) or "")
|
text = re.sub(r"<[^>]+>", " ", match.group(1) or "")
|
||||||
text = unescape(re.sub(r"\s+", " ", text)).strip()
|
text = _normalize_genre_label(text)
|
||||||
if not text:
|
if not text:
|
||||||
continue
|
continue
|
||||||
key = text.casefold()
|
key = text.casefold()
|
||||||
@@ -1193,7 +1215,7 @@ class AniworldPlugin(BasisPlugin):
|
|||||||
genre_blocks = soup.select("div.genre")
|
genre_blocks = soup.select("div.genre")
|
||||||
for genre_block in genre_blocks:
|
for genre_block in genre_blocks:
|
||||||
name_node = genre_block.select_one(".seriesGenreList h3")
|
name_node = genre_block.select_one(".seriesGenreList h3")
|
||||||
genre_name = (name_node.get_text(" ", strip=True) if name_node else "").strip()
|
genre_name = _normalize_genre_label(name_node.get_text(" ", strip=True) if name_node else "")
|
||||||
if not genre_name:
|
if not genre_name:
|
||||||
continue
|
continue
|
||||||
entries: List[SeriesResult] = []
|
entries: List[SeriesResult] = []
|
||||||
|
|||||||
@@ -468,6 +468,27 @@ def _get_soup_simple(url: str) -> BeautifulSoupT:
|
|||||||
|
|
||||||
|
|
||||||
def _extract_genre_names_from_html(body: str) -> List[str]:
|
def _extract_genre_names_from_html(body: str) -> List[str]:
|
||||||
|
def _normalize_genre_label(raw: str) -> str:
|
||||||
|
text = unescape(re.sub(r"\s+", " ", str(raw or ""))).strip()
|
||||||
|
if not text:
|
||||||
|
return ""
|
||||||
|
key_prefix = "filter.genre_"
|
||||||
|
if text.casefold().startswith(key_prefix):
|
||||||
|
slug = text[len(key_prefix) :].strip().casefold()
|
||||||
|
slug = slug.replace("_", "-")
|
||||||
|
slug = re.sub(r"[^a-z0-9-]+", "-", slug).strip("-")
|
||||||
|
if not slug:
|
||||||
|
return ""
|
||||||
|
special = {
|
||||||
|
"doku-soap": "Doku-Soap",
|
||||||
|
"scifi": "SciFi",
|
||||||
|
"fighting-shounen": "Fighting-Shounen",
|
||||||
|
}
|
||||||
|
if slug in special:
|
||||||
|
return special[slug]
|
||||||
|
return " ".join(chunk.capitalize() for chunk in slug.split("-") if chunk)
|
||||||
|
return text
|
||||||
|
|
||||||
names: List[str] = []
|
names: List[str] = []
|
||||||
seen: set[str] = set()
|
seen: set[str] = set()
|
||||||
pattern = re.compile(
|
pattern = re.compile(
|
||||||
@@ -476,7 +497,7 @@ def _extract_genre_names_from_html(body: str) -> List[str]:
|
|||||||
)
|
)
|
||||||
for match in pattern.finditer(body or ""):
|
for match in pattern.finditer(body or ""):
|
||||||
text = re.sub(r"<[^>]+>", " ", match.group(1) or "")
|
text = re.sub(r"<[^>]+>", " ", match.group(1) or "")
|
||||||
text = unescape(re.sub(r"\s+", " ", text)).strip()
|
text = _normalize_genre_label(text)
|
||||||
if not text:
|
if not text:
|
||||||
continue
|
continue
|
||||||
key = text.casefold()
|
key = text.casefold()
|
||||||
@@ -726,11 +747,32 @@ def parse_series_catalog(soup: BeautifulSoupT) -> Dict[str, List[SeriesResult]]:
|
|||||||
"""Parst die Serien-Übersicht (/serien) und liefert Genre -> Serienliste."""
|
"""Parst die Serien-Übersicht (/serien) und liefert Genre -> Serienliste."""
|
||||||
catalog: Dict[str, List[SeriesResult]] = {}
|
catalog: Dict[str, List[SeriesResult]] = {}
|
||||||
|
|
||||||
|
def _normalize_genre_label(raw: str) -> str:
|
||||||
|
text = re.sub(r"\s+", " ", str(raw or "")).strip()
|
||||||
|
if not text:
|
||||||
|
return ""
|
||||||
|
key_prefix = "filter.genre_"
|
||||||
|
if text.casefold().startswith(key_prefix):
|
||||||
|
slug = text[len(key_prefix) :].strip().casefold()
|
||||||
|
slug = slug.replace("_", "-")
|
||||||
|
slug = re.sub(r"[^a-z0-9-]+", "-", slug).strip("-")
|
||||||
|
if not slug:
|
||||||
|
return ""
|
||||||
|
special = {
|
||||||
|
"doku-soap": "Doku-Soap",
|
||||||
|
"scifi": "SciFi",
|
||||||
|
"fighting-shounen": "Fighting-Shounen",
|
||||||
|
}
|
||||||
|
if slug in special:
|
||||||
|
return special[slug]
|
||||||
|
return " ".join(chunk.capitalize() for chunk in slug.split("-") if chunk)
|
||||||
|
return text
|
||||||
|
|
||||||
# Neues Layout (Stand: 2026-01): Gruppen-Header + Liste.
|
# Neues Layout (Stand: 2026-01): Gruppen-Header + Liste.
|
||||||
# - Header: `div.background-1 ...` mit `h3`
|
# - Header: `div.background-1 ...` mit `h3`
|
||||||
# - Einträge: `ul.series-list` -> `li.series-item[data-search]` -> `a[href]`
|
# - Einträge: `ul.series-list` -> `li.series-item[data-search]` -> `a[href]`
|
||||||
for header in soup.select("div.background-1 h3"):
|
for header in soup.select("div.background-1 h3"):
|
||||||
group = (header.get_text(strip=True) or "").strip()
|
group = _normalize_genre_label(header.get_text(strip=True))
|
||||||
if not group:
|
if not group:
|
||||||
continue
|
continue
|
||||||
list_node = header.parent.find_next_sibling("ul", class_="series-list")
|
list_node = header.parent.find_next_sibling("ul", class_="series-list")
|
||||||
|
|||||||
Reference in New Issue
Block a user