diff --git a/app/core/graph/graph_extractors.py b/app/core/graph/graph_extractors.py index 9c1fedf..70d5ae5 100644 --- a/app/core/graph/graph_extractors.py +++ b/app/core/graph/graph_extractors.py @@ -1,11 +1,15 @@ """ FILE: app/core/graph/graph_extractors.py DESCRIPTION: Regex-basierte Extraktion von Relationen aus Text. + AUDIT: Regex für Wikilinks liberalisiert (Umlaute, Sonderzeichen Support). """ import re from typing import List, Tuple -_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([a-zA-Z0-9_\-#:. ]+)\]\]") +# Fix: Erlaube alle Zeichen außer ']' im Target, statt nur a-z0-9. +# Das fängt Umlaute, Emojis, '&', '#' und Leerzeichen ab. +_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([^\]]+)\]\]") + _REL_PIPE = re.compile(r"\[\[\s*rel:(?P[a-z_]+)\s*\|\s*(?P[^\]]+?)\s*\]\]", re.IGNORECASE) _REL_SPACE = re.compile(r"\[\[\s*rel:(?P[a-z_]+)\s+(?P[^\]]+?)\s*\]\]", re.IGNORECASE) _REL_TEXT = re.compile(r"rel\s*:\s*(?P[a-z_]+)\s*\[\[\s*(?P[^\]]+?)\s*\]\]", re.IGNORECASE) @@ -14,12 +18,16 @@ _CALLOUT_START = re.compile(r"^\s*>\s*\[!edge\]\s*(.*)$", re.IGNORECASE) _REL_LINE = re.compile(r"^(?P[a-z_]+)\s*:\s*(?P.+?)\s*$", re.IGNORECASE) _WIKILINKS_IN_LINE = re.compile(r"\[\[([^\]]+)\]\]") -def extract_typed_relations(text: str) -> Tuple[List[Tuple[str,str]], str]: - """Extrahiert [[rel:KIND|Target]].""" +def extract_typed_relations(text: str) -> Tuple[List[Tuple[str, str]], str]: + """ + Findet Inline-Relationen wie [[rel:depends_on Target]]. + Gibt (Liste[(kind, target)], bereinigter_text) zurück. + """ + if not text: return [], "" pairs = [] def _collect(m): - k, t = (m.group("kind") or "").strip().lower(), (m.group("target") or "").strip() - if k and t: pairs.append((k, t)) + k, t = m.group("kind").strip().lower(), m.group("target").strip() + pairs.append((k, t)) return "" text = _REL_PIPE.sub(_collect, text) text = _REL_SPACE.sub(_collect, text) @@ -45,11 +53,10 @@ def extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]: found = _WIKILINKS_IN_LINE.findall(targets) if found: for t in found: out_pairs.append((kind, t.strip())) - else: - for raw in re.split(r"[,;]", targets): - if raw.strip(): out_pairs.append((kind, raw.strip())) return out_pairs, "\n".join(keep_lines) def extract_wikilinks(text: str) -> List[str]: - """Extrahiert Standard-Wikilinks.""" - return [m.group(1).strip() for m in _WIKILINK_RE.finditer(text or "")] \ No newline at end of file + """Findet Standard-Wikilinks [[Target]] oder [[Alias|Target]].""" + if not text: return [] + # match.group(1) ist jetzt das Target (dank des fixierten Regex) + return [m.strip() for m in _WIKILINK_RE.findall(text) if m.strip()] \ No newline at end of file