""" FILE: app/core/graph/graph_extractors.py DESCRIPTION: Regex-basierte Extraktion von Relationen aus Text. AUDIT: Regex für Wikilinks liberalisiert (Umlaute, Sonderzeichen Support). """ import re from typing import List, Tuple # Fix: Erlaube alle Zeichen außer ']' im Target, statt nur a-z0-9. # Das fängt Umlaute, Emojis, '&', '#' und Leerzeichen ab. _WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([^\]]+)\]\]") _REL_PIPE = re.compile(r"\[\[\s*rel:(?P[a-z_]+)\s*\|\s*(?P[^\]]+?)\s*\]\]", re.IGNORECASE) _REL_SPACE = re.compile(r"\[\[\s*rel:(?P[a-z_]+)\s+(?P[^\]]+?)\s*\]\]", re.IGNORECASE) _REL_TEXT = re.compile(r"rel\s*:\s*(?P[a-z_]+)\s*\[\[\s*(?P[^\]]+?)\s*\]\]", re.IGNORECASE) _CALLOUT_START = re.compile(r"^\s*>\s*\[!edge\]\s*(.*)$", re.IGNORECASE) _REL_LINE = re.compile(r"^(?P[a-z_]+)\s*:\s*(?P.+?)\s*$", re.IGNORECASE) _WIKILINKS_IN_LINE = re.compile(r"\[\[([^\]]+)\]\]") def extract_typed_relations(text: str) -> Tuple[List[Tuple[str, str]], str]: """ Findet Inline-Relationen wie [[rel:depends_on Target]]. Gibt (Liste[(kind, target)], bereinigter_text) zurück. """ if not text: return [], "" pairs = [] def _collect(m): k, t = m.group("kind").strip().lower(), m.group("target").strip() pairs.append((k, t)) return "" text = _REL_PIPE.sub(_collect, text) text = _REL_SPACE.sub(_collect, text) text = _REL_TEXT.sub(_collect, text) return pairs, text def extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]: """Verarbeitet Obsidian [!edge]-Callouts.""" if not text: return [], text lines = text.splitlines(); out_pairs, keep_lines, i = [], [], 0 while i < len(lines): m = _CALLOUT_START.match(lines[i]) if not m: keep_lines.append(lines[i]); i += 1; continue block_lines = [m.group(1)] if m.group(1).strip() else [] i += 1 while i < len(lines) and lines[i].lstrip().startswith('>'): block_lines.append(lines[i].lstrip()[1:].lstrip()); i += 1 for bl in block_lines: mrel = _REL_LINE.match(bl) if not mrel: continue kind, targets = mrel.group("kind").strip().lower(), mrel.group("targets") or "" found = _WIKILINKS_IN_LINE.findall(targets) if found: for t in found: out_pairs.append((kind, t.strip())) return out_pairs, "\n".join(keep_lines) def extract_wikilinks(text: str) -> List[str]: """Findet Standard-Wikilinks [[Target]] oder [[Alias|Target]].""" if not text: return [] # match.group(1) ist jetzt das Target (dank des fixierten Regex) return [m.strip() for m in _WIKILINK_RE.findall(text) if m.strip()]