# mindnet/app/core/graph/graph_extractors.py

"""
FILE: app/core/graph/graph_extractors.py
DESCRIPTION: Regex-basierte Extraktion von Relationen aus Text.
"""
import re
from typing import List, Tuple

# Standard wikilink: [[Target]] or [[prefix|Target]]. An optional "prefix|" part is
# skipped and only the text after the pipe is captured (group 1). The capture is
# limited to ASCII letters, digits, "_", "-", "#", ":", "." and spaces.
# NOTE(review): targets containing non-ASCII characters (e.g. umlauts) will not
# match this pattern -- confirm that this restriction is intended.
_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([a-zA-Z0-9_\-#:. ]+)\]\]")
# Inline typed relation, pipe form: [[rel:KIND|Target]] (case-insensitive).
_REL_PIPE  = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s*\|\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
# Inline typed relation, space form: [[rel:KIND Target]] (case-insensitive).
_REL_SPACE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s+(?P<target>[^\]]+?)\s*\]\]",   re.IGNORECASE)
# Prefix form: rel:KIND [[Target]], with the "rel:" marker outside the brackets.
# NOTE(review): there is no word boundary before "rel", so the pattern can also
# match at the end of a longer word (e.g. "barrel: x [[Y]]") -- verify if that matters.
_REL_TEXT  = re.compile(r"rel\s*:\s*(?P<kind>[a-z_]+)\s*\[\[\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)

# Header line of an edge callout: "> [!edge] ...". Group 1 is the rest of the line
# after the marker (may be empty).
_CALLOUT_START = re.compile(r"^\s*>\s*\[!edge\]\s*(.*)$", re.IGNORECASE)
# A relation line inside a callout block: "KIND: targets".
_REL_LINE      = re.compile(r"^(?P<kind>[a-z_]+)\s*:\s*(?P<targets>.+?)\s*$", re.IGNORECASE)
# Any [[...]] occurrence; group 1 is the raw inner text, including a "|" if present.
_WIKILINKS_IN_LINE = re.compile(r"\[\[([^\]]+)\]\]")

def extract_typed_relations(text: str) -> Tuple[List[Tuple[str, str]], str]:
    """Extract inline typed relations and remove them from the text.

    Three notations are recognised, each applied as a separate pass over the
    whole text in this order: ``[[rel:KIND|Target]]``, ``[[rel:KIND Target]]``
    and ``rel:KIND [[Target]]``.

    Args:
        text: Text to scan. Empty or ``None`` input is returned unchanged,
            matching the behaviour of ``extract_callout_relations``.

    Returns:
        A tuple ``(pairs, remaining_text)``. ``pairs`` is a list of
        ``(kind, target)`` tuples with ``kind`` lower-cased and both values
        stripped. Because every notation is handled in its own pass, pairs are
        grouped by notation rather than strictly in document order.
        ``remaining_text`` is the input with every matched relation removed.
    """
    if not text:
        return [], text

    pairs: List[Tuple[str, str]] = []

    def _collect(match) -> str:
        kind = (match.group("kind") or "").strip().lower()
        target = (match.group("target") or "").strip()
        if kind and target:
            pairs.append((kind, target))
        # The match is always removed from the text, even when it is discarded
        # because kind or target turned out to be blank after stripping.
        return ""

    for pattern in (_REL_PIPE, _REL_SPACE, _REL_TEXT):
        text = pattern.sub(_collect, text)
    return pairs, text

def extract_callout_relations(text: str) -> Tuple[List[Tuple[str, str]], str]:
    """Extract relations from Obsidian ``[!edge]`` callouts and drop those blocks.

    A callout starts at a line matching ``> [!edge] ...`` and continues over
    all directly following lines whose first non-blank character is ``>``.
    Inside the block every line of the form ``KIND: targets`` yields relations;
    other lines of the block are discarded. Targets are taken from ``[[...]]``
    links if the line contains any, otherwise the target text is split on
    ``,`` and ``;``.

    Args:
        text: Text to scan. Empty or ``None`` input is returned unchanged.

    Returns:
        A tuple ``(pairs, remaining_text)``. ``pairs`` is a list of
        ``(kind, target)`` tuples with ``kind`` lower-cased and ``target``
        stripped; blank targets are skipped. ``remaining_text`` consists of
        all lines outside edge callouts, re-joined with ``"\\n"``.
    """
    if not text:
        return [], text

    lines = text.splitlines()
    out_pairs: List[Tuple[str, str]] = []
    keep_lines: List[str] = []
    i = 0
    while i < len(lines):
        start = _CALLOUT_START.match(lines[i])
        if not start:
            keep_lines.append(lines[i])
            i += 1
            continue

        # Anything after the "[!edge]" marker on the header line is treated as
        # the first line of the block.
        block_lines = [start.group(1)] if start.group(1).strip() else []
        i += 1
        # Consume the quoted continuation lines, removing the leading ">".
        while i < len(lines) and lines[i].lstrip().startswith(">"):
            block_lines.append(lines[i].lstrip()[1:].lstrip())
            i += 1

        for block_line in block_lines:
            rel = _REL_LINE.match(block_line)
            if not rel:
                continue
            kind = rel.group("kind").strip().lower()
            targets = rel.group("targets") or ""
            # Wikilinks take precedence; plain text is split on "," / ";".
            candidates = _WIKILINKS_IN_LINE.findall(targets) or re.split(r"[,;]", targets)
            # Skip blank targets in both branches (e.g. "[[ ]]" or "a,,b").
            out_pairs.extend(
                (kind, candidate.strip()) for candidate in candidates if candidate.strip()
            )
    return out_pairs, "\n".join(keep_lines)

def extract_wikilinks(text: str) -> List[str]:
    """Collect the captured text of every standard wikilink in ``text``.

    Matching is done with ``_WIKILINK_RE``; for each match the first capture
    group is taken and stripped of surrounding whitespace.

    Args:
        text: Text to scan. A falsy value (``None`` or ``""``) is treated as
            an empty string.

    Returns:
        The stripped capture of each match, in order of appearance.
    """
    source = text if text else ""
    links: List[str] = []
    for hit in _WIKILINK_RE.finditer(source):
        links.append(hit.group(1).strip())
    return links