# mindnet/app/core/chunking/chunking_parser.py

"""
FILE: app/core/chunking/chunking_parser.py
DESCRIPTION: Zerlegt Markdown in Blöcke und extrahiert Kanten-Strings.
"""
import re
from typing import List, Tuple, Set
from .chunking_models import RawBlock
from .chunking_utils import extract_frontmatter_from_text

# Any run of whitespace (spaces, tabs, newlines).
_WS = re.compile(r'\s+')
# Sentence boundary: whitespace that follows '.', '!' or '?' and precedes an
# uppercase letter (incl. German umlauts), a digit, '„' or '('.
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')


def split_sentences(text: str) -> list[str]:
    """Split *text* into sentences.

    Whitespace runs are collapsed to single spaces first; the text is then
    cut at every boundary matched by ``_SENT_SPLIT``.

    Returns:
        The non-empty, stripped sentences in order; ``[]`` for blank input.
    """
    normalized = _WS.sub(' ', text.strip())
    if not normalized:
        return []
    pieces = map(str.strip, _SENT_SPLIT.split(normalized))
    # filter(None, ...) drops pieces that are empty after stripping.
    return list(filter(None, pieces))

# H1 line ("# Title"); matched against the stripped line.
_H1_LINE = re.compile(r'^#\s+(.*)')
# H2-H6 line; group 1 is the run of '#', group 2 the heading text.
_HEADING_LINE = re.compile(r'^(#{2,6})\s+(.*)')
# Possible code-fence delimiter: >= 3 backticks or tildes, then the rest.
_FENCE_LINE = re.compile(r'^(`{3,}|~{3,})(.*)$')


def _next_fence_state(stripped: str, open_fence):
    """Return the fence marker that is open after *stripped* (None if none).

    *open_fence* is the marker of the currently open fence, or None when the
    parser is outside any fenced code block.
    """
    match = _FENCE_LINE.match(stripped)
    if match is None:
        return open_fence
    marker, rest = match.group(1), match.group(2)
    if open_fence is None:
        # A backtick fence whose trailing text contains a backtick is inline
        # code (e.g. "```x``` text"), not a fence opener.
        if marker[0] == '`' and '`' in rest:
            return None
        return marker
    # A closer uses the same character, is at least as long as the opener and
    # carries no trailing text.
    closes = (
        marker[0] == open_fence[0]
        and len(marker) >= len(open_fence)
        and not rest
    )
    return None if closes else open_fence


def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
    """Split Markdown text into heading and paragraph blocks.

    Frontmatter is removed via ``extract_frontmatter_from_text``. The rest is
    processed line by line:

    * H1 lines are not emitted as blocks; the first one supplies the document
      title (default ``"Dokument"``).
    * H2-H6 lines become ``"heading"`` blocks carrying their level. Only an H2
      changes the section path (``"/<h2 title>"``); deeper headings keep the
      path of their parent H2.
    * Consecutive non-blank lines become one ``"paragraph"`` block; blank
      lines and headings (including H1) end the current paragraph.
    * Lines inside fenced code blocks (``` or ~~~) are always kept as
      paragraph text, so ``# comment`` lines in code samples are neither
      dropped as H1 nor mistaken for section headings. Blank lines inside a
      fence still end the current paragraph.

    Args:
        md_text: Raw Markdown, optionally starting with frontmatter.

    Returns:
        ``(blocks, h1_title)``: the blocks in document order and the title.
    """
    blocks: List[RawBlock] = []
    h1_title = "Dokument"
    title_found = False
    section_path = "/"
    current_h2 = None
    buffer: List[str] = []
    open_fence = None

    def flush() -> None:
        # Emit the buffered lines as one paragraph, using the section state
        # that is current at the time of the call.
        content = "\n".join(buffer).strip()
        if content:
            blocks.append(
                RawBlock("paragraph", content, None, section_path, current_h2)
            )
        buffer.clear()

    _, text_without_fm = extract_frontmatter_from_text(md_text)

    for line in text_without_fm.split('\n'):
        stripped = line.strip()

        was_fenced = open_fence is not None
        open_fence = _next_fence_state(stripped, open_fence)
        # True for fence delimiter lines and for everything between them.
        literal = was_fenced or open_fence is not None

        if not stripped:
            flush()
            continue

        if literal:
            buffer.append(line)
            continue

        h1_match = _H1_LINE.match(stripped)
        if h1_match:
            # H1 is the document title, not content; it still terminates the
            # paragraph in progress, like every other heading.
            flush()
            if not title_found:
                h1_title = h1_match.group(1).strip()
                title_found = True
            continue

        heading_match = _HEADING_LINE.match(stripped)
        if heading_match:
            flush()
            level = len(heading_match.group(1))
            if level == 2:
                current_h2 = heading_match.group(2).strip()
                section_path = f"/{current_h2}"
            blocks.append(
                RawBlock("heading", stripped, level, section_path, current_h2)
            )
            continue

        buffer.append(line)

    flush()
    return blocks, h1_title

# Inline typed relation: [[rel:kind|target]].
_REL_INLINE = re.compile(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]')
# First line of an edge callout: "> [!edge] <type>".
_EDGE_CALLOUT = re.compile(r'>\s*\[!edge\]\s*([^:\s]+)')
# Any wikilink: [[...]].
_WIKILINK = re.compile(r'\[\[([^\]]+)\]\]')


def parse_edges_robust(text: str) -> Set[str]:
    """Extract edge candidates from inline relations and edge callouts.

    Two notations are recognised:

    * Inline ``[[rel:kind|target]]`` yields ``"kind:target"`` (kind is
      lower-cased, both parts stripped; entries with an empty part are
      ignored).
    * A callout starting with ``> [!edge] <type>`` yields ``"type:target"``
      for every ``[[target]]`` on that line and on the directly following
      lines that start with ``>``. The first line not starting with ``>``
      ends the callout.

    Wikilinks inside a callout that themselves start with ``rel:`` are left
    to the inline rule. Callout targets are stripped and empty targets are
    skipped, matching the inline rule.

    Args:
        text: Markdown text to scan.

    Returns:
        The set of ``"<type>:<target>"`` strings found.
    """
    found_edges: Set[str] = set()

    for kind, target in _REL_INLINE.findall(text):
        edge_kind = kind.strip().lower()
        edge_target = target.strip()
        if edge_kind and edge_target:
            found_edges.add(f"{edge_kind}:{edge_target}")

    current_edge_type = None
    for line in text.split('\n'):
        stripped = line.strip()

        callout_match = _EDGE_CALLOUT.match(stripped)
        if callout_match:
            current_edge_type = callout_match.group(1).strip().lower()
        elif not stripped.startswith('>'):
            # Leaving the blockquote ends the callout.
            current_edge_type = None

        if current_edge_type is None:
            continue

        for raw_target in _WIKILINK.findall(stripped):
            link_target = raw_target.strip()
            # Prefix test, not substring test: only real "rel:" links are
            # skipped, not every target that happens to contain "rel:".
            if link_target and not link_target.startswith("rel:"):
                found_edges.add(f"{current_edge_type}:{link_target}")

    return found_edges