# mindnet/app/core/chunking/chunking_parser.py

"""
FILE: app/core/chunking/chunking_parser.py
DESCRIPTION: Zerlegt Markdown in atomare Blöcke. Hält H1-Überschriften im Stream
             und gewährleistet die strukturelle Integrität von Callouts.
"""
import re
from typing import List, Tuple, Set
from .chunking_models import RawBlock
from .chunking_utils import extract_frontmatter_from_text

# Top-level H1 ("# Title") used for the document title. The gap after '#' is
# limited to horizontal whitespace and the title must be non-empty, so a bare
# '#' line can no longer swallow the *following* line as the title.
_H1_TITLE_RE = re.compile(r'^#[^\S\r\n]+(\S.*)', re.MULTILINE)
# ATX heading of level 1-6; applied to a single, already stripped line.
_HEADING_RE = re.compile(r'^(#{1,6})\s+(.*)')


def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
    """Split Markdown text into logical units (RawBlocks), including H1-H6.

    The text (after front matter removal) is scanned line by line:

    * A heading line closes the pending paragraph and emits a ``"heading"``
      block carrying the stripped line and its level. H1 resets the section
      path to ``"/"``; H2 sets it to ``"/<title>"``. Both update the current
      section title. H3-H6 leave path and title untouched.
    * A blank line or a ``---`` line closes the pending paragraph; ``---``
      additionally emits a ``"separator"`` block.
    * Every other line is appended to the pending paragraph. Callout lines
      (``> ...``, including a lone ``>``) are never blank after stripping, so
      a callout stays together in one paragraph block.

    Args:
        md_text: Raw Markdown source, optionally starting with front matter.

    Returns:
        A tuple ``(blocks, h1_title)`` where ``blocks`` is the ordered list of
        RawBlocks and ``h1_title`` is the text of the first unindented H1, or
        ``"Dokument"`` when the text contains none.
    """
    blocks: List[RawBlock] = []
    h1_title = "Dokument"
    section_path = "/"
    current_section_title = None
    # Only the body is needed here; the first element of the helper's result
    # (presumably the parsed front matter) is not used by this function.
    _, text_without_fm = extract_frontmatter_from_text(md_text)

    # Extract the H1 for the document metadata.
    h1_match = _H1_TITLE_RE.search(text_without_fm)
    if h1_match:
        h1_title = h1_match.group(1).strip()

    buffer: List[str] = []

    def flush_paragraph() -> None:
        """Emit the buffered lines as one paragraph block and reset the buffer."""
        content = "\n".join(buffer).strip()
        if content:
            blocks.append(
                RawBlock("paragraph", content, None, section_path, current_section_title)
            )
        buffer.clear()

    # NOTE(review): fenced code blocks are not recognized. A '# comment' line
    # inside a fence is treated as a heading and blank lines split the fence
    # into several paragraphs -- confirm whether downstream relies on this.
    for line in text_without_fm.split('\n'):
        stripped = line.strip()
        heading_match = _HEADING_RE.match(stripped)

        if heading_match:
            # Close the preceding text block before the heading is emitted.
            flush_paragraph()

            level = len(heading_match.group(1))
            title = heading_match.group(2).strip()

            # Only H1 and H2 influence the section path and title.
            if level == 1:
                current_section_title = title
                section_path = "/"
            elif level == 2:
                current_section_title = title
                section_path = f"/{title}"

            blocks.append(
                RawBlock("heading", stripped, level, section_path, current_section_title)
            )
            continue

        # Blank lines and '---' end the current paragraph. No explicit callout
        # check is needed: a line starting with '>' is never blank and never
        # equals '---' after stripping, so it always lands in the buffer.
        if not stripped or stripped == "---":
            flush_paragraph()
            if stripped == "---":
                blocks.append(
                    RawBlock("separator", "---", None, section_path, current_section_title)
                )
        else:
            buffer.append(line)

    # Emit whatever text is still pending at end of input.
    flush_paragraph()

    return blocks, h1_title

def split_sentences(text: str) -> list[str]:
    """Split text into sentences, taking German punctuation into account.

    All whitespace runs are first collapsed to single spaces. A boundary is
    placed at whitespace that follows '.', '!' or '?' and precedes an
    uppercase letter (A-Z, Ä, Ö, Ü), a digit, an opening German quote or an
    opening parenthesis. Empty or whitespace-only input yields an empty list.
    """
    normalized = re.sub(r'\s+', ' ', text.strip())
    if not normalized:
        return []

    sentences = []
    # Split after sentence-ending punctuation when the next token looks like
    # the start of a new sentence.
    for piece in re.split(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])', normalized):
        piece = piece.strip()
        if piece:
            sentences.append(piece)
    return sentences