# mindnet/app/core/chunking/chunking_parser.py

"""
FILE: app/core/chunking/chunking_parser.py
DESCRIPTION: Zerlegt Markdown in logische Einheiten (RawBlocks).
             Hält alle Überschriftenebenen (H1-H6) im Stream.
             Stellt die Funktion parse_edges_robust zur Verfügung.
             WP-24c v4.2.0: Identifiziert Edge-Zonen und markiert sie für Chunking-Ausschluss.
             WP-24c v4.2.5: Callout-Exclusion - Callouts werden als separate RawBlocks identifiziert und ausgeschlossen.
             WP-26 v1.0: Section-Type-Erkennung via [!section]-Callouts und automatische Section-Erkennung.
"""
import re
import os
import logging
from typing import List, Tuple, Set, Dict, Any, Optional
from .chunking_models import RawBlock
from .chunking_utils import extract_frontmatter_from_text

logger = logging.getLogger(__name__)

# Matches any run of whitespace; split_sentences uses it to collapse whitespace
# to a single space before splitting.
_WS = re.compile(r'\s+')
# Sentence boundary: whitespace preceded by '.', '!' or '?' and followed by an
# uppercase letter (A-Z, Ä, Ö, Ü), a digit, an opening German quote („) or '('.
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')

# WP-26 v1.0: pattern for [!section] callouts.
# Matches: > [!section] type-name
# NOTE(review): the capture group is \w+, so for a hyphenated name only the part
# before the first hyphen is captured, and only a single leading '>' is accepted.
_SECTION_CALLOUT_PATTERN = re.compile(r'^\s*>\s*\[!section\]\s*(\w+)', re.IGNORECASE)

# WP-26 v1.0: pattern for a block ID at the end of a heading title.
# Matches: ## Title ^block-id
_BLOCK_ID_PATTERN = re.compile(r'\^([a-zA-Z0-9_-]+)\s*$')

def split_sentences(text: str) -> list[str]:
    """Split text into sentences, tuned for German punctuation.

    Whitespace is first collapsed to single spaces. The text is then split at
    whitespace that follows '.', '!' or '?' and precedes an uppercase letter
    (including Ä/Ö/Ü), a digit, an opening German quote or '('.

    Returns:
        The non-empty, stripped sentence fragments; an empty list for blank input.
    """
    normalized = _WS.sub(' ', text.strip())
    if not normalized:
        return []

    sentences = []
    for fragment in _SENT_SPLIT.split(normalized):
        fragment = fragment.strip()
        if fragment:
            sentences.append(fragment)
    return sentences

# Start of an [!edge], [!abstract] or [!section] callout. One or more leading '>'
# are accepted so that nested callouts (">>") are recognised as well.
_CALLOUT_START_PATTERN = re.compile(r'^\s*>{1,}\s*\[!(edge|abstract|section)\]', re.IGNORECASE)
# ATX heading H1-H6; applied to the whitespace-stripped line.
_HEADING_PATTERN = re.compile(r'^(#{1,6})\s+(.*)')
# First H1 of the document (used as the note title).
_H1_PATTERN = re.compile(r'^#\s+(.*)', re.MULTILINE)

_DEFAULT_LLM_VALIDATION_HEADERS = "Unzugeordnete Kanten,Edge Pool,Candidates"
_DEFAULT_NOTE_SCOPE_HEADERS = "Smart Edges,Relationen,Global Links,Note-Level Relations,Globale Verbindungen"


def _env_header_list(var_name: str, default: str) -> List[str]:
    """Read a comma-separated header list from the environment; fall back to `default` if it is empty."""
    names = [h.strip() for h in os.getenv(var_name, default).split(",") if h.strip()]
    if not names:
        names = [h.strip() for h in default.split(",") if h.strip()]
    return names


def _env_int(var_name: str, default: int) -> int:
    """Read an integer from the environment; fall back to `default` (with a warning) if it is unset or malformed."""
    raw = os.getenv(var_name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        logger.warning("Invalid integer %r for %s; falling back to %d", raw, var_name, default)
        return default


def _quote_depth(line: str) -> int:
    """Count the consecutive '>' characters at the start of the line, ignoring leading whitespace."""
    stripped = line.lstrip()
    return len(stripped) - len(stripped.lstrip('>'))


def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
    """
    Split Markdown text into logical units (RawBlocks), keeping all headings H1-H6.

    - WP-24c v4.2.0: headings that match the configured edge-zone names/levels
      (LLM validation and note-scope zones) start an exclusion zone; every block
      emitted inside it carries exclude_from_chunking=True. Any other heading
      ends the zone.
    - WP-24c v4.2.6: [!edge] and [!abstract] callouts are emitted as "callout"
      blocks flagged with is_meta_content=True.
    - WP-26 v1.0: a [!section] callout sets the section type for the following
      blocks and is not emitted as a block itself. A trailing "^block-id" on a
      heading is stripped from the title and recorded as block_id.

    Configuration is read from the environment on every call:
    MINDNET_LLM_VALIDATION_HEADERS, MINDNET_NOTE_SCOPE_ZONE_HEADERS,
    MINDNET_LLM_VALIDATION_HEADER_LEVEL (default 3) and
    MINDNET_NOTE_SCOPE_HEADER_LEVEL (default 2). A malformed level value falls
    back to its default instead of raising.

    Args:
        md_text: Full Markdown text; frontmatter is removed via
            extract_frontmatter_from_text before parsing.

    Returns:
        Tuple of (list of RawBlocks, title of the first H1 or "Dokument").
    """
    blocks: List[RawBlock] = []
    h1_title = "Dokument"
    section_path = "/"
    current_section_title: Optional[str] = None

    # WP-26 v1.0: state for section-type tracking.
    current_section_type: Optional[str] = None  # None means "fall back to note_type"
    section_introduced_at_level: Optional[int] = None  # set once, by the first [!section] callout
    current_block_id: Optional[str] = None

    # WP-24c v4.2.0: True while inside an edge zone.
    in_exclusion_zone = False

    # Only the body is parsed; the frontmatter itself is not needed here.
    _, text_without_fm = extract_frontmatter_from_text(md_text)

    # Zone header names are compared case-insensitively.
    llm_validation_titles = {
        h.lower() for h in _env_header_list("MINDNET_LLM_VALIDATION_HEADERS", _DEFAULT_LLM_VALIDATION_HEADERS)
    }
    note_scope_titles = {
        h.lower() for h in _env_header_list("MINDNET_NOTE_SCOPE_ZONE_HEADERS", _DEFAULT_NOTE_SCOPE_HEADERS)
    }
    llm_validation_level = _env_int("MINDNET_LLM_VALIDATION_HEADER_LEVEL", 3)
    note_scope_level = _env_int("MINDNET_NOTE_SCOPE_HEADER_LEVEL", 2)

    # The first H1 becomes the note title (metadata only).
    h1_match = _H1_PATTERN.search(text_without_fm)
    if h1_match:
        h1_title = h1_match.group(1).strip()

    buffer: List[str] = []

    def emit(kind: str, text: str, level: Optional[int] = None, **extra: Any) -> None:
        """Append a RawBlock that carries the parser state at the time of the call."""
        blocks.append(RawBlock(
            kind, text, level, section_path, current_section_title,
            exclude_from_chunking=in_exclusion_zone,
            section_type=current_section_type,
            block_id=current_block_id,
            **extra,
        ))

    def flush_paragraph() -> None:
        """Emit the buffered lines as one paragraph block (if non-blank) and clear the buffer."""
        if buffer:
            content = "\n".join(buffer).strip()
            if content:
                emit("paragraph", content)
            buffer.clear()

    lines = text_without_fm.split('\n')
    skip_until = 0  # lines below this index were already consumed by a callout block

    for i, line in enumerate(lines):
        if i < skip_until:
            continue

        stripped = line.strip()

        # Callout detection runs before heading detection.
        callout_match = _CALLOUT_START_PATTERN.match(line)
        if callout_match:
            callout_type = callout_match.group(1).lower()  # "edge", "abstract" or "section"

            if callout_type == "section":
                # WP-26 v1.0: pure metadata. Only this single line is consumed and
                # the paragraph buffer is deliberately not flushed here.
                section_match = _SECTION_CALLOUT_PATTERN.match(line)
                if section_match:
                    current_section_type = section_match.group(1).lower()
                    if section_introduced_at_level is None:
                        # Derived from the number of '/' in section_path. The path
                        # is only ever "/" or "/<H2 title>", so this yields 2
                        # unless the H2 title itself contains '/'.
                        path_depth = section_path.count('/') if section_path else 1
                        section_introduced_at_level = max(1, path_depth + 1)
                    logger.debug(
                        "WP-26: Section-Type erkannt: '%s' bei '%s' (Level: %s)",
                        current_section_type, current_section_title, section_introduced_at_level,
                    )
                continue

            flush_paragraph()

            # Collect the callout: all directly following quote lines whose nesting
            # depth is at least that of the opening line. Depth is measured on the
            # whitespace-stripped line so that indentation does not distort it.
            depth = _quote_depth(line)
            callout_lines = [line]
            j = i + 1
            while j < len(lines):
                candidate = lines[j]
                if not candidate.strip().startswith('>') or _quote_depth(candidate) < depth:
                    break
                callout_lines.append(candidate)
                j += 1
            skip_until = j

            # WP-24c v4.2.6: flagged as meta content; exclude_from_chunking only
            # reflects whether the callout sits inside an edge zone.
            emit("callout", "\n".join(callout_lines), is_meta_content=True)
            continue

        heading_match = _HEADING_PATTERN.match(stripped)
        if heading_match:
            # The pending paragraph belongs to the previous heading's state, so it
            # must be flushed before any state below is updated.
            flush_paragraph()

            level = len(heading_match.group(1))
            title = heading_match.group(2).strip()

            # WP-26 v1.0: extract "^block-id" and remove it from the title.
            block_id_match = _BLOCK_ID_PATTERN.search(title)
            if block_id_match:
                current_block_id = block_id_match.group(1)
                title = _BLOCK_ID_PATTERN.sub('', title).strip()
            else:
                current_block_id = None

            # WP-26 v1.0 (FA-02b): a heading at the same or a higher level than the
            # one where the first section type was introduced resets the type.
            if section_introduced_at_level is not None and level <= section_introduced_at_level:
                current_section_type = None
                logger.debug("WP-26: Neue Section erkannt bei H%s '%s' -> Reset auf note_type", level, title)

            # WP-24c v4.2.0: a matching header opens an edge zone; any other header closes it.
            title_key = title.lower()
            starts_llm_zone = level == llm_validation_level and title_key in llm_validation_titles
            starts_note_scope_zone = level == note_scope_level and title_key in note_scope_titles
            in_exclusion_zone = starts_llm_zone or starts_note_scope_zone

            # Only H1 and H2 update the section path/title of the following blocks.
            if level == 1:
                current_section_title = title
                section_path = "/"
            elif level == 2:
                current_section_title = title
                section_path = f"/{current_section_title}"

            # The heading itself is emitted with the updated state (and is marked
            # as excluded when it opens an edge zone).
            emit("heading", stripped, level)
            continue

        # Blank lines and '---' separators terminate the current paragraph.
        if not stripped or stripped == "---":
            flush_paragraph()
            if stripped == "---":
                emit("separator", "---")
        else:
            buffer.append(line)

    flush_paragraph()

    # WP-26 v1.3: a [!section] callout may appear anywhere inside a heading section
    # and applies retroactively to the whole section.
    return _propagate_section_type_backwards(blocks, split_level=2), h1_title


def _propagate_section_type_backwards(blocks: List[RawBlock], split_level: int = 2) -> List[RawBlock]:
    """
    WP-26 v1.3: Apply a section's section_type to every block of that section.

    Blocks are grouped into runs that start at a heading whose level is
    <= split_level; blocks before the first such heading form their own run.
    Within each run, the first truthy section_type found is written to all
    blocks of the run, including the heading that opens it. Runs without any
    section_type are left untouched.

    Args:
        blocks: RawBlock objects in document order; modified in place.
        split_level: Highest heading level number that opens a new section
            (default 2, i.e. H1 and H2 split).

    Returns:
        The same list object that was passed in.
    """
    if not blocks:
        return blocks

    # Partition block positions into sections.
    groups: List[List[int]] = []
    for position, blk in enumerate(blocks):
        opens_section = (
            blk.kind == "heading"
            and blk.level is not None
            and blk.level <= split_level
        )
        if opens_section or not groups:
            groups.append([position])
        else:
            groups[-1].append(position)

    # Per section: the first section_type found wins and is applied to all members.
    for members in groups:
        inherited = next(
            (blocks[p].section_type for p in members if blocks[p].section_type),
            None,
        )
        if not inherited:
            continue
        for p in members:
            blocks[p].section_type = inherited

    return blocks

# Inline typed wikilink: [[rel:kind|target]]
_INLINE_REL_PATTERN = re.compile(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]')
# Header line of an edge callout: > [!edge] kind
# Case-insensitive, so it agrees with the callout detection in parse_blocks.
_EDGE_CALLOUT_PATTERN = re.compile(r'>+\s*\[!edge\]\s*([^:\s]+)', re.IGNORECASE)
# Any wikilink: [[...]]
_WIKILINK_PATTERN = re.compile(r'\[\[([^\]]+)\]\]')


def parse_edges_robust(text: str) -> List[Dict[str, Any]]:
    """
    Extract edge candidates from inline wikilinks and [!edge] callouts.

    Two passes are made over the text:
    1. Inline links of the form [[rel:kind|target]] anywhere in the text.
    2. Callouts introduced by "> [!edge] kind": every plain wikilink on the
       header line and on the following lines starting with '>' becomes an edge
       of that kind. Links containing "rel:" are skipped in this pass because
       pass 1 already collected them.

    WP-24c v4.2.9 Fix A: blank lines do not end a callout; only a non-empty line
    that does not start with '>' does.

    Args:
        text: Markdown text to scan.

    Returns:
        List of dicts with keys "edge" (str, "kind:target", kind lowercased)
        and "is_callout" (bool). Inline edges come first, then callout edges in
        document order.
    """
    found_edges: List[Dict[str, Any]] = []

    # 1. Inline wikilinks [[rel:kind|target]]
    for kind, target in _INLINE_REL_PATTERN.findall(text):
        k = kind.strip().lower()
        t = target.strip()
        if k and t:
            found_edges.append({"edge": f"{k}:{t}", "is_callout": False})

    # 2. Callout edges > [!edge] kind
    current_edge_type: Optional[str] = None
    for line in text.split('\n'):
        stripped = line.strip()

        header = _EDGE_CALLOUT_PATTERN.match(stripped)
        if header:
            current_edge_type = header.group(1).strip().lower()
        elif not current_edge_type:
            # Not inside an edge callout.
            continue
        elif not stripped.startswith('>'):
            # Blank lines keep the callout open; any other text closes it.
            if stripped:
                current_edge_type = None
            continue

        # Header or continuation line of the active callout: collect its links.
        for link in _WIKILINK_PATTERN.findall(stripped):
            if "rel:" not in link:
                found_edges.append({"edge": f"{current_edge_type}:{link}", "is_callout": True})

    return found_edges