""" FILE: app/core/chunking/chunking_parser.py DESCRIPTION: Zerlegt Markdown in logische Einheiten (RawBlocks). Hält alle Überschriftenebenen (H1-H6) im Stream. Stellt die Funktion parse_edges_robust zur Verfügung. WP-24c v4.2.0: Identifiziert Edge-Zonen und markiert sie für Chunking-Ausschluss. """ import re import os from typing import List, Tuple, Set from .chunking_models import RawBlock from .chunking_utils import extract_frontmatter_from_text _WS = re.compile(r'\s+') _SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])') def split_sentences(text: str) -> list[str]: """Teilt Text in Sätze auf unter Berücksichtigung deutscher Interpunktion.""" text = _WS.sub(' ', text.strip()) if not text: return [] # Splittet bei Punkt, Ausrufezeichen oder Fragezeichen, gefolgt von Leerzeichen und Großbuchstabe return [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()] def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]: """ Zerlegt Text in logische Einheiten (RawBlocks), inklusive H1-H6. WP-24c v4.2.0: Identifiziert Edge-Zonen (LLM-Validierung & Note-Scope) und markiert sie für Chunking-Ausschluss. """ blocks = [] h1_title = "Dokument" section_path = "/" current_section_title = None # Frontmatter entfernen fm, text_without_fm = extract_frontmatter_from_text(md_text) # WP-24c v4.2.0: Konfigurierbare Header-Namen und -Ebenen llm_validation_headers = os.getenv( "MINDNET_LLM_VALIDATION_HEADERS", "Unzugeordnete Kanten,Edge Pool,Candidates" ) llm_validation_header_list = [h.strip() for h in llm_validation_headers.split(",") if h.strip()] if not llm_validation_header_list: llm_validation_header_list = ["Unzugeordnete Kanten", "Edge Pool", "Candidates"] note_scope_headers = os.getenv( "MINDNET_NOTE_SCOPE_ZONE_HEADERS", "Smart Edges,Relationen,Global Links,Note-Level Relations,Globale Verbindungen" ) note_scope_header_list = [h.strip() for h in note_scope_headers.split(",") if h.strip()] if not note_scope_header_list: note_scope_header_list = ["Smart Edges", "Relationen", "Global Links", "Note-Level Relations", "Globale Verbindungen"] # Header-Ebenen konfigurierbar (Default: LLM=3, Note-Scope=2) llm_validation_level = int(os.getenv("MINDNET_LLM_VALIDATION_HEADER_LEVEL", "3")) note_scope_level = int(os.getenv("MINDNET_NOTE_SCOPE_HEADER_LEVEL", "2")) # Status-Tracking für Edge-Zonen in_exclusion_zone = False exclusion_zone_type = None # "llm_validation" oder "note_scope" # H1 für Note-Titel extrahieren (Metadaten-Zweck) h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE) if h1_match: h1_title = h1_match.group(1).strip() lines = text_without_fm.split('\n') buffer = [] for line in lines: stripped = line.strip() # Heading-Erkennung (H1 bis H6) heading_match = re.match(r'^(#{1,6})\s+(.*)', stripped) if heading_match: # Vorherigen Text-Block abschließen if buffer: content = "\n".join(buffer).strip() if content: blocks.append(RawBlock( "paragraph", content, None, section_path, current_section_title, exclude_from_chunking=in_exclusion_zone )) buffer = [] level = len(heading_match.group(1)) title = heading_match.group(2).strip() # WP-24c v4.2.0: Prüfe, ob dieser Header eine Edge-Zone startet is_llm_validation_zone = ( level == llm_validation_level and any(title.lower() == h.lower() for h in llm_validation_header_list) ) is_note_scope_zone = ( level == note_scope_level and any(title.lower() == h.lower() for h in note_scope_header_list) ) if is_llm_validation_zone: in_exclusion_zone = True exclusion_zone_type = "llm_validation" elif is_note_scope_zone: in_exclusion_zone = True exclusion_zone_type = "note_scope" elif in_exclusion_zone: # Neuer Header gefunden, der keine Edge-Zone ist -> Zone beendet in_exclusion_zone = False exclusion_zone_type = None # Pfad- und Titel-Update für die Metadaten der folgenden Blöcke if level == 1: current_section_title = title; section_path = "/" elif level == 2: current_section_title = title; section_path = f"/{current_section_title}" # Die Überschrift selbst als regulären Block hinzufügen (auch markiert, wenn in Zone) blocks.append(RawBlock( "heading", stripped, level, section_path, current_section_title, exclude_from_chunking=in_exclusion_zone )) continue # Trenner (---) oder Leerzeilen beenden Blöcke, außer innerhalb von Callouts if (not stripped or stripped == "---") and not line.startswith('>'): if buffer: content = "\n".join(buffer).strip() if content: blocks.append(RawBlock( "paragraph", content, None, section_path, current_section_title, exclude_from_chunking=in_exclusion_zone )) buffer = [] if stripped == "---": blocks.append(RawBlock( "separator", "---", None, section_path, current_section_title, exclude_from_chunking=in_exclusion_zone )) else: buffer.append(line) if buffer: content = "\n".join(buffer).strip() if content: blocks.append(RawBlock( "paragraph", content, None, section_path, current_section_title, exclude_from_chunking=in_exclusion_zone )) return blocks, h1_title def parse_edges_robust(text: str) -> Set[str]: """Extrahiert Kanten-Kandidaten aus Wikilinks und Callouts.""" found_edges = set() # 1. Wikilinks [[rel:kind|target]] inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text) for kind, target in inlines: k = kind.strip().lower() t = target.strip() if k and t: found_edges.add(f"{k}:{t}") # 2. Callout Edges > [!edge] kind lines = text.split('\n') current_edge_type = None for line in lines: stripped = line.strip() callout_match = re.match(r'>\s*\[!edge\]\s*([^:\s]+)', stripped) if callout_match: current_edge_type = callout_match.group(1).strip().lower() # Links in der gleichen Zeile des Callouts links = re.findall(r'\[\[([^\]]+)\]\]', stripped) for l in links: if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}") continue # Links in Folgezeilen des Callouts if current_edge_type and stripped.startswith('>'): links = re.findall(r'\[\[([^\]]+)\]\]', stripped) for l in links: if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}") elif not stripped.startswith('>'): current_edge_type = None return found_edges