""" FILE: app/core/chunking/chunking_parser.py DESCRIPTION: Zerlegt Markdown in atomare Blöcke. Hält H1-Überschriften im Stream und gewährleistet die strukturelle Integrität von Callouts. """ import re from typing import List, Tuple, Set from .chunking_models import RawBlock from .chunking_utils import extract_frontmatter_from_text def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]: """Zerlegt Text in logische Einheiten (RawBlocks), inklusive H1-H6.""" blocks = [] h1_title = "Dokument"; section_path = "/"; current_section_title = None fm, text_without_fm = extract_frontmatter_from_text(md_text) # H1 für Metadaten extrahieren h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE) if h1_match: h1_title = h1_match.group(1).strip() lines = text_without_fm.split('\n') buffer = [] for line in lines: stripped = line.strip() heading_match = re.match(r'^(#{1,6})\s+(.*)', stripped) if heading_match: # Vorherigen Text-Block abschließen if buffer: content = "\n".join(buffer).strip() if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_section_title)) buffer = [] level = len(heading_match.group(1)) title = heading_match.group(2).strip() # Pfad- und Titel-Update if level == 1: current_section_title = title; section_path = "/" elif level == 2: current_section_title = title; section_path = f"/{current_section_title}" blocks.append(RawBlock("heading", stripped, level, section_path, current_section_title)) continue # Trenner oder Leerzeilen beenden Blöcke, außer innerhalb von Callouts if (not stripped or stripped == "---") and not line.startswith('>'): if buffer: content = "\n".join(buffer).strip() if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_section_title)) buffer = [] if stripped == "---": blocks.append(RawBlock("separator", "---", None, section_path, current_section_title)) else: buffer.append(line) if buffer: content = "\n".join(buffer).strip() if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_section_title)) return blocks, h1_title def split_sentences(text: str) -> list[str]: """Teilt Text in Sätze auf unter Berücksichtigung deutscher Interpunktion.""" text = re.sub(r'\s+', ' ', text.strip()) if not text: return [] # Splittet bei Satzzeichen, gefolgt von Leerzeichen und Großbuchstaben return [s.strip() for s in re.split(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])', text) if s.strip()]