From 2bcf1930fe2d44b54237ca50efb3e059afd3d877 Mon Sep 17 00:00:00 2001
From: Lars <Lars@stommer.de>
Date: Fri, 12 Dec 2025 08:57:25 +0100
Subject: [PATCH] WP15 Chunker

---
 app/core/chunker.py | 71 +++++++++++++++++++++++++++++++--------------
 1 file changed, 50 insertions(+), 21 deletions(-)

diff --git a/app/core/chunker.py b/app/core/chunker.py
index 2c9fbcb..f521427 100644
--- a/app/core/chunker.py
+++ b/app/core/chunker.py
@@ -11,7 +11,6 @@ import asyncio # Notwendig für asynchrone Chunking-Strategien
 
 # NEUE IMPORTS
 # Import der benötigten Klassen direkt (ersetzt get_semantic_analyzer)
-# ANNAHME: Die Klassen existieren in app/services/semantic_analyzer.py
 try:
     from app.services.semantic_analyzer import SemanticAnalyzer, SemanticChunkResult 
 except ImportError:
@@ -24,13 +23,40 @@ except ImportError:
         content: str
         suggested_edges: List[str] # Format: "kind:Target"
 
-# Import zum Auslesen des Frontmatters
-# ANNAHME: extract_frontmatter_from_text existiert in app.core.note_payload
-from app.core.note_payload import extract_frontmatter_from_text 
+
+# ==========================================
+# 1. FUNKTION ZUM AUSLESEN DES FRONTMATTERS
+# ==========================================
+
+def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]:
+    """
+    Split leading YAML frontmatter from a Markdown text.
+
+    Kept local to the chunker to avoid import cycles/errors. Returns
+    ``(frontmatter, body)``. Without a frontmatter block the result is
+    ``({}, md_text)`` with the text untouched; otherwise the body is stripped.
+    Unparseable or non-mapping YAML yields an empty dict.
+    """
+    # Both delimiters must be a line of their own ('---' plus optional blanks),
+    # so a '---' that merely starts a longer line does not end the block, and
+    # an empty block ('---' directly followed by '---') is recognised.
+    fm_match = re.match(r'---[ \t]*\r?\n(.*?)^---[ \t]*\r?$', md_text,
+                        re.DOTALL | re.MULTILINE)
+    if not fm_match:
+        return {}, md_text
+
+    try:
+        frontmatter = yaml.safe_load(fm_match.group(1))
+    except yaml.YAMLError:
+        frontmatter = {}  # best effort: broken YAML is treated as no metadata
+    if not isinstance(frontmatter, dict):
+        frontmatter = {}
+
+    return frontmatter, md_text[fm_match.end():].strip()
 
 
 # ==========================================
-# 1. CONFIGURATION LOADER (Ehemals chunk_config.py)
+# 2. CONFIGURATION LOADER
 # ==========================================
 
 # Pfad-Logik: app/core/chunker.py -> app/core -> app -> root/config/types.yaml
@@ -94,7 +120,7 @@ def get_sizes(note_type: str):
     }
 
 # ==========================================
-# 2. DATA CLASSES & HELPERS
+# 3. DATA CLASSES & HELPERS
 # ==========================================
 
 # --- Hilfen ---
@@ -137,7 +163,6 @@ class Chunk:
 # --- Markdown Parser ---
 def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
     """Parst MD und gibt Blöcke UND den H1 Titel zurück."""
-    # Im echten Mindnet-System würde hier die komplexe Logik stehen.
     
     md = MarkdownIt("commonmark").enable("table")
     tokens: List[Token] = md.parse(md_text)
@@ -147,14 +172,14 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
     h2, h3 = None, None
     section_path = "/"
     
-    # Rudimentäres Block-Parsing für non-LLM Strategien (zur Wahrung der Struktur)
-    text_without_fm = re.sub(r'---.*?---', '', md_text, flags=re.DOTALL)
+    # Rudimentäres Block-Parsing für non-LLM Strategien
+    fm, text_without_fm = extract_frontmatter_from_text(md_text)
     
     if text_without_fm.strip():
          blocks.append(RawBlock(kind="paragraph", text=text_without_fm.strip(), 
                                 level=None, section_path=section_path, section_title=h2))
 
-    # Realistischer wäre die Extraktion des H1 Titels hier
+    # H1 Titel Extraktion (für Context Injection in by_heading)
     h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE)
     if h1_match:
         h1_title = h1_match.group(1).strip()
@@ -162,7 +187,7 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
     return blocks, h1_title
 
 # ==========================================
-# 3. STRATEGIES (SYNCHRON)
+# 4. STRATEGIES (SYNCHRON)
 # ==========================================
 
 def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]:
@@ -229,9 +254,6 @@ def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id
     sections: Dict[str, List[RawBlock]] = {}
     ordered = []
     
-    # Anmerkung: Die ursprüngliche parse_blocks Logik zur H-Erkennung war detaillierter.
-    # Hier verwenden wir die rudimentäre RawBlock-Struktur.
-    
     for b in blocks:
         if b.kind == "heading": continue
         if b.section_path not in sections:
@@ -267,7 +289,7 @@ def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id
     return chunks
 
 # ==========================================
-# 4. STRATEGY (ASYNCHRON)
+# 5. STRATEGY (ASYNCHRON)
 # ==========================================
 
 # Singleton Instanz für den Analyzer
@@ -320,7 +342,7 @@ async def _strategy_semantic_llm(md_text: str, config: Dict[str, Any], note_id:
     return chunks
 
 # ==========================================
-# 5. MAIN ENTRY POINT (ASYNC)
+# 6. MAIN ENTRY POINT (ASYNC)
 # ==========================================
 
 async def assemble_chunks(note_id: str, md_text: str, note_type: str) -> List[Chunk]:
@@ -330,7 +352,7 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str) -> List[Ch
     """
     
     # 1. Frontmatter prüfen (Double-LLM-Prevention)
-    fm, _ = extract_frontmatter_from_text(md_text) 
+    fm, body = extract_frontmatter_from_text(md_text) 
     note_status = fm.get("status", "").lower()
     
     config = get_chunk_config(note_type)
@@ -343,20 +365,27 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str) -> List[Ch
     if strategy == "semantic_llm" and note_status in ["draft", "initial_gen"]:
         # Setze auf die zweitbeste, aber synchrone und deterministische Strategie
         print(f"INFO: Overriding '{strategy}' for draft status. Using 'by_heading' instead.")
-        strategy = "by_heading" # Fallback auf by_heading, da LLM-Generatoren saubere H2-Strukturen nutzen.
+        strategy = "by_heading" 
     
     # 3. Execution (Dispatcher)
     
+    # NOTE: both branches below are identical - the unmodified md_text (including frontmatter) is always what gets chunked.
+    if body:
+        md_to_chunk = md_text # Bei LLM oder By_Heading
+    else:
+        md_to_chunk = md_text
+
     if strategy == "semantic_llm":
-        chunks = await _strategy_semantic_llm(md_text, config, note_id, note_type)
+        # LLM-Strategie nutzt den gesamten MD-Text zur Orientierung
+        chunks = await _strategy_semantic_llm(md_to_chunk, config, note_id, note_type)
     
     elif strategy == "by_heading":
-        blocks, doc_title = parse_blocks(md_text)
+        blocks, doc_title = parse_blocks(md_to_chunk)
         # Synchronen Code in einem Thread ausführen
         chunks = await asyncio.to_thread(_strategy_by_heading, blocks, config, note_id, doc_title)
         
     else: # sliding_window (Default)
-        blocks, doc_title = parse_blocks(md_text)
+        blocks, doc_title = parse_blocks(md_to_chunk)
         # Synchronen Code in einem Thread ausführen
         chunks = await asyncio.to_thread(_strategy_sliding_window, blocks, config, note_id)