WP15 Chunker

2025-12-12 08:57:25 +01:00 · 2025-12-12 08:57:25 +01:00 · 2bcf1930fe
commit 2bcf1930fe
parent 2d58220a3c
1 changed files with 50 additions and 21 deletions
--- a/app/core/chunker.py
+++ b/app/core/chunker.py
@@ -11,7 +11,6 @@ import asyncio # Notwendig für asynchrone Chunking-Strategien

 # NEUE IMPORTS
 # Import der benötigten Klassen direkt (ersetzt get_semantic_analyzer)
-# ANNAHME: Die Klassen existieren in app/services/semantic_analyzer.py
 try:
    from app.services.semantic_analyzer import SemanticAnalyzer, SemanticChunkResult 
 except ImportError:
@@ -24,13 +23,40 @@ except ImportError:
        content: str
        suggested_edges: List[str] # Format: "kind:Target"

-# Import zum Auslesen des Frontmatters
-# ANNAHME: extract_frontmatter_from_text existiert in app.core.note_payload
-from app.core.note_payload import extract_frontmatter_from_text 
+
+# ==========================================
+# 1. FUNKTION ZUM AUSLESEN DES FRONTMATTERS
+# ==========================================
+
+def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]:
+    """
+    Extrahiert das YAML-Frontmatter aus dem Markdown-Text und gibt Frontmatter und Body zurück.
+    (Lokalisiert im Chunker zur Vermeidung von Import-Zyklen/Fehlern)
+    """
+    # Regulärer Ausdruck, der den YAML-Block findet
+    fm_match = re.match(r'^---\s*\n(.*?)\n---', md_text, re.DOTALL)
+    
+    if not fm_match:
+        # Kein Frontmatter gefunden, gib leeres Dict und gesamten Text zurück
+        return {}, md_text
+
+    frontmatter_yaml = fm_match.group(1)
+    
+    try:
+        frontmatter = yaml.safe_load(frontmatter_yaml)
+        if not isinstance(frontmatter, dict):
+            frontmatter = {}
+    except yaml.YAMLError:
+        frontmatter = {}
+
+    # Entferne den Frontmatter Block aus dem Text
+    text_without_fm = re.sub(r'^---\s*\n(.*?)\n---', '', md_text, flags=re.DOTALL)
+    
+    return frontmatter, text_without_fm.strip()


 # ==========================================
-# 1. CONFIGURATION LOADER (Ehemals chunk_config.py)
+# 2. CONFIGURATION LOADER
 # ==========================================

 # Pfad-Logik: app/core/chunker.py -> app/core -> app -> root/config/types.yaml
@@ -94,7 +120,7 @@ def get_sizes(note_type: str):
    }

 # ==========================================
-# 2. DATA CLASSES & HELPERS
+# 3. DATA CLASSES & HELPERS
 # ==========================================

 # --- Hilfen ---
@@ -137,7 +163,6 @@ class Chunk:
 # --- Markdown Parser ---
 def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
    """Parst MD und gibt Blöcke UND den H1 Titel zurück."""
-    # Im echten Mindnet-System würde hier die komplexe Logik stehen.
    
    md = MarkdownIt("commonmark").enable("table")
    tokens: List[Token] = md.parse(md_text)
@@ -147,14 +172,14 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
    h2, h3 = None, None
    section_path = "/"
    
-    # Rudimentäres Block-Parsing für non-LLM Strategien (zur Wahrung der Struktur)
-    text_without_fm = re.sub(r'---.*?---', '', md_text, flags=re.DOTALL)
+    # Rudimentäres Block-Parsing für non-LLM Strategien
+    fm, text_without_fm = extract_frontmatter_from_text(md_text)
    
    if text_without_fm.strip():
         blocks.append(RawBlock(kind="paragraph", text=text_without_fm.strip(), 
                                level=None, section_path=section_path, section_title=h2))

-    # Realistischer wäre die Extraktion des H1 Titels hier
+    # H1 Titel Extraktion (für Context Injection in by_heading)
    h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE)
    if h1_match:
        h1_title = h1_match.group(1).strip()
@@ -162,7 +187,7 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
    return blocks, h1_title

 # ==========================================
-# 3. STRATEGIES (SYNCHRON)
+# 4. STRATEGIES (SYNCHRON)
 # ==========================================

 def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]:
@@ -229,9 +254,6 @@ def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id
    sections: Dict[str, List[RawBlock]] = {}
    ordered = []
    
-    # Anmerkung: Die ursprüngliche parse_blocks Logik zur H-Erkennung war detaillierter.
-    # Hier verwenden wir die rudimentäre RawBlock-Struktur.
-    
    for b in blocks:
        if b.kind == "heading": continue
        if b.section_path not in sections:
@@ -267,7 +289,7 @@ def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id
    return chunks

 # ==========================================
-# 4. STRATEGY (ASYNCHRON)
+# 5. STRATEGY (ASYNCHRON)
 # ==========================================

 # Singleton Instanz für den Analyzer
@@ -320,7 +342,7 @@ async def _strategy_semantic_llm(md_text: str, config: Dict[str, Any], note_id:
    return chunks

 # ==========================================
-# 5. MAIN ENTRY POINT (ASYNC)
+# 6. MAIN ENTRY POINT (ASYNC)
 # ==========================================

 async def assemble_chunks(note_id: str, md_text: str, note_type: str) -> List[Chunk]:
@@ -330,7 +352,7 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str) -> List[Ch
    """
    
    # 1. Frontmatter prüfen (Double-LLM-Prevention)
-    fm, _ = extract_frontmatter_from_text(md_text) 
+    fm, body = extract_frontmatter_from_text(md_text) 
    note_status = fm.get("status", "").lower()
    
    config = get_chunk_config(note_type)
@@ -343,20 +365,27 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str) -> List[Ch
    if strategy == "semantic_llm" and note_status in ["draft", "initial_gen"]:
        # Setze auf die zweitbeste, aber synchrone und deterministische Strategie
        print(f"INFO: Overriding '{strategy}' for draft status. Using 'by_heading' instead.")
-        strategy = "by_heading" # Fallback auf by_heading, da LLM-Generatoren saubere H2-Strukturen nutzen.
+        strategy = "by_heading" 
    
    # 3. Execution (Dispatcher)
    
+    # Wir müssen den md_text neu zusammensetzen, falls der Body abgeschnitten wurde
+    if body:
+        md_to_chunk = md_text # Bei LLM oder By_Heading
+    else:
+        md_to_chunk = md_text
+
    if strategy == "semantic_llm":
-        chunks = await _strategy_semantic_llm(md_text, config, note_id, note_type)
+        # LLM-Strategie nutzt den gesamten MD-Text zur Orientierung
+        chunks = await _strategy_semantic_llm(md_to_chunk, config, note_id, note_type)
    
    elif strategy == "by_heading":
-        blocks, doc_title = parse_blocks(md_text)
+        blocks, doc_title = parse_blocks(md_to_chunk)
        # Synchronen Code in einem Thread ausführen
        chunks = await asyncio.to_thread(_strategy_by_heading, blocks, config, note_id, doc_title)
        
    else: # sliding_window (Default)
-        blocks, doc_title = parse_blocks(md_text)
+        blocks, doc_title = parse_blocks(md_to_chunk)
        # Synchronen Code in einem Thread ausführen
        chunks = await asyncio.to_thread(_strategy_sliding_window, blocks, config, note_id)