chunker mit LLM initial

2025-12-11 23:21:24 +01:00 · 2025-12-11 23:21:24 +01:00 · 13ab2b7d68
commit 13ab2b7d68
parent 714763f92f
3 changed files with 197 additions and 34 deletions
--- a/app/core/chunker.py
+++ b/app/core/chunker.py
@ -8,15 +8,17 @@ from pathlib import Path
 from markdown_it import MarkdownIt
 from markdown_it.token import Token
 # NEUE IMPORTS
 # Import des Semantic Analyzer Services
 from app.services.semantic_analyzer import get_semantic_analyzer
 import asyncio # Für den asynchronen Aufruf des Chunkers
 # ==========================================
-# 1. CONFIGURATION LOADER (Updated for config/ dir)
+# 1. CONFIGURATION LOADER (Ehemals chunk_config.py)
 # ==========================================
-# Pfad-Logik:
+# Pfad-Logik: app/core/chunker.py -> app/core -> app -> root/config/types.yaml
 # Wir gehen 3 Ebenen hoch: app/core/chunker.py -> app/core -> app -> root
 BASE_DIR = Path(__file__).resolve().parent.parent.parent
 # KORREKTUR: types.yaml liegt im Unterordner "config"
 CONFIG_PATH = BASE_DIR / "config" / "types.yaml"
 # Fallback Values
@ -38,7 +40,6 @@ def _load_yaml_config() -> Dict[str, Any]:
    if not CONFIG_PATH.exists():
        # Debugging-Hilfe: Zeigt an, wo gesucht wurde
        print(f"WARNUNG: types.yaml nicht gefunden unter: {CONFIG_PATH}")
        print(f"         (Basis-Verzeichnis war: {BASE_DIR})")
        return {}
    try:
@ -51,32 +52,24 @@ def _load_yaml_config() -> Dict[str, Any]:
        return {}
 def get_chunk_config(note_type: str) -> Dict[str, Any]:
     """Resolve a note type to its chunking configuration (type -> profile -> config).

     The profile name is taken from ``types.<note_type>.chunking_profile``,
     then from ``defaults.chunking_profile``, and finally defaults to
     ``"sliding_standard"``. A profile name that is not defined under
     ``chunking_profiles`` yields a copy of ``DEFAULT_PROFILE``.

     Args:
         note_type: Note type name; it is lower-cased before the lookup.

     Returns:
         A shallow copy of the selected profile. A list-valued ``overlap``
         entry is converted to a tuple.
     """
     full_config = _load_yaml_config() or {}

     # A section that exists in the file but has no content may come back as
     # None rather than a mapping, so ``.get(key, {})`` alone is not enough.
     profiles = full_config.get("chunking_profiles") or {}
     type_defs = full_config.get("types") or {}
     defaults = full_config.get("defaults") or {}

     # Type definition for this note type (empty when the type is unknown).
     type_def = type_defs.get((note_type or "").lower()) or {}

     # Profile name: per-type setting first, then the global default.
     profile_name = type_def.get("chunking_profile")
     if not profile_name:
         profile_name = defaults.get("chunking_profile", "sliding_standard")

     # Copy so callers can mutate the result without touching shared config.
     profile = profiles.get(profile_name)
     if profile is None:
         profile = DEFAULT_PROFILE
     config = dict(profile)

     # YAML sequences load as lists; downstream code expects a tuple here.
     if isinstance(config.get("overlap"), list):
         config["overlap"] = tuple(config["overlap"])
     return config
-# Legacy Support für alten Code
+# Legacy Support
 def get_sizes(note_type: str):
    cfg = get_chunk_config(note_type)
    return {
@ -86,7 +79,7 @@ def get_sizes(note_type: str):
    }
 # ==========================================
-# 2. CHUNKING LOGIC & PARSER
+# 2. DATA CLASSES & HELPERS
 # ==========================================
 # --- Hilfen ---
@ -94,7 +87,6 @@ _SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
 _WS = re.compile(r'\s+')
 def estimate_tokens(text: str) -> int:
     """Roughly estimate the token count of *text* (about 4 characters per token).

     Surrounding whitespace is ignored and the result is never below 1.
     """
     char_count = len(text.strip())
     full_groups, leftover = divmod(char_count, 4)
     # Ceiling division: a partial group of characters counts as one more token.
     rounded_up = full_groups + (1 if leftover else 0)
     return rounded_up if rounded_up >= 1 else 1
@ -117,8 +109,8 @@ class Chunk:
    id: str
    note_id: str
    index: int
-    text: str           # Reintext für Anzeige
+    text: str           # Reintext für Anzeige (JETZT INKL. INJIZIERTER LINKS)
-    window: str         # Text + Context für Embeddings
+    window: str         # Text + Context für Embeddings (WIE 'text' BEI LLM-CHUNK)
    token_count: int
    section_title: Optional[str]
    section_path: str
@ -193,7 +185,9 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
        i += 1
    return blocks, h1_title
-# --- Strategien ---
+# ==========================================
 # 3. STRATEGIES (SYNCHRON)
 # ==========================================
 def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]:
    target = config.get("target", 400)
@ -266,6 +260,8 @@ def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id
    for path in ordered:
        s_blocks = sections[path]
        if not s_blocks: continue
        breadcrumbs = path.strip("/").replace("/", " > ")
        context_header = f"# {doc_title}\n## {breadcrumbs}"
        full_text = "\n\n".join([b.text for b in s_blocks])
@ -279,6 +275,7 @@ def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id
                neighbors_prev=None, neighbors_next=None, char_start=0, char_end=0
            ))
        else:
            # Fallback auf Sliding Window mit Context Injection
            sub = _strategy_sliding_window(s_blocks, config, note_id, context_prefix=context_header)
            base = len(chunks)
            for i, sc in enumerate(sub):
@ -287,19 +284,79 @@ def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id
                chunks.append(sc)
    return chunks
-# --- Main Entry Point ---
+# ==========================================
 # 4. STRATEGY (ASYNCHRON)
 # ==========================================
-def assemble_chunks(note_id: str, md_text: str, note_type: str) -> List[Chunk]:
+async def _strategy_semantic_llm(md_text: str, config: Dict[str, Any], note_id: str, note_type: str) -> List[Chunk]:
    """
    NEUE STRATEGIE: Delegiert die Zerlegung und Kanten-Extraktion an ein LLM.
    """
    analyzer = get_semantic_analyzer()
    # Text-Splitting wird hier vom LLM übernommen
    semantic_chunks = await analyzer.analyze_and_chunk(md_text, note_type)
    chunks: List[Chunk] = []
    for i, sc in enumerate(semantic_chunks):
        # 1. Edge Injection für derive_edges.py
        # Wir formatieren die LLM-generierten Kanten in die Inline-Syntax, 
        # damit die bestehende derive_edges.py (Regex) sie findet.
        injection_block = "\n"
        for edge_str in sc.suggested_edges:
            kind, target = edge_str.split(":", 1)
            # Nutzt die Syntax: [[rel:kind | Target]]
            injection_block += f"[[rel:{kind} | {target}]] "
        full_text = sc.content + injection_block
        # 2. Chunk Objekt bauen
        chunks.append(Chunk(
            id=f"{note_id}#sem{i:02d}",
            note_id=note_id,
            index=i,
            text=full_text.strip(),     # Enthält die Links (für derive_edges)
            window=full_text.strip(),   # Auch das Embedding "sieht" die Links (gut für Retrieval)
            token_count=estimate_tokens(full_text),
            section_title="Semantic Section",
            section_path="/LLM",
            neighbors_prev=None, neighbors_next=None,
            char_start=0, char_end=0
        ))
    return chunks
 # ==========================================
 # 5. MAIN ENTRY POINT (ASYNC)
 # ==========================================
 async def assemble_chunks(note_id: str, md_text: str, note_type: str) -> List[Chunk]:
     """Split a note into chunks using the strategy configured for its type.

     The strategy comes from the chunking profile of ``note_type``:
     ``"semantic_llm"`` delegates to the LLM analyzer, ``"by_heading"`` splits
     along headings, and anything else uses the sliding window. The function
     is a coroutine because the LLM strategy must be awaited.

     Args:
         note_id: Identifier of the note being chunked.
         md_text: Raw markdown text of the note.
         note_type: Note type used to select the chunking profile.

     Returns:
         The chunks in document order, with ``neighbors_prev`` and
         ``neighbors_next`` set to the ids of the adjacent chunks.
     """
     config = get_chunk_config(note_type)
     strategy = config.get("strategy", "sliding_window")

     if strategy == "semantic_llm":
         # The LLM strategy works on the raw markdown; no block parsing needed.
         chunks = await _strategy_semantic_llm(md_text, config, note_id, note_type)
     else:
         # Parse once for both block-based strategies.
         blocks, doc_title = parse_blocks(md_text)
         # The synchronous strategies run in a worker thread so the event loop
         # stays free while they compute.
         if strategy == "by_heading":
             chunks = await asyncio.to_thread(_strategy_by_heading, blocks, config, note_id, doc_title)
         else:  # sliding_window (default)
             chunks = await asyncio.to_thread(_strategy_sliding_window, blocks, config, note_id)

     # Post-process: link every chunk to its neighbors.
     last_index = len(chunks) - 1
     for i, ch in enumerate(chunks):
         ch.neighbors_prev = chunks[i - 1].id if i > 0 else None
         ch.neighbors_next = chunks[i + 1].id if i < last_index else None
     return chunks
--- a/app/services/semantic_analyzer.py
+++ b/app/services/semantic_analyzer.py
@ -0,0 +1,98 @@
 """
 app/services/semantic_analyzer.py
 Kapselt die LLM-Strategie für Chunking und Kanten-Extraktion.
 Nutzt die Matrix-Logik aus DiscoveryService für konsistente Kanten-Typen.
 """
 import json
 import logging
 import re
 from typing import List, Dict, Any, Optional
 from dataclasses import dataclass
 from app.services.llm_service import LLMService
 from app.services.discovery import DiscoveryService
 logger = logging.getLogger(__name__)
@dataclass
 class SemanticChunkResult:
    """One semantic section produced by SemanticAnalyzer, with its suggested edges."""
    # Text of the section. On the fallback paths of analyze_and_chunk this is
    # the complete, unsplit input text.
    content: str
    # Edge suggestions for this section, one string per edge.
    suggested_edges: List[str] # Format: "kind:Target"
 class SemanticAnalyzer:
     """LLM-backed splitter that cuts text into semantic chunks and suggests edges.

     Owns an ``LLMService`` for the model call and a ``DiscoveryService`` whose
     edge-type resolution is reused to refine generic relation types.
     """

     def __init__(self):
         self.llm = LLMService()
         self.discovery = DiscoveryService()  # reuse of the edge-type matrix logic

     async def analyze_and_chunk(self, text: str, source_type: str) -> List["SemanticChunkResult"]:
         """Split *text* into semantic sections via the LLM and extract edges.

         Args:
             text: Raw document text to split.
             source_type: Note type of the document; included in the prompt and
                 used when resolving edge types.

         Returns:
             One result per usable section. If the LLM call fails, the answer
             is not valid JSON, or no usable section is found, a single result
             holding the complete *text* and no edges is returned instead, so
             the document is never dropped.
         """
         # 1. Build the prompt.
         system_prompt = (
             "Du bist ein Knowledge Graph Experte. Deine Aufgabe ist es, Rohtext in "
             "thematisch geschlossene Abschnitte (Chunks) zu zerlegen.\n"
             "Analysiere jeden Abschnitt auf Beziehungen zu anderen Konzepten.\n"
             "Antworte AUSSCHLIESSLICH mit validem JSON in diesem Format:\n"
             "[\n"
             "  {\n"
             "    \"content\": \"Der Text des Abschnitts...\",\n"
             "    \"relations\": [{\"target\": \"Qdrant\", \"type\": \"depends_on\"}]\n"
             "  }\n"
             "]\n"
             "Halte die Chunks mittellang (ca. 100-300 Wörter). Verändere den Inhalt nicht, nur die Struktur."
         )
         user_prompt = f"Dokument-Typ: {source_type}\n\nTEXT:\n{text}"
         fallback = [SemanticChunkResult(content=text, suggested_edges=[])]

         try:
             # 2. LLM call.
             response_json = await self.llm.generate_raw_response(user_prompt, system=system_prompt)
             # 3. JSON parsing and validation.
             data = json.loads(self._strip_code_fence(response_json))
             results = self._build_results(data, source_type)
         except json.JSONDecodeError:
             logger.warning("SemanticAnalyzer: LLM lieferte kein valides JSON. Fallback auf Raw Text.")
             return fallback
         except Exception as e:
             # Best effort by design: any failure degrades to the raw text.
             logger.exception("SemanticAnalyzer Error: %s", e)
             return fallback

         if not results:
             # Valid JSON without a single usable section would otherwise make
             # the document vanish; keep it as one chunk instead.
             logger.warning("SemanticAnalyzer: LLM returned no usable chunks. Falling back to raw text.")
             return fallback
         return results

     @staticmethod
     def _strip_code_fence(raw: Optional[str]) -> str:
         """Remove one enclosing markdown code fence, leaving inner backticks intact."""
         cleaned = (raw or "").strip()
         fenced = re.match(r"^```(?:json)?\s*(.*?)\s*```$", cleaned, re.DOTALL | re.IGNORECASE)
         return fenced.group(1) if fenced else cleaned

     def _build_results(self, data: Any, source_type: str) -> List["SemanticChunkResult"]:
         """Convert the decoded LLM answer into results, skipping malformed items."""
         if isinstance(data, dict):
             data = [data]  # a single object instead of the requested array
         if not isinstance(data, list):
             return []

         results = []
         for item in data:
             if not isinstance(item, dict):
                 continue
             content = item.get("content")
             if not isinstance(content, str) or not content.strip():
                 continue
             edges = self._refine_edges(item.get("relations"), source_type)
             results.append(SemanticChunkResult(content=content.strip(), suggested_edges=edges))
         return results

     def _refine_edges(self, raw_rels: Any, source_type: str) -> List[str]:
         """Turn the LLM's relation objects into ``"kind:Target"`` edge strings."""
         if not isinstance(raw_rels, list):
             return []

         refined_edges = []
         for rel in raw_rels:
             if not isinstance(rel, dict) or not rel.get("target"):
                 continue
             target = str(rel["target"]).strip()
             if not target:
                 continue
             raw_type = str(rel.get("type") or "related_to").strip()

             # 4. Matrix logic: the type of the target is unknown here, so it
             # is assumed to be 'concept' when asking for the edge type.
             final_kind = self.discovery._resolve_edge_type(source_type, "concept")

             # A specific LLM type (e.g. 'blocks') wins; a generic one is
             # replaced by the matrix suggestion when that is more specific.
             if raw_type in ["related_to", "link"] and final_kind != "related_to":
                 refined_edges.append(f"{final_kind}:{target}")
             else:
                 refined_edges.append(f"{raw_type}:{target}")
         return refined_edges

     async def close(self):
         """Close the underlying LLM service."""
         await self.llm.close()


 # Shared instance handed out by get_semantic_analyzer(); created on first use.
 _ANALYZER_INSTANCE: Optional["SemanticAnalyzer"] = None


 def get_semantic_analyzer() -> "SemanticAnalyzer":
     """Return the shared SemanticAnalyzer, creating it on first use.

     app/core/chunker.py imports this accessor from this module. Reusing one
     instance avoids building a new LLMService for every note.
     """
     global _ANALYZER_INSTANCE
     if _ANALYZER_INSTANCE is None:
         _ANALYZER_INSTANCE = SemanticAnalyzer()
     return _ANALYZER_INSTANCE
--- a/config/types.yaml
+++ b/config/types.yaml
@ -29,7 +29,14 @@ chunking_profiles:
    max: 600              # Fallback Limit
    target: 400           # Fallback Target bei Sub-Chunking
    overlap: [50, 80]     # Overlap bei Sub-Chunking
-
+    
  # NEU: LLM-basierte semantische Zerlegung (Chunker.py ruft semantic_analyzer.py)
  semantic_llm:
    strategy: semantic_llm
    # The LLM controls chunk length itself; the semantic_llm strategy does not read these values, so they are advisory only.
    target: 400 
    max: 800 
 defaults:
  retriever_weight: 1.0
  chunking_profile: sliding_standard # Fallback Profil
@ -54,7 +61,7 @@ types:
  # --- IDENTITÄT & PERSÖNLICHKEIT ---
  profile:
-    chunking_profile: structured_strict # H2 Split wichtig für Profile
+    chunking_profile: structured_strict 
    retriever_weight: 0.70
    edge_defaults: ["references", "related_to"]
@ -85,7 +92,7 @@ types:
    edge_defaults: ["depends_on", "related_to"]
  decision:
-    chunking_profile: structured_strict # ADRs sind oft strukturiert
+    chunking_profile: structured_strict 
    retriever_weight: 1.00
    edge_defaults: ["caused_by", "references"]
@ -101,7 +108,7 @@ types:
  # --- OPERATIV ---
  project:
-    chunking_profile: sliding_large # Projekte haben viel Text
+    chunking_profile: sliding_large 
    retriever_weight: 0.97
    edge_defaults: ["references", "depends_on"] 
@ -111,6 +118,7 @@ types:
    edge_defaults: ["depends_on", "part_of"]
  journal:
-    chunking_profile: sliding_standard
+    # NEUE ZUWEISUNG: Journale profitieren am meisten von der semantischen Analyse
    chunking_profile: semantic_llm 
    retriever_weight: 0.80
    edge_defaults: ["references", "related_to"]