semantic_analyzer angepasst

2025-12-23 18:17:34 +01:00 · 2025-12-23 18:17:34 +01:00 · f1bfa40b5b
commit f1bfa40b5b
parent dcc3083455
1 changed file with 22 additions and 33 deletions
--- a/app/services/semantic_analyzer.py
+++ b/app/services/semantic_analyzer.py
@@ -1,10 +1,11 @@
 """
 FILE: app/services/semantic_analyzer.py
 DESCRIPTION: KI-gestützte Kanten-Validierung. Nutzt LLM (Background-Priority), um Kanten präzise einem Chunk zuzuordnen.
-VERSION: 2.1.0 (Fix: Strict Edge String Validation against LLM Hallucinations)
+             WP-20 Fix: Kompatibilität mit Provider-basierten Prompt-Dictionaries (Hybrid-Modus).
+VERSION: 2.2.0
 STATUS: Active
 DEPENDENCIES: app.services.llm_service, json, logging
-LAST_ANALYSIS: 2025-12-16
+LAST_ANALYSIS: 2025-12-23
 """

 import json
@@ -24,7 +25,7 @@ class SemanticAnalyzer:
    def _is_valid_edge_string(self, edge_str: str) -> bool:
        """
        Prüft, ob ein String eine valide Kante im Format 'kind:target' ist.
-        Verhindert, dass LLM-Geschwätz ("Here is the list: ...") als Kante durchrutscht.
+        Verhindert, dass LLM-Geschwätz als Kante durchrutscht.
        """
        if not isinstance(edge_str, str) or ":" not in edge_str:
            return False
@@ -34,8 +35,6 @@ class SemanticAnalyzer:
        target = parts[1].strip()
        
        # Regel 1: Ein 'kind' (Beziehungstyp) darf keine Leerzeichen enthalten.
-        # Erlaubt: "derived_from", "related_to"
-        # Verboten: "derived end of instruction", "Here is the list"
        if " " in kind:
            return False
            
@@ -54,19 +53,16 @@ class SemanticAnalyzer:
        Sendet einen Chunk und eine Liste potenzieller Kanten an das LLM.
        Das LLM filtert heraus, welche Kanten für diesen Chunk relevant sind.
        
-        Features:
-        - Retry Strategy: Wartet bei Überlastung (max_retries=5).
-        - Priority Queue: Läuft als "background" Task, um den Chat nicht zu blockieren.
-        - Observability: Loggt Input-Größe, Raw-Response und Parsing-Details.
+        WP-20 Fix: Nutzt get_prompt(), um den 'AttributeError: dict object' zu vermeiden.
        """
        if not all_edges:
            return []

-        # 1. Prompt laden
-        prompt_template = self.llm.prompts.get("edge_allocation_template")
+        # 1. Prompt laden via get_prompt (handelt die Provider-Kaskade automatisch ab) [WP-20 Fix]
+        prompt_template = self.llm.get_prompt("edge_allocation_template")
        
-        if not prompt_template:
-            logger.warning("⚠️ [SemanticAnalyzer] Prompt 'edge_allocation_template' fehlt. Nutze Fallback.")
+        if not prompt_template or isinstance(prompt_template, dict):
+            logger.warning("⚠️ [SemanticAnalyzer] Prompt 'edge_allocation_template' konnte nicht als String geladen werden. Nutze Hard-Fallback.")
            prompt_template = (
                "TASK: Wähle aus den Kandidaten die relevanten Kanten für den Text.\n"
                "TEXT: {chunk_text}\n"
@@ -80,14 +76,18 @@ class SemanticAnalyzer:
        # LOG: Request Info
        logger.debug(f"🔍 [SemanticAnalyzer] Request: {len(chunk_text)} chars Text, {len(all_edges)} Candidates.")

-        # 3. Prompt füllen
-        final_prompt = prompt_template.format(
-            chunk_text=chunk_text[:3500], 
-            edge_list=edges_str
-        )
+        # 3. Prompt füllen (Hier trat der AttributeError auf, wenn prompt_template ein dict war)
+        try:
+            final_prompt = prompt_template.format(
+                chunk_text=chunk_text[:3500], 
+                edge_list=edges_str
+            )
+        except Exception as format_err:
+            logger.error(f"❌ [SemanticAnalyzer] Format Error im Prompt-Template: {format_err}")
+            return []

        try:
-            # 4. LLM Call mit Traffic Control
+            # 4. LLM Call mit Traffic Control (Background Priority)
            response_json = await self.llm.generate_raw_response(
                prompt=final_prompt,
                force_json=True,
@@ -103,39 +103,30 @@ class SemanticAnalyzer:
            clean_json = response_json.replace("```json", "").replace("```", "").strip()
            
            if not clean_json: 
-                logger.warning("⚠️ [SemanticAnalyzer] Leere Antwort vom LLM erhalten. Trigger Fallback.")
+                logger.warning("⚠️ [SemanticAnalyzer] Leere Antwort vom LLM erhalten.")
                return []

            try:
                data = json.loads(clean_json)
            except json.JSONDecodeError as json_err:
-                logger.error(f"❌ [SemanticAnalyzer] JSON Decode Error.")
-                logger.error(f"   Grund: {json_err}")
-                logger.error(f"   Empfangener String: {clean_json[:500]}")
-                logger.info("   -> Workaround: Fallback auf 'Alle Kanten' (durch Chunker).")
+                logger.error(f"❌ [SemanticAnalyzer] JSON Decode Error: {json_err}")
                return []

            valid_edges = []

            # 6. Robuste Validierung (List vs Dict)
-            # Wir sammeln erst alle Strings ein
            raw_candidates = []
            
            if isinstance(data, list):
                raw_candidates = data
            
            elif isinstance(data, dict):
-                logger.info(f"ℹ️ [SemanticAnalyzer] LLM lieferte Dict statt Liste. Versuche Reparatur. Keys: {list(data.keys())}")
+                logger.info(f"ℹ️ [SemanticAnalyzer] LLM lieferte Dict statt Liste. Versuche Reparatur.")
                for key, val in data.items():
-                    # Fall A: {"edges": ["kind:target"]}
                    if key.lower() in ["edges", "results", "kanten", "matches"] and isinstance(val, list):
                         raw_candidates.extend(val)
-                    
-                    # Fall B: {"kind": "target"} (Beziehung als Key)
                    elif isinstance(val, str):
                        raw_candidates.append(f"{key}:{val}")
-                    
-                    # Fall C: {"kind": ["target1", "target2"]}
                    elif isinstance(val, list):
                        for target in val:
                            if isinstance(target, str):
@@ -149,10 +140,8 @@ class SemanticAnalyzer:
                else:
                    logger.debug(f"   [SemanticAnalyzer] Invalid edge format rejected: '{e_str}'")

-            # Safety: Filtere nur Kanten, die halbwegs valide aussehen (Doppelcheck)
            final_result = [e for e in valid_edges if ":" in e]
            
-            # LOG: Ergebnis
            if final_result:
                logger.info(f"✅ [SemanticAnalyzer] Success. {len(final_result)} Kanten zugewiesen.")
            else: